/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 202 - (hide annotations) (download)
Fri Aug 3 09:44:26 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 185235 byte(s)
Refactor the integer overflow testing so as to avoid imposing an artificial 
limit on the size of subpatterns.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. a is the bitmap (an
array or pointer expression) and b is the bit number. Both arguments and the
whole expansion are parenthesized so that compound expressions such as
SETBIT(map + n, x + 1) behave as expected. Note that b is evaluated twice, so
it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)

95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
/* Tables of names of POSIX character classes and their lengths. The list of
lengths is terminated by a zero length entry. The first three must be alpha,
lower, upper, as this is assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
151    
152     static const uschar posix_name_lengths[] = {
153     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
154    
155 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
156     base map, with an optional addition or removal of another map. Then, for some
157     classes, there is some additional tweaking: for [:blank:] the vertical space
158     characters are removed, and for [:alpha:] and [:alnum:] the underscore
159     character is removed. The triples in the table consist of the base map offset,
160     second map offset or -1 if no second map, and a non-negative value for map
161     addition or a negative value for map subtraction (if there are two maps). The
162     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
163     remove vertical space characters, 2 => remove underscore. */
164 nigel 77
165     static const int posix_class_maps[] = {
166 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
167     cbit_lower, -1, 0, /* lower */
168     cbit_upper, -1, 0, /* upper */
169     cbit_word, -1, 2, /* alnum - word without underscore */
170     cbit_print, cbit_cntrl, 0, /* ascii */
171     cbit_space, -1, 1, /* blank - a GNU extension */
172     cbit_cntrl, -1, 0, /* cntrl */
173     cbit_digit, -1, 0, /* digit */
174     cbit_graph, -1, 0, /* graph */
175     cbit_print, -1, 0, /* print */
176     cbit_punct, -1, 0, /* punct */
177     cbit_space, -1, 0, /* space */
178     cbit_word, -1, 0, /* word - a Perl extension */
179     cbit_xdigit,-1, 0 /* xdigit */
180 nigel 77 };
181    
182    
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. The
elements are "const char *" because they are passed to the outside world; the
table itself is never written to, so it is declared const as well. Do not ever
re-use any error number, because they are documented. Always add a new error
instead. Messages marked DEAD below are no longer used. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
263    
264    
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as (hex) digits when compiling. That is why we
have a private table here. It costs 256 bytes, but it is a lot faster than
doing character value tests (at least in some simple cases I timed), and in
some applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
387    
388    
389     /* Definition to allow mutual recursion */
390    
391     static BOOL
392 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
393 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
394 nigel 77
395    
396    
397     /*************************************************
398     * Handle escapes *
399     *************************************************/
400    
401     /* This function is called when a \ has been encountered. It either returns a
402     positive value for a simple escape such as \n, or a negative value which
403 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
404     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
405     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
406     ptr is pointing at the \. On exit, it is on the final character of the escape
407     sequence.
408 nigel 77
409     Arguments:
410     ptrptr points to the pattern position pointer
411     errorcodeptr points to the errorcode variable
412     bracount number of previous extracting brackets
413     options the options bits
414     isclass TRUE if inside a character class
415    
416     Returns: zero or positive => a data character
417     negative => a special escape sequence
418     on error, errorptr is set
419     */
420    
421     static int
422     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
423     int options, BOOL isclass)
424     {
425 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
426     const uschar *ptr = *ptrptr + 1;
427 nigel 77 int c, i;
428    
429 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
430     ptr--; /* Set pointer back to the last byte */
431    
432 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
433    
434     if (c == 0) *errorcodeptr = ERR1;
435    
436     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
437     a table. A non-zero result is something that can be returned immediately.
438     Otherwise further processing may be required. */
439    
440 ph10 97 #ifndef EBCDIC /* ASCII coding */
441 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
442     else if ((i = escapes[c - '0']) != 0) c = i;
443    
444 ph10 97 #else /* EBCDIC coding */
445 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
446     else if ((i = escapes[c - 0x48]) != 0) c = i;
447     #endif
448    
449     /* Escapes that need further processing, or are illegal. */
450    
451     else
452     {
453     const uschar *oldptr;
454 nigel 93 BOOL braced, negated;
455    
456 nigel 77 switch (c)
457     {
458     /* A number of Perl escapes are not handled by PCRE. We give an explicit
459     error. */
460    
461     case 'l':
462     case 'L':
463     case 'N':
464     case 'u':
465     case 'U':
466     *errorcodeptr = ERR37;
467     break;
468    
469 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
470     is an absolute backreference. If negative, it is a relative backreference.
471 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
472     reference to a named group. This is part of Perl's movement towards a
473     unified syntax for back references. As this is synonymous with \k{name}, we
474 ph10 171 fudge it up by pretending it really was \k. */
475 nigel 93
476     case 'g':
477     if (ptr[1] == '{')
478     {
479 ph10 171 const uschar *p;
480     for (p = ptr+2; *p != 0 && *p != '}'; p++)
481     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
482 ph10 172 if (*p != 0 && *p != '}')
483 ph10 171 {
484     c = -ESC_k;
485     break;
486 ph10 172 }
487 nigel 93 braced = TRUE;
488     ptr++;
489     }
490     else braced = FALSE;
491    
492     if (ptr[1] == '-')
493     {
494     negated = TRUE;
495     ptr++;
496     }
497     else negated = FALSE;
498    
499     c = 0;
500     while ((digitab[ptr[1]] & ctype_digit) != 0)
501     c = c * 10 + *(++ptr) - '0';
502    
503     if (c == 0 || (braced && *(++ptr) != '}'))
504     {
505     *errorcodeptr = ERR57;
506     return 0;
507     }
508    
509     if (negated)
510     {
511     if (c > bracount)
512     {
513     *errorcodeptr = ERR15;
514     return 0;
515     }
516     c = bracount - (c - 1);
517     }
518    
519     c = -(ESC_REF + c);
520     break;
521    
522 nigel 77 /* The handling of escape sequences consisting of a string of digits
523     starting with one that is not zero is not straightforward. By experiment,
524     the way Perl works seems to be as follows:
525    
526     Outside a character class, the digits are read as a decimal number. If the
527     number is less than 10, or if there are that many previous extracting
528     left brackets, then it is a back reference. Otherwise, up to three octal
529     digits are read to form an escaped byte. Thus \123 is likely to be octal
530     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
531     value is greater than 377, the least significant 8 bits are taken. Inside a
532     character class, \ followed by a digit is always an octal number. */
533    
534     case '1': case '2': case '3': case '4': case '5':
535     case '6': case '7': case '8': case '9':
536    
537     if (!isclass)
538     {
539     oldptr = ptr;
540     c -= '0';
541     while ((digitab[ptr[1]] & ctype_digit) != 0)
542     c = c * 10 + *(++ptr) - '0';
543     if (c < 10 || c <= bracount)
544     {
545     c = -(ESC_REF + c);
546     break;
547     }
548     ptr = oldptr; /* Put the pointer back and fall through */
549     }
550    
551     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
552     generates a binary zero byte and treats the digit as a following literal.
553     Thus we have to pull back the pointer by one. */
554    
555     if ((c = *ptr) >= '8')
556     {
557     ptr--;
558     c = 0;
559     break;
560     }
561    
562     /* \0 always starts an octal number, but we may drop through to here with a
563 nigel 91 larger first octal digit. The original code used just to take the least
564     significant 8 bits of octal numbers (I think this is what early Perls used
565     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
566     than 3 octal digits. */
567 nigel 77
568     case '0':
569     c -= '0';
570     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
571     c = c * 8 + *(++ptr) - '0';
572 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
573 nigel 77 break;
574    
575 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
576     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
577     treated as a data character. */
578 nigel 77
579     case 'x':
580 nigel 87 if (ptr[1] == '{')
581 nigel 77 {
582     const uschar *pt = ptr + 2;
583 nigel 87 int count = 0;
584    
585 nigel 77 c = 0;
586     while ((digitab[*pt] & ctype_xdigit) != 0)
587     {
588 nigel 87 register int cc = *pt++;
589     if (c == 0 && cc == '0') continue; /* Leading zeroes */
590 nigel 77 count++;
591 nigel 87
592 ph10 97 #ifndef EBCDIC /* ASCII coding */
593 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
594 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
595 ph10 97 #else /* EBCDIC coding */
596 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
597 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
598 nigel 77 #endif
599     }
600 nigel 87
601 nigel 77 if (*pt == '}')
602     {
603 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
604 nigel 77 ptr = pt;
605     break;
606     }
607 nigel 87
608 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
609     recognize this construct; fall through to the normal \x handling. */
610     }
611    
612 nigel 87 /* Read just a single-byte hex-defined char */
613 nigel 77
614     c = 0;
615     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
616     {
617     int cc; /* Some compilers don't like ++ */
618     cc = *(++ptr); /* in initializers */
619 ph10 97 #ifndef EBCDIC /* ASCII coding */
620 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
621     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
622 ph10 97 #else /* EBCDIC coding */
623 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
624     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
625     #endif
626     }
627     break;
628    
629 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
630     This coding is ASCII-specific, but then the whole concept of \cx is
631     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
632 nigel 77
633     case 'c':
634     c = *(++ptr);
635     if (c == 0)
636     {
637     *errorcodeptr = ERR2;
638     return 0;
639     }
640    
641 ph10 97 #ifndef EBCDIC /* ASCII coding */
642 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
643     c ^= 0x40;
644 ph10 97 #else /* EBCDIC coding */
645 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
646     c ^= 0xC0;
647     #endif
648     break;
649    
650     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
651     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
652     for Perl compatibility, it is a literal. This code looks a bit odd, but
653     there used to be some cases other than the default, and there may be again
654     in future, so I haven't "optimized" it. */
655    
656     default:
657     if ((options & PCRE_EXTRA) != 0) switch(c)
658     {
659     default:
660     *errorcodeptr = ERR3;
661     break;
662     }
663     break;
664     }
665     }
666    
667     *ptrptr = ptr;
668     return c;
669     }
670    
671    
672    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either the single character that follows,
or a name of up to 31 characters enclosed in {}, optionally preceded by ^ for
negation. The name is looked up in the _pcre_utt table by binary chop, which
requires that table to be sorted by name.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from the matching _pcre_utt entry, or -1 for an
                   invalid type, in which case *errorcodeptr is set (ERR46 for
                   a malformed sequence, ERR47 for an unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the negation flag a defined value on every path, including the error
exits. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. If the loop runs
  to completion without finding '}', the name is too long. */

  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The sequence is syntactically complete; ptr is on its final character. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* Unknown property name; *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
762    
763    
764    
765    
766     /*************************************************
767     * Check for counted repeat *
768     *************************************************/
769    
770     /* This function is called when a '{' is encountered in a place where it might
771     start a quantifier. It looks ahead to see if it really is a quantifier or not.
772     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
773     where the ddds are digits.
774    
775     Arguments:
776     p pointer to the first char after '{'
777    
778     Returns: TRUE or FALSE
779     */
780    
781     static BOOL
782     is_counted_repeat(const uschar *p)
783     {
784     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785     while ((digitab[*p] & ctype_digit) != 0) p++;
786     if (*p == '}') return TRUE;
787    
788     if (*p++ != ',') return FALSE;
789     if (*p == '}') return TRUE;
790    
791     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
792     while ((digitab[*p] & ctype_digit) != 0) p++;
793    
794     return (*p == '}');
795     }
796    
797    
798    
799     /*************************************************
800     * Read repeat counts *
801     *************************************************/
802    
803     /* Read an item of the form {n,m} and return the values. This is called only
804     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
805     so the syntax is guaranteed to be correct, but we need to check the values.
806    
807     Arguments:
808     p pointer to first char after '{'
809     minp pointer to int for min
810     maxp pointer to int for max
811     returned as -1 if no max
812     errorcodeptr points to error code variable
813    
814     Returns: pointer to '}' on success;
815     current ptr on error, with errorcodeptr set non-zero
816     */
817    
818     static const uschar *
819     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
820     {
821     int min = 0;
822     int max = -1;
823    
824 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
825     an integer overflow. */
826    
827 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
828 nigel 81 if (min < 0 || min > 65535)
829     {
830     *errorcodeptr = ERR5;
831     return p;
832     }
833 nigel 77
834 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
835     Also, max must not be less than min. */
836    
837 nigel 77 if (*p == '}') max = min; else
838     {
839     if (*(++p) != '}')
840     {
841     max = 0;
842     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
843 nigel 81 if (max < 0 || max > 65535)
844     {
845     *errorcodeptr = ERR5;
846     return p;
847     }
848 nigel 77 if (max < min)
849     {
850     *errorcodeptr = ERR4;
851     return p;
852     }
853     }
854     }
855    
856 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
857     '}'. */
858 nigel 77
859 nigel 81 *minp = min;
860     *maxp = max;
861 nigel 77 return p;
862     }
863    
864    
865    
866     /*************************************************
867 nigel 93 * Find forward referenced subpattern *
868 nigel 91 *************************************************/
869    
870 nigel 93 /* This function scans along a pattern's text looking for capturing
871     subpatterns, and counting them. If it finds a named pattern that matches the
872     name it is given, it returns its number. Alternatively, if the name is NULL, it
873     returns when it reaches a given numbered subpattern. This is used for forward
874     references to subpatterns. We know that if (?P< is encountered, the name will
875     be terminated by '>' because that is checked in the first pass.
876 nigel 91
877     Arguments:
878 nigel 93 ptr current position in the pattern
879     count current count of capturing parens so far encountered
880     name name to seek, or NULL if seeking a numbered subpattern
881     lorn name length, or subpattern number if name is NULL
882     xmode TRUE if we are in /x mode
883 nigel 91
884     Returns: the number of the named subpattern, or -1 if not found
885     */
886    
887     static int
888 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
889     BOOL xmode)
890 nigel 91 {
891     const uschar *thisname;
892 nigel 93
893 nigel 91 for (; *ptr != 0; ptr++)
894     {
895 nigel 93 int term;
896    
897     /* Skip over backslashed characters and also entire \Q...\E */
898    
899     if (*ptr == '\\')
900     {
901     if (*(++ptr) == 0) return -1;
902     if (*ptr == 'Q') for (;;)
903     {
904     while (*(++ptr) != 0 && *ptr != '\\');
905     if (*ptr == 0) return -1;
906     if (*(++ptr) == 'E') break;
907     }
908     continue;
909     }
910    
911     /* Skip over character classes */
912    
913     if (*ptr == '[')
914     {
915     while (*(++ptr) != ']')
916     {
917     if (*ptr == '\\')
918     {
919     if (*(++ptr) == 0) return -1;
920     if (*ptr == 'Q') for (;;)
921     {
922     while (*(++ptr) != 0 && *ptr != '\\');
923     if (*ptr == 0) return -1;
924     if (*(++ptr) == 'E') break;
925     }
926     continue;
927     }
928     }
929     continue;
930     }
931    
932     /* Skip comments in /x mode */
933    
934     if (xmode && *ptr == '#')
935     {
936     while (*(++ptr) != 0 && *ptr != '\n');
937     if (*ptr == 0) return -1;
938     continue;
939     }
940    
941     /* An opening parens must now be a real metacharacter */
942    
943 nigel 91 if (*ptr != '(') continue;
944 nigel 93 if (ptr[1] != '?')
945     {
946     count++;
947     if (name == NULL && count == lorn) return count;
948     continue;
949     }
950    
951     ptr += 2;
952     if (*ptr == 'P') ptr++; /* Allow optional P */
953    
954     /* We have to disambiguate (?<! and (?<= from (?<name> */
955    
956     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
957     *ptr != '\'')
958     continue;
959    
960 nigel 91 count++;
961 nigel 93
962     if (name == NULL && count == lorn) return count;
963     term = *ptr++;
964     if (term == '<') term = '>';
965 nigel 91 thisname = ptr;
966 nigel 93 while (*ptr != term) ptr++;
967     if (name != NULL && lorn == ptr - thisname &&
968     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
969 nigel 91 return count;
970     }
971 nigel 93
972 nigel 91 return -1;
973     }
974    
975    
976    
977     /*************************************************
978 nigel 77 * Find first significant op code *
979     *************************************************/
980    
981     /* This is called by several functions that scan a compiled expression looking
982     for a fixed first character, or an anchoring op code etc. It skips over things
983     that do not influence this. For some calls, a change of option is important.
984     For some calls, it makes sense to skip negative forward and all backward
985     assertions, and also the \b assertion; for others it does not.
986    
987     Arguments:
988     code pointer to the start of the group
989     options pointer to external options
990     optbit the option bit whose changing is significant, or
991     zero if none are
992     skipassert TRUE if certain assertions are to be skipped
993    
994     Returns: pointer to the first significant opcode
995     */
996    
997     static const uschar*
998     first_significant_code(const uschar *code, int *options, int optbit,
999     BOOL skipassert)
1000     {
1001     for (;;)
1002     {
1003     switch ((int)*code)
1004     {
1005     case OP_OPT:
1006     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1007     *options = (int)code[1];
1008     code += 2;
1009     break;
1010    
1011     case OP_ASSERT_NOT:
1012     case OP_ASSERTBACK:
1013     case OP_ASSERTBACK_NOT:
1014     if (!skipassert) return code;
1015     do code += GET(code, 1); while (*code == OP_ALT);
1016     code += _pcre_OP_lengths[*code];
1017     break;
1018    
1019     case OP_WORD_BOUNDARY:
1020     case OP_NOT_WORD_BOUNDARY:
1021     if (!skipassert) return code;
1022     /* Fall through */
1023    
1024     case OP_CALLOUT:
1025     case OP_CREF:
1026 nigel 93 case OP_RREF:
1027     case OP_DEF:
1028 nigel 77 code += _pcre_OP_lengths[*code];
1029     break;
1030    
1031     default:
1032     return code;
1033     }
1034     }
1035     /* Control never reaches here */
1036     }
1037    
1038    
1039    
1040    
1041     /*************************************************
1042     * Find the fixed length of a pattern *
1043     *************************************************/
1044    
1045     /* Scan a pattern and compute the fixed length of subject that will match it,
1046     if the length is fixed. This is needed for dealing with backward assertions.
1047     In UTF8 mode, the result is in characters rather than bytes.
1048    
1049     Arguments:
1050     code points to the start of the pattern (the bracket)
1051     options the compiling options
1052    
1053     Returns: the fixed length, or -1 if there is no fixed length,
1054     or -2 if \C was encountered
1055     */
1056    
1057     static int
1058     find_fixedlength(uschar *code, int options)
1059     {
1060     int length = -1;
1061    
1062     register int branchlength = 0;
1063     register uschar *cc = code + 1 + LINK_SIZE;
1064    
1065     /* Scan along the opcodes for this branch. If we get to the end of the
1066     branch, check the length against that of the other branches. */
1067    
1068     for (;;)
1069     {
1070     int d;
1071     register int op = *cc;
1072    
1073     switch (op)
1074     {
1075 nigel 93 case OP_CBRA:
1076 nigel 77 case OP_BRA:
1077     case OP_ONCE:
1078     case OP_COND:
1079 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1080 nigel 77 if (d < 0) return d;
1081     branchlength += d;
1082     do cc += GET(cc, 1); while (*cc == OP_ALT);
1083     cc += 1 + LINK_SIZE;
1084     break;
1085    
1086     /* Reached end of a branch; if it's a ket it is the end of a nested
1087     call. If it's ALT it is an alternation in a nested call. If it is
1088     END it's the end of the outer call. All can be handled by the same code. */
1089    
1090     case OP_ALT:
1091     case OP_KET:
1092     case OP_KETRMAX:
1093     case OP_KETRMIN:
1094     case OP_END:
1095     if (length < 0) length = branchlength;
1096     else if (length != branchlength) return -1;
1097     if (*cc != OP_ALT) return length;
1098     cc += 1 + LINK_SIZE;
1099     branchlength = 0;
1100     break;
1101    
1102     /* Skip over assertive subpatterns */
1103    
1104     case OP_ASSERT:
1105     case OP_ASSERT_NOT:
1106     case OP_ASSERTBACK:
1107     case OP_ASSERTBACK_NOT:
1108     do cc += GET(cc, 1); while (*cc == OP_ALT);
1109     /* Fall through */
1110    
1111     /* Skip over things that don't match chars */
1112    
1113     case OP_REVERSE:
1114     case OP_CREF:
1115 nigel 93 case OP_RREF:
1116     case OP_DEF:
1117 nigel 77 case OP_OPT:
1118     case OP_CALLOUT:
1119     case OP_SOD:
1120     case OP_SOM:
1121     case OP_EOD:
1122     case OP_EODN:
1123     case OP_CIRC:
1124     case OP_DOLL:
1125     case OP_NOT_WORD_BOUNDARY:
1126     case OP_WORD_BOUNDARY:
1127     cc += _pcre_OP_lengths[*cc];
1128     break;
1129    
1130     /* Handle literal characters */
1131    
1132     case OP_CHAR:
1133     case OP_CHARNC:
1134 nigel 91 case OP_NOT:
1135 nigel 77 branchlength++;
1136     cc += 2;
1137     #ifdef SUPPORT_UTF8
1138     if ((options & PCRE_UTF8) != 0)
1139     {
1140     while ((*cc & 0xc0) == 0x80) cc++;
1141     }
1142     #endif
1143     break;
1144    
1145     /* Handle exact repetitions. The count is already in characters, but we
1146     need to skip over a multibyte character in UTF8 mode. */
1147    
1148     case OP_EXACT:
1149     branchlength += GET2(cc,1);
1150     cc += 4;
1151     #ifdef SUPPORT_UTF8
1152     if ((options & PCRE_UTF8) != 0)
1153     {
1154     while((*cc & 0x80) == 0x80) cc++;
1155     }
1156     #endif
1157     break;
1158    
1159     case OP_TYPEEXACT:
1160     branchlength += GET2(cc,1);
1161     cc += 4;
1162     break;
1163    
1164     /* Handle single-char matchers */
1165    
1166     case OP_PROP:
1167     case OP_NOTPROP:
1168 nigel 87 cc += 2;
1169 nigel 77 /* Fall through */
1170    
1171     case OP_NOT_DIGIT:
1172     case OP_DIGIT:
1173     case OP_NOT_WHITESPACE:
1174     case OP_WHITESPACE:
1175     case OP_NOT_WORDCHAR:
1176     case OP_WORDCHAR:
1177     case OP_ANY:
1178     branchlength++;
1179     cc++;
1180     break;
1181    
1182     /* The single-byte matcher isn't allowed */
1183    
1184     case OP_ANYBYTE:
1185     return -2;
1186    
1187     /* Check a class for variable quantification */
1188    
1189     #ifdef SUPPORT_UTF8
1190     case OP_XCLASS:
1191     cc += GET(cc, 1) - 33;
1192     /* Fall through */
1193     #endif
1194    
1195     case OP_CLASS:
1196     case OP_NCLASS:
1197     cc += 33;
1198    
1199     switch (*cc)
1200     {
1201     case OP_CRSTAR:
1202     case OP_CRMINSTAR:
1203     case OP_CRQUERY:
1204     case OP_CRMINQUERY:
1205     return -1;
1206    
1207     case OP_CRRANGE:
1208     case OP_CRMINRANGE:
1209     if (GET2(cc,1) != GET2(cc,3)) return -1;
1210     branchlength += GET2(cc,1);
1211     cc += 5;
1212     break;
1213    
1214     default:
1215     branchlength++;
1216     }
1217     break;
1218    
1219     /* Anything else is variable length */
1220    
1221     default:
1222     return -1;
1223     }
1224     }
1225     /* Control never gets here */
1226     }
1227    
1228    
1229    
1230    
1231     /*************************************************
1232     * Scan compiled regex for numbered bracket *
1233     *************************************************/
1234    
1235     /* This little function scans through a compiled pattern until it finds a
1236     capturing bracket with the given number.
1237    
1238     Arguments:
1239     code points to start of expression
1240     utf8 TRUE in UTF-8 mode
1241     number the required bracket number
1242    
1243     Returns: pointer to the opcode for the bracket, or NULL if not found
1244     */
1245    
1246     static const uschar *
1247     find_bracket(const uschar *code, BOOL utf8, int number)
1248     {
1249     for (;;)
1250     {
1251     register int c = *code;
1252     if (c == OP_END) return NULL;
1253 nigel 91
1254     /* XCLASS is used for classes that cannot be represented just by a bit
1255     map. This includes negated single high-valued characters. The length in
1256     the table is zero; the actual length is stored in the compiled code. */
1257    
1258     if (c == OP_XCLASS) code += GET(code, 1);
1259    
1260 nigel 93 /* Handle capturing bracket */
1261 nigel 91
1262 nigel 93 else if (c == OP_CBRA)
1263 nigel 77 {
1264 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1265 nigel 77 if (n == number) return (uschar *)code;
1266 nigel 93 code += _pcre_OP_lengths[c];
1267 nigel 77 }
1268 nigel 91
1269 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1270     a multi-byte character. The length in the table is a minimum, so we have to
1271     arrange to skip the extra bytes. */
1272 nigel 91
1273 nigel 77 else
1274     {
1275     code += _pcre_OP_lengths[c];
1276 ph10 107 #ifdef SUPPORT_UTF8
1277 nigel 77 if (utf8) switch(c)
1278     {
1279     case OP_CHAR:
1280     case OP_CHARNC:
1281     case OP_EXACT:
1282     case OP_UPTO:
1283     case OP_MINUPTO:
1284 nigel 93 case OP_POSUPTO:
1285 nigel 77 case OP_STAR:
1286     case OP_MINSTAR:
1287 nigel 93 case OP_POSSTAR:
1288 nigel 77 case OP_PLUS:
1289     case OP_MINPLUS:
1290 nigel 93 case OP_POSPLUS:
1291 nigel 77 case OP_QUERY:
1292     case OP_MINQUERY:
1293 nigel 93 case OP_POSQUERY:
1294     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1295 nigel 77 break;
1296     }
1297 ph10 111 #endif
1298 nigel 77 }
1299     }
1300     }
1301    
1302    
1303    
1304     /*************************************************
1305     * Scan compiled regex for recursion reference *
1306     *************************************************/
1307    
1308     /* This little function scans through a compiled pattern until it finds an
1309     instance of OP_RECURSE.
1310    
1311     Arguments:
1312     code points to start of expression
1313     utf8 TRUE in UTF-8 mode
1314    
1315     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1316     */
1317    
1318     static const uschar *
1319     find_recurse(const uschar *code, BOOL utf8)
1320     {
1321     for (;;)
1322     {
1323     register int c = *code;
1324     if (c == OP_END) return NULL;
1325 nigel 91 if (c == OP_RECURSE) return code;
1326    
1327     /* XCLASS is used for classes that cannot be represented just by a bit
1328     map. This includes negated single high-valued characters. The length in
1329     the table is zero; the actual length is stored in the compiled code. */
1330    
1331     if (c == OP_XCLASS) code += GET(code, 1);
1332    
1333     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1334     that are followed by a character may be followed by a multi-byte character.
1335 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1336     bytes. */
1337 nigel 91
1338 nigel 77 else
1339     {
1340     code += _pcre_OP_lengths[c];
1341 ph10 107 #ifdef SUPPORT_UTF8
1342 nigel 77 if (utf8) switch(c)
1343     {
1344     case OP_CHAR:
1345     case OP_CHARNC:
1346     case OP_EXACT:
1347     case OP_UPTO:
1348     case OP_MINUPTO:
1349 nigel 93 case OP_POSUPTO:
1350 nigel 77 case OP_STAR:
1351     case OP_MINSTAR:
1352 nigel 93 case OP_POSSTAR:
1353 nigel 77 case OP_PLUS:
1354     case OP_MINPLUS:
1355 nigel 93 case OP_POSPLUS:
1356 nigel 77 case OP_QUERY:
1357     case OP_MINQUERY:
1358 nigel 93 case OP_POSQUERY:
1359     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1360 nigel 77 break;
1361     }
1362 ph10 111 #endif
1363 nigel 77 }
1364     }
1365     }
1366    
1367    
1368    
1369     /*************************************************
1370     * Scan compiled branch for non-emptiness *
1371     *************************************************/
1372    
1373     /* This function scans through a branch of a compiled pattern to see whether it
1374 nigel 93 can match the empty string or not. It is called from could_be_empty()
1375     below and from compile_branch() when checking for an unlimited repeat of a
1376     group that can match nothing. Note that first_significant_code() skips over
1377     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1378     struck an inner bracket whose current branch will already have been scanned.
1379 nigel 77
1380     Arguments:
1381     code points to start of search
1382     endcode points to where to stop
1383     utf8 TRUE if in UTF8 mode
1384    
1385     Returns: TRUE if what is matched could be empty
1386     */
1387    
1388     static BOOL
1389     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1390     {
1391     register int c;
1392 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1393 nigel 77 code < endcode;
1394     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1395     {
1396     const uschar *ccode;
1397    
1398     c = *code;
1399 ph10 172
1400 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1401 nigel 77
1402 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1403     {
1404 ph10 172 code += _pcre_OP_lengths[c];
1405 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1406     c = *code;
1407     continue;
1408     }
1409    
1410     /* For other groups, scan the branches. */
1411 ph10 172
1412 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1413 nigel 77 {
1414     BOOL empty_branch;
1415     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1416    
1417     /* Scan a closed bracket */
1418    
1419     empty_branch = FALSE;
1420     do
1421     {
1422     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1423     empty_branch = TRUE;
1424     code += GET(code, 1);
1425     }
1426     while (*code == OP_ALT);
1427     if (!empty_branch) return FALSE; /* All branches are non-empty */
1428 ph10 172 c = *code;
1429 nigel 93 continue;
1430 nigel 77 }
1431    
1432 nigel 93 /* Handle the other opcodes */
1433    
1434     switch (c)
1435 nigel 77 {
1436     /* Check for quantifiers after a class */
1437    
1438     #ifdef SUPPORT_UTF8
1439     case OP_XCLASS:
1440     ccode = code + GET(code, 1);
1441     goto CHECK_CLASS_REPEAT;
1442     #endif
1443    
1444     case OP_CLASS:
1445     case OP_NCLASS:
1446     ccode = code + 33;
1447    
1448     #ifdef SUPPORT_UTF8
1449     CHECK_CLASS_REPEAT:
1450     #endif
1451    
1452     switch (*ccode)
1453     {
1454     case OP_CRSTAR: /* These could be empty; continue */
1455     case OP_CRMINSTAR:
1456     case OP_CRQUERY:
1457     case OP_CRMINQUERY:
1458     break;
1459    
1460     default: /* Non-repeat => class must match */
1461     case OP_CRPLUS: /* These repeats aren't empty */
1462     case OP_CRMINPLUS:
1463     return FALSE;
1464    
1465     case OP_CRRANGE:
1466     case OP_CRMINRANGE:
1467     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1468     break;
1469     }
1470     break;
1471    
1472     /* Opcodes that must match a character */
1473    
1474     case OP_PROP:
1475     case OP_NOTPROP:
1476     case OP_EXTUNI:
1477     case OP_NOT_DIGIT:
1478     case OP_DIGIT:
1479     case OP_NOT_WHITESPACE:
1480     case OP_WHITESPACE:
1481     case OP_NOT_WORDCHAR:
1482     case OP_WORDCHAR:
1483     case OP_ANY:
1484     case OP_ANYBYTE:
1485     case OP_CHAR:
1486     case OP_CHARNC:
1487     case OP_NOT:
1488     case OP_PLUS:
1489     case OP_MINPLUS:
1490 nigel 93 case OP_POSPLUS:
1491 nigel 77 case OP_EXACT:
1492     case OP_NOTPLUS:
1493     case OP_NOTMINPLUS:
1494 nigel 93 case OP_NOTPOSPLUS:
1495 nigel 77 case OP_NOTEXACT:
1496     case OP_TYPEPLUS:
1497     case OP_TYPEMINPLUS:
1498 nigel 93 case OP_TYPEPOSPLUS:
1499 nigel 77 case OP_TYPEEXACT:
1500     return FALSE;
1501    
1502     /* End of branch */
1503    
1504     case OP_KET:
1505     case OP_KETRMAX:
1506     case OP_KETRMIN:
1507     case OP_ALT:
1508     return TRUE;
1509    
1510 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1511     MINUPTO, and POSUPTO may be followed by a multibyte character */
1512 nigel 77
1513     #ifdef SUPPORT_UTF8
1514     case OP_STAR:
1515     case OP_MINSTAR:
1516 nigel 93 case OP_POSSTAR:
1517 nigel 77 case OP_QUERY:
1518     case OP_MINQUERY:
1519 nigel 93 case OP_POSQUERY:
1520 nigel 77 case OP_UPTO:
1521     case OP_MINUPTO:
1522 nigel 93 case OP_POSUPTO:
1523 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1524     break;
1525     #endif
1526     }
1527     }
1528    
1529     return TRUE;
1530     }
1531    
1532    
1533    
1534     /*************************************************
1535     * Scan compiled regex for non-emptiness *
1536     *************************************************/
1537    
1538     /* This function is called to check for left recursive calls. We want to check
1539     the current branch of the current pattern to see if it could match the empty
1540     string. If it could, we must look outwards for branches at other levels,
1541     stopping when we pass beyond the bracket which is the subject of the recursion.
1542    
1543     Arguments:
1544     code points to start of the recursion
1545     endcode points to where to stop (current RECURSE item)
1546     bcptr points to the chain of current (unclosed) branch starts
1547     utf8 TRUE if in UTF-8 mode
1548    
1549     Returns: TRUE if what is matched could be empty
1550     */
1551    
1552     static BOOL
1553     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1554     BOOL utf8)
1555     {
1556     while (bcptr != NULL && bcptr->current >= code)
1557     {
1558     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1559     bcptr = bcptr->outer;
1560     }
1561     return TRUE;
1562     }
1563    
1564    
1565    
1566     /*************************************************
1567     * Check for POSIX class syntax *
1568     *************************************************/
1569    
1570     /* This function is called when the sequence "[:" or "[." or "[=" is
1571     encountered in a character class. It checks whether this is followed by an
1572     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1573     ".]" or "=]".
1574    
1575     Argument:
1576     ptr pointer to the initial [
1577     endptr where to return the end pointer
1578     cd pointer to compile data
1579    
1580     Returns: TRUE or FALSE
1581     */
1582    
1583     static BOOL
1584     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1585     {
1586     int terminator; /* Don't combine these lines; the Solaris cc */
1587     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1588     if (*(++ptr) == '^') ptr++;
1589     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1590     if (*ptr == terminator && ptr[1] == ']')
1591     {
1592     *endptr = ptr;
1593     return TRUE;
1594     }
1595     return FALSE;
1596     }
1597    
1598    
1599    
1600    
1601     /*************************************************
1602     * Check POSIX class name *
1603     *************************************************/
1604    
1605     /* This function is called to check the name given in a POSIX-style class entry
1606     such as [:alnum:].
1607    
1608     Arguments:
1609     ptr points to the first letter
1610     len the length of the name
1611    
1612     Returns: a value representing the name, or -1 if unknown
1613     */
1614    
1615     static int
1616     check_posix_name(const uschar *ptr, int len)
1617     {
1618     register int yield = 0;
1619     while (posix_name_lengths[yield] != 0)
1620     {
1621     if (len == posix_name_lengths[yield] &&
1622     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1623     yield++;
1624     }
1625     return -1;
1626     }
1627    
1628    
1629     /*************************************************
1630     * Adjust OP_RECURSE items in repeated group *
1631     *************************************************/
1632    
1633     /* OP_RECURSE items contain an offset from the start of the regex to the group
1634     that is referenced. This means that groups can be replicated for fixed
1635     repetition simply by copying (because the recursion is allowed to refer to
1636     earlier groups that are outside the current group). However, when a group is
1637     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1638     it, after it has been compiled. This means that any OP_RECURSE items within it
1639     that refer to the group itself or any contained groups have to have their
1640 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1641     the partially compiled regex must be temporarily terminated with OP_END.
1642 nigel 77
1643 nigel 93 This function has been extended with the possibility of forward references for
1644     recursions and subroutine calls. It must also check the list of such references
1645     for the group we are dealing with. If it finds that one of the recursions in
1646     the current group is on this list, it adjusts the offset in the list, not the
1647     value in the reference (which is a group number).
1648    
1649 nigel 77 Arguments:
1650     group points to the start of the group
1651     adjust the amount by which the group is to be moved
1652     utf8 TRUE in UTF-8 mode
1653     cd contains pointers to tables etc.
1654 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1655 nigel 77
1656     Returns: nothing
1657     */
1658    
1659     static void
1660 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1661     uschar *save_hwm)
1662 nigel 77 {
1663     uschar *ptr = group;
1664     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1665     {
1666 nigel 93 int offset;
1667     uschar *hc;
1668    
1669     /* See if this recursion is on the forward reference list. If so, adjust the
1670     reference. */
1671    
1672     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1673     {
1674     offset = GET(hc, 0);
1675     if (cd->start_code + offset == ptr + 1)
1676     {
1677     PUT(hc, 0, offset + adjust);
1678     break;
1679     }
1680     }
1681    
1682     /* Otherwise, adjust the recursion offset if it's after the start of this
1683     group. */
1684    
1685     if (hc >= cd->hwm)
1686     {
1687     offset = GET(ptr, 1);
1688     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1689     }
1690    
1691 nigel 77 ptr += 1 + LINK_SIZE;
1692     }
1693     }
1694    
1695    
1696    
1697     /*************************************************
1698     * Insert an automatic callout point *
1699     *************************************************/
1700    
1701     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1702     callout points before each pattern item.
1703    
1704     Arguments:
1705     code current code pointer
1706     ptr current pattern pointer
1707     cd pointers to tables etc
1708    
1709     Returns: new code pointer
1710     */
1711    
1712     static uschar *
1713     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1714     {
1715     *code++ = OP_CALLOUT;
1716     *code++ = 255;
1717     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1718     PUT(code, LINK_SIZE, 0); /* Default length */
1719     return code + 2*LINK_SIZE;
1720     }
1721    
1722    
1723    
1724     /*************************************************
1725     * Complete a callout item *
1726     *************************************************/
1727    
1728     /* A callout item contains the length of the next item in the pattern, which
1729     we can't fill in till after we have reached the relevant point. This is used
1730     for both automatic and manual callouts.
1731    
1732     Arguments:
1733     previous_callout points to previous callout item
1734     ptr current pattern pointer
1735     cd pointers to tables etc
1736    
1737     Returns: nothing
1738     */
1739    
1740     static void
1741     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1742     {
1743     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1744     PUT(previous_callout, 2 + LINK_SIZE, length);
1745     }
1746    
1747    
1748    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range. It scans upwards
from the start for the first character that has an "other case", and then
extends the result for as long as the other cases of successive characters are
themselves consecutive. Each call yields one such run of other-case values and
advances the start value past the characters it has consumed, so repeated
calls walk through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int oc_first = 0;
unsigned int oc_expected;

/* Look for the first character in the range that has an other case. */

while (ch <= d)
  {
  oc_first = _pcre_ucp_othercase(ch);
  if (oc_first != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;     /* Nothing (more) in this range */

/* Extend the run while each following character maps to the next value. */

*ocptr = oc_first;
oc_expected = oc_first + 1;
ch++;

while (ch <= d && _pcre_ucp_othercase(ch) == oc_expected)
  {
  oc_expected++;
  ch++;
  }

*odptr = oc_expected - 1;     /* Last other-case value in the run */
*cptr = ch;                   /* Where the next call starts */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1794    
1795    
1796 nigel 93
1797 nigel 77 /*************************************************
1798 nigel 93 * Check if auto-possessifying is possible *
1799     *************************************************/
1800    
1801     /* This function is called for unlimited repeats of certain items, to see
1802     whether the next thing could possibly match the repeated item. If not, it makes
1803     sense to automatically possessify the repeated item.
1804    
1805     Arguments:
1806     op_code the repeated op code
1807     this data for this item, depends on the opcode
1808     utf8 TRUE in UTF-8 mode
1809     utf8_char used for utf8 character bytes, NULL if not relevant
1810     ptr next character in pattern
1811     options options bits
1812     cd contains pointers to tables etc.
1813    
1814     Returns: TRUE if possessifying is wanted
1815     */
1816    
1817     static BOOL
1818     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1819     const uschar *ptr, int options, compile_data *cd)
1820     {
1821     int next;
1822    
1823     /* Skip whitespace and comments in extended mode */
1824    
1825     if ((options & PCRE_EXTENDED) != 0)
1826     {
1827     for (;;)
1828     {
1829     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1830     if (*ptr == '#')
1831     {
1832     while (*(++ptr) != 0)
1833     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1834     }
1835     else break;
1836     }
1837     }
1838    
1839     /* If the next item is one that we can handle, get its value. A non-negative
1840     value is a character, a negative value is an escape value. */
1841    
1842     if (*ptr == '\\')
1843     {
1844     int temperrorcode = 0;
1845     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1846     if (temperrorcode != 0) return FALSE;
1847     ptr++; /* Point after the escape sequence */
1848     }
1849    
1850     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1851     {
1852     #ifdef SUPPORT_UTF8
1853     if (utf8) { GETCHARINC(next, ptr); } else
1854     #endif
1855     next = *ptr++;
1856     }
1857    
1858     else return FALSE;
1859    
1860     /* Skip whitespace and comments in extended mode */
1861    
1862     if ((options & PCRE_EXTENDED) != 0)
1863     {
1864     for (;;)
1865     {
1866     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1867     if (*ptr == '#')
1868     {
1869     while (*(++ptr) != 0)
1870     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1871     }
1872     else break;
1873     }
1874     }
1875    
1876     /* If the next thing is itself optional, we have to give up. */
1877    
1878     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1879     return FALSE;
1880    
1881     /* Now compare the next item with the previous opcode. If the previous is a
1882     positive single character match, "item" either contains the character or, if
1883     "item" is greater than 127 in utf8 mode, the character's bytes are in
1884     utf8_char. */
1885    
1886    
1887     /* Handle cases when the next item is a character. */
1888    
1889     if (next >= 0) switch(op_code)
1890     {
1891     case OP_CHAR:
1892     #ifdef SUPPORT_UTF8
1893     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894     #endif
1895     return item != next;
1896    
1897     /* For CHARNC (caseless character) we must check the other case. If we have
1898     Unicode property support, we can use it to test the other case of
1899     high-valued characters. */
1900    
1901     case OP_CHARNC:
1902     #ifdef SUPPORT_UTF8
1903     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1904     #endif
1905     if (item == next) return FALSE;
1906     #ifdef SUPPORT_UTF8
1907     if (utf8)
1908     {
1909     unsigned int othercase;
1910     if (next < 128) othercase = cd->fcc[next]; else
1911     #ifdef SUPPORT_UCP
1912     othercase = _pcre_ucp_othercase((unsigned int)next);
1913     #else
1914     othercase = NOTACHAR;
1915     #endif
1916     return (unsigned int)item != othercase;
1917     }
1918     else
1919     #endif /* SUPPORT_UTF8 */
1920     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1921    
1922     /* For OP_NOT, "item" must be a single-byte character. */
1923    
1924     case OP_NOT:
1925     if (next < 0) return FALSE; /* Not a character */
1926     if (item == next) return TRUE;
1927     if ((options & PCRE_CASELESS) == 0) return FALSE;
1928     #ifdef SUPPORT_UTF8
1929     if (utf8)
1930     {
1931     unsigned int othercase;
1932     if (next < 128) othercase = cd->fcc[next]; else
1933     #ifdef SUPPORT_UCP
1934     othercase = _pcre_ucp_othercase(next);
1935     #else
1936     othercase = NOTACHAR;
1937     #endif
1938     return (unsigned int)item == othercase;
1939     }
1940     else
1941     #endif /* SUPPORT_UTF8 */
1942     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1943    
1944     case OP_DIGIT:
1945     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1946    
1947     case OP_NOT_DIGIT:
1948     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1949    
1950     case OP_WHITESPACE:
1951     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1952    
1953     case OP_NOT_WHITESPACE:
1954     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1955    
1956     case OP_WORDCHAR:
1957     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1958    
1959     case OP_NOT_WORDCHAR:
1960     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1961    
1962 ph10 180 case OP_HSPACE:
1963     case OP_NOT_HSPACE:
1964     switch(next)
1965     {
1966     case 0x09:
1967     case 0x20:
1968     case 0xa0:
1969     case 0x1680:
1970     case 0x180e:
1971     case 0x2000:
1972     case 0x2001:
1973     case 0x2002:
1974     case 0x2003:
1975     case 0x2004:
1976     case 0x2005:
1977     case 0x2006:
1978     case 0x2007:
1979     case 0x2008:
1980     case 0x2009:
1981     case 0x200A:
1982     case 0x202f:
1983     case 0x205f:
1984     case 0x3000:
1985     return op_code != OP_HSPACE;
1986     default:
1987     return op_code == OP_HSPACE;
1988     }
1989    
1990     case OP_VSPACE:
1991     case OP_NOT_VSPACE:
1992     switch(next)
1993     {
1994     case 0x0a:
1995     case 0x0b:
1996     case 0x0c:
1997     case 0x0d:
1998     case 0x85:
1999     case 0x2028:
2000     case 0x2029:
2001     return op_code != OP_VSPACE;
2002     default:
2003     return op_code == OP_VSPACE;
2004     }
2005    
2006 nigel 93 default:
2007     return FALSE;
2008     }
2009    
2010    
2011     /* Handle the case when the next item is \d, \s, etc. */
2012    
2013     switch(op_code)
2014     {
2015     case OP_CHAR:
2016     case OP_CHARNC:
2017     #ifdef SUPPORT_UTF8
2018     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2019     #endif
2020     switch(-next)
2021     {
2022     case ESC_d:
2023     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2024    
2025     case ESC_D:
2026     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2027    
2028     case ESC_s:
2029     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2030    
2031     case ESC_S:
2032     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2033    
2034     case ESC_w:
2035     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2036    
2037     case ESC_W:
2038     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2039 ph10 182
2040 ph10 180 case ESC_h:
2041     case ESC_H:
2042     switch(item)
2043     {
2044     case 0x09:
2045     case 0x20:
2046     case 0xa0:
2047     case 0x1680:
2048     case 0x180e:
2049     case 0x2000:
2050     case 0x2001:
2051     case 0x2002:
2052     case 0x2003:
2053     case 0x2004:
2054     case 0x2005:
2055     case 0x2006:
2056     case 0x2007:
2057     case 0x2008:
2058     case 0x2009:
2059     case 0x200A:
2060     case 0x202f:
2061     case 0x205f:
2062     case 0x3000:
2063     return -next != ESC_h;
2064     default:
2065     return -next == ESC_h;
2066 ph10 182 }
2067    
2068 ph10 180 case ESC_v:
2069     case ESC_V:
2070     switch(item)
2071     {
2072     case 0x0a:
2073     case 0x0b:
2074     case 0x0c:
2075     case 0x0d:
2076     case 0x85:
2077     case 0x2028:
2078     case 0x2029:
2079     return -next != ESC_v;
2080     default:
2081     return -next == ESC_v;
2082 ph10 182 }
2083 nigel 93
2084     default:
2085     return FALSE;
2086     }
2087    
2088     case OP_DIGIT:
2089 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2090     next == -ESC_h || next == -ESC_v;
2091 nigel 93
2092     case OP_NOT_DIGIT:
2093     return next == -ESC_d;
2094    
2095     case OP_WHITESPACE:
2096     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2097    
2098     case OP_NOT_WHITESPACE:
2099 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2100 nigel 93
2101 ph10 180 case OP_HSPACE:
2102     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2103    
2104     case OP_NOT_HSPACE:
2105     return next == -ESC_h;
2106 ph10 182
2107 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2108 ph10 182 case OP_VSPACE:
2109 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2110    
2111     case OP_NOT_VSPACE:
2112 ph10 182 return next == -ESC_v;
2113 ph10 180
2114 nigel 93 case OP_WORDCHAR:
2115 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2116 nigel 93
2117     case OP_NOT_WORDCHAR:
2118     return next == -ESC_w || next == -ESC_d;
2119 ph10 182
2120 nigel 93 default:
2121     return FALSE;
2122     }
2123    
2124     /* Control does not reach here */
2125     }
2126    
2127    
2128    
2129     /*************************************************
2130 nigel 77 * Compile one branch *
2131     *************************************************/
2132    
2133 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2134 nigel 77 changed during the branch, the pointer is used to change the external options
2135 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2136     to find out the amount of memory needed, as well as during the real compile
2137     phase. The value of lengthptr distinguishes the two phases.
2138 nigel 77
2139     Arguments:
2140     optionsptr pointer to the option bits
2141     codeptr points to the pointer to the current code point
2142     ptrptr points to the current pattern pointer
2143     errorcodeptr points to error code variable
2144     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2145     reqbyteptr set to the last literal character required, else < 0
2146     bcptr points to current branch chain
2147     cd contains pointers to tables etc.
2148 nigel 93 lengthptr NULL during the real compile phase
2149     points to length accumulator during pre-compile phase
2150 nigel 77
2151     Returns: TRUE on success
2152     FALSE, with *errorcodeptr set non-zero on error
2153     */
2154    
2155     static BOOL
2156 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2157     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2158     compile_data *cd, int *lengthptr)
2159 nigel 77 {
2160     int repeat_type, op_type;
2161     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2162     int bravalue = 0;
2163     int greedy_default, greedy_non_default;
2164     int firstbyte, reqbyte;
2165     int zeroreqbyte, zerofirstbyte;
2166     int req_caseopt, reqvary, tempreqvary;
2167     int options = *optionsptr;
2168     int after_manual_callout = 0;
2169 nigel 93 int length_prevgroup = 0;
2170 nigel 77 register int c;
2171     register uschar *code = *codeptr;
2172 nigel 93 uschar *last_code = code;
2173     uschar *orig_code = code;
2174 nigel 77 uschar *tempcode;
2175     BOOL inescq = FALSE;
2176     BOOL groupsetfirstbyte = FALSE;
2177     const uschar *ptr = *ptrptr;
2178     const uschar *tempptr;
2179     uschar *previous = NULL;
2180     uschar *previous_callout = NULL;
2181 nigel 93 uschar *save_hwm = NULL;
2182 nigel 77 uschar classbits[32];
2183    
2184     #ifdef SUPPORT_UTF8
2185     BOOL class_utf8;
2186     BOOL utf8 = (options & PCRE_UTF8) != 0;
2187     uschar *class_utf8data;
2188     uschar utf8_char[6];
2189     #else
2190     BOOL utf8 = FALSE;
2191 nigel 93 uschar *utf8_char = NULL;
2192 nigel 77 #endif
2193    
2194 nigel 93 #ifdef DEBUG
2195     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2196     #endif
2197    
2198 nigel 77 /* Set up the default and non-default settings for greediness */
2199    
2200     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2201     greedy_non_default = greedy_default ^ 1;
2202    
2203     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2204     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2205     matches a non-fixed char first char; reqbyte just remains unset if we never
2206     find one.
2207    
2208     When we hit a repeat whose minimum is zero, we may have to adjust these values
2209     to take the zero repeat into account. This is implemented by setting them to
2210     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2211     item types that can be repeated set these backoff variables appropriately. */
2212    
2213     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2214    
2215     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2216     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2217     value > 255. It is added into the firstbyte or reqbyte variables to record the
2218     case status of the value. This is used only for ASCII characters. */
2219    
2220     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2221    
2222     /* Switch on next character until the end of the branch */
2223    
2224     for (;; ptr++)
2225     {
2226     BOOL negate_class;
2227     BOOL possessive_quantifier;
2228     BOOL is_quantifier;
2229 nigel 93 BOOL is_recurse;
2230 ph10 180 BOOL reset_bracount;
2231 nigel 77 int class_charcount;
2232     int class_lastchar;
2233     int newoptions;
2234     int recno;
2235 ph10 172 int refsign;
2236 nigel 77 int skipbytes;
2237     int subreqbyte;
2238     int subfirstbyte;
2239 nigel 93 int terminator;
2240 nigel 77 int mclength;
2241     uschar mcbuffer[8];
2242    
2243 nigel 93 /* Get next byte in the pattern */
2244 nigel 77
2245     c = *ptr;
2246    
2247 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2248     previous cycle of this loop. */
2249    
2250     if (lengthptr != NULL)
2251     {
2252     #ifdef DEBUG
2253     if (code > cd->hwm) cd->hwm = code; /* High water info */
2254     #endif
2255     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2256     {
2257     *errorcodeptr = ERR52;
2258     goto FAILED;
2259     }
2260    
2261     /* There is at least one situation where code goes backwards: this is the
2262     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2263     the class is simply eliminated. However, it is created first, so we have to
2264     allow memory for it. Therefore, don't ever reduce the length at this point.
2265     */
2266    
2267     if (code < last_code) code = last_code;
2268 ph10 202
2269     /* Paranoid check for integer overflow */
2270    
2271     if (OFLOW_MAX - *lengthptr < code - last_code)
2272     {
2273     *errorcodeptr = ERR20;
2274     goto FAILED;
2275     }
2276    
2277 nigel 93 *lengthptr += code - last_code;
2278     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2279    
2280     /* If "previous" is set and it is not at the start of the work space, move
2281     it back to there, in order to avoid filling up the work space. Otherwise,
2282     if "previous" is NULL, reset the current code pointer to the start. */
2283    
2284     if (previous != NULL)
2285     {
2286     if (previous > orig_code)
2287     {
2288     memmove(orig_code, previous, code - previous);
2289     code -= previous - orig_code;
2290     previous = orig_code;
2291     }
2292     }
2293     else code = orig_code;
2294    
2295     /* Remember where this code item starts so we can pick up the length
2296     next time round. */
2297    
2298     last_code = code;
2299     }
2300    
2301     /* In the real compile phase, just check the workspace used by the forward
2302     reference list. */
2303    
2304     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2305     {
2306     *errorcodeptr = ERR52;
2307     goto FAILED;
2308     }
2309    
2310 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2311    
2312     if (inescq && c != 0)
2313     {
2314     if (c == '\\' && ptr[1] == 'E')
2315     {
2316     inescq = FALSE;
2317     ptr++;
2318     continue;
2319     }
2320     else
2321     {
2322     if (previous_callout != NULL)
2323     {
2324 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2325     complete_callout(previous_callout, ptr, cd);
2326 nigel 77 previous_callout = NULL;
2327     }
2328     if ((options & PCRE_AUTO_CALLOUT) != 0)
2329     {
2330     previous_callout = code;
2331     code = auto_callout(code, ptr, cd);
2332     }
2333     goto NORMAL_CHAR;
2334     }
2335     }
2336    
2337     /* Fill in length of a previous callout, except when the next thing is
2338     a quantifier. */
2339    
2340     is_quantifier = c == '*' || c == '+' || c == '?' ||
2341     (c == '{' && is_counted_repeat(ptr+1));
2342    
2343     if (!is_quantifier && previous_callout != NULL &&
2344     after_manual_callout-- <= 0)
2345     {
2346 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2347     complete_callout(previous_callout, ptr, cd);
2348 nigel 77 previous_callout = NULL;
2349     }
2350    
2351     /* In extended mode, skip white space and comments */
2352    
2353     if ((options & PCRE_EXTENDED) != 0)
2354     {
2355     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2356     if (c == '#')
2357     {
2358 nigel 93 while (*(++ptr) != 0)
2359 nigel 91 {
2360 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2361 nigel 91 }
2362 nigel 93 if (*ptr != 0) continue;
2363    
2364 nigel 91 /* Else fall through to handle end of string */
2365     c = 0;
2366 nigel 77 }
2367     }
2368    
2369     /* No auto callout for quantifiers. */
2370    
2371     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2372     {
2373     previous_callout = code;
2374     code = auto_callout(code, ptr, cd);
2375     }
2376    
2377     switch(c)
2378     {
2379 nigel 93 /* ===================================================================*/
2380     case 0: /* The branch terminates at string end */
2381     case '|': /* or | or ) */
2382 nigel 77 case ')':
2383     *firstbyteptr = firstbyte;
2384     *reqbyteptr = reqbyte;
2385     *codeptr = code;
2386     *ptrptr = ptr;
2387 nigel 93 if (lengthptr != NULL)
2388     {
2389 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2390     {
2391     *errorcodeptr = ERR20;
2392     goto FAILED;
2393     }
2394 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2395     DPRINTF((">> end branch\n"));
2396     }
2397 nigel 77 return TRUE;
2398    
2399 nigel 93
2400     /* ===================================================================*/
2401 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2402     the setting of any following char as a first character. */
2403    
2404     case '^':
2405     if ((options & PCRE_MULTILINE) != 0)
2406     {
2407     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2408     }
2409     previous = NULL;
2410     *code++ = OP_CIRC;
2411     break;
2412    
2413     case '$':
2414     previous = NULL;
2415     *code++ = OP_DOLL;
2416     break;
2417    
2418     /* There can never be a first char if '.' is first, whatever happens about
2419     repeats. The value of reqbyte doesn't change either. */
2420    
2421     case '.':
2422     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2423     zerofirstbyte = firstbyte;
2424     zeroreqbyte = reqbyte;
2425     previous = code;
2426     *code++ = OP_ANY;
2427     break;
2428    
2429 nigel 93
2430     /* ===================================================================*/
2431 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2432     32-byte bitmap of the permitted characters, except in the special case
2433     where there is only one such character. For negated classes, we build the
2434     map as usual, then invert it at the end. However, we use a different opcode
2435     so that data characters > 255 can be handled correctly.
2436 nigel 77
2437     If the class contains characters outside the 0-255 range, a different
2438     opcode is compiled. It may optionally have a bit map for characters < 256,
2439     but those above are are explicitly listed afterwards. A flag byte tells
2440     whether the bitmap is present, and whether this is a negated class or not.
2441     */
2442    
2443     case '[':
2444     previous = code;
2445    
2446     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2447     they are encountered at the top level, so we'll do that too. */
2448    
2449     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2450     check_posix_syntax(ptr, &tempptr, cd))
2451     {
2452     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2453     goto FAILED;
2454     }
2455    
2456     /* If the first character is '^', set the negation flag and skip it. */
2457    
2458     if ((c = *(++ptr)) == '^')
2459     {
2460     negate_class = TRUE;
2461     c = *(++ptr);
2462     }
2463     else
2464     {
2465     negate_class = FALSE;
2466     }
2467    
2468     /* Keep a count of chars with values < 256 so that we can optimize the case
2469 nigel 93 of just a single character (as long as it's < 256). However, For higher
2470     valued UTF-8 characters, we don't yet do any optimization. */
2471 nigel 77
2472     class_charcount = 0;
2473     class_lastchar = -1;
2474    
2475 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2476     temporary bit of memory, in case the class contains only 1 character (less
2477     than 256), because in that case the compiled code doesn't use the bit map.
2478     */
2479    
2480     memset(classbits, 0, 32 * sizeof(uschar));
2481    
2482 nigel 77 #ifdef SUPPORT_UTF8
2483     class_utf8 = FALSE; /* No chars >= 256 */
2484 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2485 nigel 77 #endif
2486    
2487     /* Process characters until ] is reached. By writing this as a "do" it
2488 nigel 93 means that an initial ] is taken as a data character. At the start of the
2489     loop, c contains the first byte of the character. */
2490 nigel 77
2491 nigel 93 if (c != 0) do
2492 nigel 77 {
2493 nigel 93 const uschar *oldptr;
2494    
2495 nigel 77 #ifdef SUPPORT_UTF8
2496     if (utf8 && c > 127)
2497     { /* Braces are required because the */
2498     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2499     }
2500     #endif
2501    
2502     /* Inside \Q...\E everything is literal except \E */
2503    
2504     if (inescq)
2505     {
2506 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2507 nigel 77 {
2508 nigel 93 inescq = FALSE; /* Reset literal state */
2509     ptr++; /* Skip the 'E' */
2510     continue; /* Carry on with next */
2511 nigel 77 }
2512 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2513 nigel 77 }
2514    
2515     /* Handle POSIX class names. Perl allows a negation extension of the
2516     form [:^name:]. A square bracket that doesn't match the syntax is
2517     treated as a literal. We also recognize the POSIX constructions
2518     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2519     5.6 and 5.8 do. */
2520    
2521     if (c == '[' &&
2522     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2523     check_posix_syntax(ptr, &tempptr, cd))
2524     {
2525     BOOL local_negate = FALSE;
2526 nigel 87 int posix_class, taboffset, tabopt;
2527 nigel 77 register const uschar *cbits = cd->cbits;
2528 nigel 87 uschar pbits[32];
2529 nigel 77
2530     if (ptr[1] != ':')
2531     {
2532     *errorcodeptr = ERR31;
2533     goto FAILED;
2534     }
2535    
2536     ptr += 2;
2537     if (*ptr == '^')
2538     {
2539     local_negate = TRUE;
2540     ptr++;
2541     }
2542    
2543     posix_class = check_posix_name(ptr, tempptr - ptr);
2544     if (posix_class < 0)
2545     {
2546     *errorcodeptr = ERR30;
2547     goto FAILED;
2548     }
2549    
2550     /* If matching is caseless, upper and lower are converted to
2551     alpha. This relies on the fact that the class table starts with
2552     alpha, lower, upper as the first 3 entries. */
2553    
2554     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2555     posix_class = 0;
2556    
2557 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2558     because we may be adding and subtracting from it, and we don't want to
2559     subtract bits that may be in the main map already. At the end we or the
2560     result into the bit map that is being built. */
2561 nigel 77
2562     posix_class *= 3;
2563 nigel 87
2564     /* Copy in the first table (always present) */
2565    
2566     memcpy(pbits, cbits + posix_class_maps[posix_class],
2567     32 * sizeof(uschar));
2568    
2569     /* If there is a second table, add or remove it as required. */
2570    
2571     taboffset = posix_class_maps[posix_class + 1];
2572     tabopt = posix_class_maps[posix_class + 2];
2573    
2574     if (taboffset >= 0)
2575 nigel 77 {
2576 nigel 87 if (tabopt >= 0)
2577     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2578 nigel 77 else
2579 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2580 nigel 77 }
2581    
2582 nigel 87 /* Not see if we need to remove any special characters. An option
2583     value of 1 removes vertical space and 2 removes underscore. */
2584    
2585     if (tabopt < 0) tabopt = -tabopt;
2586     if (tabopt == 1) pbits[1] &= ~0x3c;
2587     else if (tabopt == 2) pbits[11] &= 0x7f;
2588    
2589     /* Add the POSIX table or its complement into the main table that is
2590     being built and we are done. */
2591    
2592     if (local_negate)
2593     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2594     else
2595     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2596    
2597 nigel 77 ptr = tempptr + 1;
2598     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2599     continue; /* End of POSIX syntax handling */
2600     }
2601    
2602     /* Backslash may introduce a single character, or it may introduce one
2603 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2604     case. Inside a class (and only there) it is treated as backspace.
2605     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2606     to or into the one we are building. We assume they have more than one
2607 nigel 77 character in them, so set class_charcount bigger than one. */
2608    
2609     if (c == '\\')
2610     {
2611 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2612     if (*errorcodeptr != 0) goto FAILED;
2613 nigel 77
2614     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2615     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2616 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2617 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2618     {
2619     if (ptr[1] == '\\' && ptr[2] == 'E')
2620     {
2621     ptr += 2; /* avoid empty string */
2622     }
2623     else inescq = TRUE;
2624     continue;
2625     }
2626    
2627     if (c < 0)
2628     {
2629     register const uschar *cbits = cd->cbits;
2630     class_charcount += 2; /* Greater than 1 is what matters */
2631 nigel 93
2632     /* Save time by not doing this in the pre-compile phase. */
2633    
2634     if (lengthptr == NULL) switch (-c)
2635 nigel 77 {
2636     case ESC_d:
2637     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2638     continue;
2639    
2640     case ESC_D:
2641     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2642     continue;
2643    
2644     case ESC_w:
2645     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2646     continue;
2647    
2648     case ESC_W:
2649     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2650     continue;
2651    
2652     case ESC_s:
2653     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2654     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2655     continue;
2656    
2657     case ESC_S:
2658     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2659     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2660     continue;
2661    
2662 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2663     continue;
2664 ph10 180
2665 nigel 93 default: /* Not recognized; fall through */
2666     break; /* Need "default" setting to stop compiler warning. */
2667     }
2668    
2669     /* In the pre-compile phase, just do the recognition. */
2670    
2671     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2672     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2673 ph10 180
2674 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2675     they use extra memory. */
2676 ph10 180
2677 ph10 178 if (-c == ESC_h)
2678     {
2679     SETBIT(classbits, 0x09); /* VT */
2680     SETBIT(classbits, 0x20); /* SPACE */
2681 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2682 ph10 178 #ifdef SUPPORT_UTF8
2683     if (utf8)
2684 ph10 180 {
2685 ph10 178 class_utf8 = TRUE;
2686     *class_utf8data++ = XCL_SINGLE;
2687 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2688 ph10 178 *class_utf8data++ = XCL_SINGLE;
2689 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2690     *class_utf8data++ = XCL_RANGE;
2691     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2692     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2693 ph10 178 *class_utf8data++ = XCL_SINGLE;
2694 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2695 ph10 178 *class_utf8data++ = XCL_SINGLE;
2696 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2697 ph10 178 *class_utf8data++ = XCL_SINGLE;
2698 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2699     }
2700     #endif
2701     continue;
2702     }
2703 nigel 93
2704 ph10 178 if (-c == ESC_H)
2705     {
2706     for (c = 0; c < 32; c++)
2707     {
2708     int x = 0xff;
2709     switch (c)
2710 ph10 180 {
2711 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2712     case 0x20/8: x ^= 1 << (0x20%8); break;
2713     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2714     default: break;
2715     }
2716     classbits[c] |= x;
2717 ph10 180 }
2718    
2719 ph10 178 #ifdef SUPPORT_UTF8
2720     if (utf8)
2721 ph10 180 {
2722 ph10 178 class_utf8 = TRUE;
2723 ph10 180 *class_utf8data++ = XCL_RANGE;
2724     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2725     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2726     *class_utf8data++ = XCL_RANGE;
2727     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2728     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2729     *class_utf8data++ = XCL_RANGE;
2730     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2731     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2732     *class_utf8data++ = XCL_RANGE;
2733     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2734     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2735     *class_utf8data++ = XCL_RANGE;
2736     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2737     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2738     *class_utf8data++ = XCL_RANGE;
2739     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2740     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2741     *class_utf8data++ = XCL_RANGE;
2742     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2743     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2744     }
2745     #endif
2746     continue;
2747     }
2748 ph10 178
2749     if (-c == ESC_v)
2750     {
2751     SETBIT(classbits, 0x0a); /* LF */
2752     SETBIT(classbits, 0x0b); /* VT */
2753 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2754     SETBIT(classbits, 0x0d); /* CR */
2755     SETBIT(classbits, 0x85); /* NEL */
2756 ph10 178 #ifdef SUPPORT_UTF8
2757     if (utf8)
2758 ph10 180 {
2759 ph10 178 class_utf8 = TRUE;
2760 ph10 180 *class_utf8data++ = XCL_RANGE;
2761     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2762     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2763     }
2764     #endif
2765     continue;
2766     }
2767 ph10 178
2768     if (-c == ESC_V)
2769     {
2770     for (c = 0; c < 32; c++)
2771     {
2772     int x = 0xff;
2773     switch (c)
2774 ph10 180 {
2775 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2776     x ^= 1 << (0x0b%8);
2777     x ^= 1 << (0x0c%8);
2778 ph10 180 x ^= 1 << (0x0d%8);
2779 ph10 178 break;
2780     case 0x85/8: x ^= 1 << (0x85%8); break;
2781     default: break;
2782     }
2783     classbits[c] |= x;
2784 ph10 180 }
2785    
2786 ph10 178 #ifdef SUPPORT_UTF8
2787     if (utf8)
2788 ph10 180 {
2789 ph10 178 class_utf8 = TRUE;
2790 ph10 180 *class_utf8data++ = XCL_RANGE;
2791     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2792     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2793     *class_utf8data++ = XCL_RANGE;
2794     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2795     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2796     }
2797     #endif
2798     continue;
2799     }
2800 ph10 178
2801 nigel 93 /* We need to deal with \P and \p in both phases. */
2802    
2803 nigel 77 #ifdef SUPPORT_UCP
2804 nigel 93 if (-c == ESC_p || -c == ESC_P)
2805     {
2806     BOOL negated;
2807     int pdata;
2808     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2809     if (ptype < 0) goto FAILED;
2810     class_utf8 = TRUE;
2811     *class_utf8data++ = ((-c == ESC_p) != negated)?
2812     XCL_PROP : XCL_NOTPROP;
2813     *class_utf8data++ = ptype;
2814     *class_utf8data++ = pdata;
2815     class_charcount -= 2; /* Not a < 256 character */
2816 nigel 77 continue;
2817 nigel 93 }
2818 nigel 77 #endif
2819 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2820     strict mode. By default, for compatibility with Perl, they are
2821     treated as literals. */
2822 nigel 77
2823 nigel 93 if ((options & PCRE_EXTRA) != 0)
2824     {
2825     *errorcodeptr = ERR7;
2826     goto FAILED;
2827     }
2828 nigel 77
2829 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2830     c = *ptr; /* Get the final character and fall through */
2831 nigel 77 }
2832    
2833     /* Fall through if we have a single character (c >= 0). This may be
2834 nigel 93 greater than 256 in UTF-8 mode. */
2835 nigel 77
2836     } /* End of backslash handling */
2837    
2838     /* A single character may be followed by '-' to form a range. However,
2839     Perl does not permit ']' to be the end of the range. A '-' character
2840 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2841     entirely. The code for handling \Q and \E is messy. */
2842 nigel 77
2843 nigel 93 CHECK_RANGE:
2844     while (ptr[1] == '\\' && ptr[2] == 'E')
2845 nigel 77 {
2846 nigel 93 inescq = FALSE;
2847     ptr += 2;
2848     }
2849    
2850     oldptr = ptr;
2851    
2852     if (!inescq && ptr[1] == '-')
2853     {
2854 nigel 77 int d;
2855     ptr += 2;
2856 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2857 nigel 77
2858 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2859     mode. */
2860    
2861     while (*ptr == '\\' && ptr[1] == 'Q')
2862     {
2863     ptr += 2;
2864     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2865     inescq = TRUE;
2866     break;
2867     }
2868    
2869     if (*ptr == 0 || (!inescq && *ptr == ']'))
2870     {
2871     ptr = oldptr;
2872     goto LONE_SINGLE_CHARACTER;
2873     }
2874    
2875 nigel 77 #ifdef SUPPORT_UTF8
2876     if (utf8)
2877     { /* Braces are required because the */
2878     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2879     }
2880     else
2881     #endif
2882     d = *ptr; /* Not UTF-8 mode */
2883    
2884     /* The second part of a range can be a single-character escape, but
2885     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2886     in such circumstances. */
2887    
2888 nigel 93 if (!inescq && d == '\\')
2889 nigel 77 {
2890 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2891     if (*errorcodeptr != 0) goto FAILED;
2892 nigel 77
2893 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2894     special means the '-' was literal */
2895 nigel 77
2896     if (d < 0)
2897     {
2898     if (d == -ESC_b) d = '\b';
2899 nigel 93 else if (d == -ESC_X) d = 'X';
2900     else if (d == -ESC_R) d = 'R'; else
2901 nigel 77 {
2902 nigel 93 ptr = oldptr;
2903 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2904     }
2905     }
2906     }
2907    
2908 nigel 93 /* Check that the two values are in the correct order. Optimize
2909     one-character ranges */
2910 nigel 77
2911 nigel 93 if (d < c)
2912     {
2913     *errorcodeptr = ERR8;
2914     goto FAILED;
2915     }
2916    
2917 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2918    
2919     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2920     matching, we have to use an XCLASS with extra data items. Caseless
2921     matching for characters > 127 is available only if UCP support is
2922     available. */
2923    
2924     #ifdef SUPPORT_UTF8
2925     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2926     {
2927     class_utf8 = TRUE;
2928    
2929     /* With UCP support, we can find the other case equivalents of
2930     the relevant characters. There may be several ranges. Optimize how
2931     they fit with the basic range. */
2932    
2933     #ifdef SUPPORT_UCP
2934     if ((options & PCRE_CASELESS) != 0)
2935     {
2936 nigel 93 unsigned int occ, ocd;
2937     unsigned int cc = c;
2938     unsigned int origd = d;
2939 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2940     {
2941 ph10 180 if (occ >= (unsigned int)c &&
2942     ocd <= (unsigned int)d)
2943 ph10 176 continue; /* Skip embedded ranges */
2944 nigel 77
2945 ph10 180 if (occ < (unsigned int)c &&
2946 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2947 nigel 77 { /* if there is overlap, */
2948     c = occ; /* noting that if occ < c */
2949     continue; /* we can't have ocd > d */
2950     } /* because a subrange is */
2951 ph10 180 if (ocd > (unsigned int)d &&
2952 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2953 nigel 77 { /* the basic range. */
2954     d = ocd;
2955     continue;
2956     }
2957    
2958     if (occ == ocd)
2959     {
2960     *class_utf8data++ = XCL_SINGLE;
2961     }
2962     else
2963     {
2964     *class_utf8data++ = XCL_RANGE;
2965     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2966     }
2967     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2968     }
2969     }
2970     #endif /* SUPPORT_UCP */
2971    
2972     /* Now record the original range, possibly modified for UCP caseless
2973     overlapping ranges. */
2974    
2975     *class_utf8data++ = XCL_RANGE;
2976     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2977     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2978    
2979     /* With UCP support, we are done. Without UCP support, there is no
2980     caseless matching for UTF-8 characters > 127; we can use the bit map
2981     for the smaller ones. */
2982    
2983     #ifdef SUPPORT_UCP
2984     continue; /* With next character in the class */
2985     #else
2986     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2987    
2988     /* Adjust upper limit and fall through to set up the map */
2989    
2990     d = 127;
2991    
2992     #endif /* SUPPORT_UCP */
2993     }
2994     #endif /* SUPPORT_UTF8 */
2995    
2996     /* We use the bit map for all cases when not in UTF-8 mode; else
2997     ranges that lie entirely within 0-127 when there is UCP support; else
2998     for partial ranges without UCP support. */
2999    
3000 nigel 93 class_charcount += d - c + 1;
3001     class_lastchar = d;
3002    
3003     /* We can save a bit of time by skipping this in the pre-compile. */
3004    
3005     if (lengthptr == NULL) for (; c <= d; c++)
3006 nigel 77 {
3007     classbits[c/8] |= (1 << (c&7));
3008     if ((options & PCRE_CASELESS) != 0)
3009     {
3010     int uc = cd->fcc[c]; /* flip case */
3011     classbits[uc/8] |= (1 << (uc&7));
3012     }
3013     }
3014    
3015     continue; /* Go get the next char in the class */
3016     }
3017    
3018     /* Handle a lone single character - we can get here for a normal
3019     non-escape char, or after \ that introduces a single character or for an
3020     apparent range that isn't. */
3021    
3022     LONE_SINGLE_CHARACTER:
3023    
3024     /* Handle a character that cannot go in the bit map */
3025    
3026     #ifdef SUPPORT_UTF8
3027     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3028     {
3029     class_utf8 = TRUE;
3030     *class_utf8data++ = XCL_SINGLE;
3031     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3032    
3033     #ifdef SUPPORT_UCP
3034     if ((options & PCRE_CASELESS) != 0)
3035     {
3036 nigel 93 unsigned int othercase;
3037     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3038 nigel 77 {
3039     *class_utf8data++ = XCL_SINGLE;
3040     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3041     }
3042     }
3043     #endif /* SUPPORT_UCP */
3044    
3045     }
3046     else
3047     #endif /* SUPPORT_UTF8 */
3048    
3049     /* Handle a single-byte character */
3050     {
3051     classbits[c/8] |= (1 << (c&7));
3052     if ((options & PCRE_CASELESS) != 0)
3053     {
3054     c = cd->fcc[c]; /* flip case */
3055     classbits[c/8] |= (1 << (c&7));
3056     }
3057     class_charcount++;
3058     class_lastchar = c;
3059     }
3060     }
3061    
3062 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3063 nigel 77
3064 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3065 nigel 77
3066 nigel 93 if (c == 0) /* Missing terminating ']' */
3067     {
3068     *errorcodeptr = ERR6;
3069     goto FAILED;
3070     }
3071    
3072 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3073     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3074     can optimize the negative case only if there were no characters >= 128
3075     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3076     single-bytes only. This is an historical hangover. Maybe one day we can
3077     tidy these opcodes to handle multi-byte characters.
3078    
3079     The optimization throws away the bit map. We turn the item into a
3080     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3081     that OP_NOT does not support multibyte characters. In the positive case, it
3082     can cause firstbyte to be set. Otherwise, there can be no first char if
3083     this item is first, whatever repeat count may follow. In the case of
3084     reqbyte, save the previous value for reinstating. */
3085    
3086     #ifdef SUPPORT_UTF8
3087     if (class_charcount == 1 &&
3088     (!utf8 ||
3089     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3090    
3091     #else
3092     if (class_charcount == 1)
3093     #endif
3094     {
3095     zeroreqbyte = reqbyte;
3096    
3097     /* The OP_NOT opcode works on one-byte characters only. */
3098    
3099     if (negate_class)
3100     {
3101     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3102     zerofirstbyte = firstbyte;
3103     *code++ = OP_NOT;
3104     *code++ = class_lastchar;
3105     break;
3106     }
3107    
3108     /* For a single, positive character, get the value into mcbuffer, and
3109     then we can handle this with the normal one-character code. */
3110    
3111     #ifdef SUPPORT_UTF8
3112     if (utf8 && class_lastchar > 127)
3113     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3114     else
3115     #endif
3116     {
3117     mcbuffer[0] = class_lastchar;
3118     mclength = 1;
3119     }
3120     goto ONE_CHAR;
3121     } /* End of 1-char optimization */
3122    
3123     /* The general case - not the one-char optimization. If this is the first
3124     thing in the branch, there can be no first char setting, whatever the
3125     repeat count. Any reqbyte setting must remain unchanged after any kind of
3126     repeat. */
3127    
3128     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3129     zerofirstbyte = firstbyte;
3130     zeroreqbyte = reqbyte;
3131    
3132     /* If there are characters with values > 255, we have to compile an
3133     extended class, with its own opcode. If there are no characters < 256,
3134 nigel 93 we can omit the bitmap in the actual compiled code. */
3135 nigel 77
3136     #ifdef SUPPORT_UTF8
3137     if (class_utf8)
3138     {
3139     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3140     *code++ = OP_XCLASS;
3141     code += LINK_SIZE;
3142     *code = negate_class? XCL_NOT : 0;
3143    
3144 nigel 93 /* If the map is required, move up the extra data to make room for it;
3145     otherwise just move the code pointer to the end of the extra data. */
3146 nigel 77
3147     if (class_charcount > 0)
3148     {
3149     *code++ |= XCL_MAP;
3150 nigel 93 memmove(code + 32, code, class_utf8data - code);
3151 nigel 77 memcpy(code, classbits, 32);
3152 nigel 93 code = class_utf8data + 32;
3153 nigel 77 }
3154 nigel 93 else code = class_utf8data;
3155 nigel 77
3156     /* Now fill in the complete length of the item */
3157    
3158     PUT(previous, 1, code - previous);
3159     break; /* End of class handling */
3160     }
3161     #endif
3162    
3163     /* If there are no characters > 255, negate the 32-byte map if necessary,
3164     and copy it into the code vector. If this is the first thing in the branch,
3165     there can be no first char setting, whatever the repeat count. Any reqbyte
3166     setting must remain unchanged after any kind of repeat. */
3167    
3168     if (negate_class)
3169     {
3170     *code++ = OP_NCLASS;
3171 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3172     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3173 nigel 77 }
3174     else
3175     {
3176     *code++ = OP_CLASS;
3177     memcpy(code, classbits, 32);
3178     }
3179     code += 32;
3180     break;
3181    
3182 nigel 93
3183     /* ===================================================================*/
3184 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3185     has been tested above. */
3186    
3187     case '{':
3188     if (!is_quantifier) goto NORMAL_CHAR;
3189     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3190     if (*errorcodeptr != 0) goto FAILED;
3191     goto REPEAT;
3192    
3193     case '*':
3194     repeat_min = 0;
3195     repeat_max = -1;
3196     goto REPEAT;
3197    
3198     case '+':
3199     repeat_min = 1;
3200     repeat_max = -1;
3201     goto REPEAT;
3202    
3203     case '?':
3204     repeat_min = 0;
3205     repeat_max = 1;
3206    
3207     REPEAT:
3208     if (previous == NULL)
3209     {
3210     *errorcodeptr = ERR9;
3211     goto FAILED;
3212     }
3213    
3214     if (repeat_min == 0)
3215     {
3216     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3217     reqbyte = zeroreqbyte; /* Ditto */
3218     }
3219    
3220     /* Remember whether this is a variable length repeat */
3221    
3222     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3223    
3224     op_type = 0; /* Default single-char op codes */
3225     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3226    
3227     /* Save start of previous item, in case we have to move it up to make space
3228     for an inserted OP_ONCE for the additional '+' extension. */
3229    
3230     tempcode = previous;
3231    
3232     /* If the next character is '+', we have a possessive quantifier. This
3233     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3234     If the next character is '?' this is a minimizing repeat, by default,
3235     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3236     repeat type to the non-default. */
3237    
3238     if (ptr[1] == '+')
3239     {
3240     repeat_type = 0; /* Force greedy */
3241     possessive_quantifier = TRUE;
3242     ptr++;
3243     }
3244     else if (ptr[1] == '?')
3245     {
3246     repeat_type = greedy_non_default;
3247     ptr++;
3248     }
3249     else repeat_type = greedy_default;
3250    
3251     /* If previous was a character match, abolish the item and generate a
3252     repeat item instead. If a char item has a minimum of more than one, ensure
3253     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3254     the first thing in a branch because the x will have gone into firstbyte
3255     instead. */
3256    
3257     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3258     {
3259     /* Deal with UTF-8 characters that take up more than one byte. It's
3260     easier to write this out separately than try to macrify it. Use c to
3261     hold the length of the character in bytes, plus 0x80 to flag that it's a
3262     length rather than a small character. */
3263    
3264     #ifdef SUPPORT_UTF8
3265     if (utf8 && (code[-1] & 0x80) != 0)
3266     {
3267     uschar *lastchar = code - 1;
3268     while((*lastchar & 0xc0) == 0x80) lastchar--;
3269     c = code - lastchar; /* Length of UTF-8 character */
3270     memcpy(utf8_char, lastchar, c); /* Save the char */
3271     c |= 0x80; /* Flag c as a length */
3272     }
3273     else
3274     #endif
3275    
3276     /* Handle the case of a single byte - either with no UTF8 support, or
3277     with UTF-8 disabled, or for a UTF-8 character < 128. */
3278    
3279     {
3280     c = code[-1];
3281     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3282     }
3283    
3284 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3285     the line is something that cannot possibly match this character. If so,
3286     automatically possessifying this item gains some performance in the case
3287     where the match fails. */
3288    
3289     if (!possessive_quantifier &&
3290     repeat_max < 0 &&
3291     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3292     options, cd))
3293     {
3294     repeat_type = 0; /* Force greedy */
3295     possessive_quantifier = TRUE;
3296     }
3297    
3298 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3299     }
3300    
3301     /* If previous was a single negated character ([^a] or similar), we use
3302     one of the special opcodes, replacing it. The code is shared with single-
3303     character repeats by setting op_type to add a suitable offset into
3304 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3305     currently used only for single-byte chars. */
3306 nigel 77
3307     else if (*previous == OP_NOT)
3308     {
3309     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3310     c = previous[1];
3311 nigel 93 if (!possessive_quantifier &&
3312     repeat_max < 0 &&
3313     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3314     {
3315     repeat_type = 0; /* Force greedy */
3316     possessive_quantifier = TRUE;
3317     }
3318 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3319     }
3320    
3321     /* If previous was a character type match (\d or similar), abolish it and
3322     create a suitable repeat item. The code is shared with single-character
3323     repeats by setting op_type to add a suitable offset into repeat_type. Note
3324     that the Unicode property types will be present only when SUPPORT_UCP is
3325     defined, but we don't wrap the little bits of code here because it just
3326     makes it horribly messy. */
3327    
3328     else if (*previous < OP_EODN)
3329     {
3330     uschar *oldcode;
3331 nigel 87 int prop_type, prop_value;
3332 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3333     c = *previous;
3334    
3335 nigel 93 if (!possessive_quantifier &&
3336     repeat_max < 0 &&
3337     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3338     {
3339     repeat_type = 0; /* Force greedy */
3340     possessive_quantifier = TRUE;
3341     }
3342    
3343 nigel 77 OUTPUT_SINGLE_REPEAT:
3344 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3345     {
3346     prop_type = previous[1];
3347     prop_value = previous[2];
3348     }
3349     else prop_type = prop_value = -1;
3350 nigel 77
3351     oldcode = code;
3352     code = previous; /* Usually overwrite previous item */
3353    
3354     /* If the maximum is zero then the minimum must also be zero; Perl allows
3355     this case, so we do too - by simply omitting the item altogether. */
3356    
3357     if (repeat_max == 0) goto END_REPEAT;
3358    
3359     /* All real repeats make it impossible to handle partial matching (maybe
3360     one day we will be able to remove this restriction). */
3361    
3362     if (repeat_max != 1) cd->nopartial = TRUE;
3363    
3364     /* Combine the op_type with the repeat_type */
3365    
3366     repeat_type += op_type;
3367    
3368     /* A minimum of zero is handled either as the special case * or ?, or as
3369     an UPTO, with the maximum given. */
3370    
3371     if (repeat_min == 0)
3372     {
3373     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3374     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3375     else
3376     {
3377     *code++ = OP_UPTO + repeat_type;
3378     PUT2INC(code, 0, repeat_max);
3379     }
3380     }
3381    
3382     /* A repeat minimum of 1 is optimized into some special cases. If the
3383 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3384 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3385     one less than the maximum. */
3386    
3387     else if (repeat_min == 1)
3388     {
3389     if (repeat_max == -1)
3390     *code++ = OP_PLUS + repeat_type;
3391     else
3392     {
3393     code = oldcode; /* leave previous item in place */
3394     if (repeat_max == 1) goto END_REPEAT;
3395     *code++ = OP_UPTO + repeat_type;
3396     PUT2INC(code, 0, repeat_max - 1);
3397     }
3398     }
3399    
3400     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3401     handled as an EXACT followed by an UPTO. */
3402    
3403     else
3404     {
3405     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3406     PUT2INC(code, 0, repeat_min);
3407    
3408     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3409     we have to insert the character for the previous code. For a repeated
3410 nigel 87 Unicode property match, there are two extra bytes that define the
3411 nigel 77 required property. In UTF-8 mode, long characters have their length in
3412     c, with the 0x80 bit as a flag. */
3413    
3414     if (repeat_max < 0)
3415     {
3416     #ifdef SUPPORT_UTF8
3417     if (utf8 && c >= 128)
3418     {
3419     memcpy(code, utf8_char, c & 7);
3420     code += c & 7;
3421     }
3422     else
3423     #endif
3424     {
3425     *code++ = c;
3426 nigel 87 if (prop_type >= 0)
3427     {
3428     *code++ = prop_type;
3429     *code++ = prop_value;
3430     }
3431 nigel 77 }
3432     *code++ = OP_STAR + repeat_type;
3433     }
3434    
3435     /* Else insert an UPTO if the max is greater than the min, again
3436 nigel 93 preceded by the character, for the previously inserted code. If the
3437     UPTO is just for 1 instance, we can use QUERY instead. */
3438 nigel 77
3439     else if (repeat_max != repeat_min)
3440     {
3441     #ifdef SUPPORT_UTF8
3442     if (utf8 && c >= 128)
3443     {
3444     memcpy(code, utf8_char, c & 7);
3445     code += c & 7;
3446     }
3447     else
3448     #endif
3449     *code++ = c;
3450 nigel 87 if (prop_type >= 0)
3451     {
3452     *code++ = prop_type;
3453     *code++ = prop_value;
3454     }
3455 nigel 77 repeat_max -= repeat_min;
3456 nigel 93
3457     if (repeat_max == 1)
3458     {
3459     *code++ = OP_QUERY + repeat_type;
3460     }
3461     else
3462     {
3463     *code++ = OP_UPTO + repeat_type;
3464     PUT2INC(code, 0, repeat_max);
3465     }
3466 nigel 77 }
3467     }
3468    
3469     /* The character or character type itself comes last in all cases. */
3470    
3471     #ifdef SUPPORT_UTF8
3472     if (utf8 && c >= 128)
3473     {
3474     memcpy(code, utf8_char, c & 7);
3475     code += c & 7;
3476     }
3477     else
3478     #endif
3479     *code++ = c;
3480    
3481 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3482     define the required property. */
3483 nigel 77
3484     #ifdef SUPPORT_UCP
3485 nigel 87 if (prop_type >= 0)
3486     {
3487     *code++ = prop_type;
3488     *code++ = prop_value;
3489     }
3490 nigel 77 #endif
3491     }
3492    
3493     /* If previous was a character class or a back reference, we put the repeat
3494     stuff after it, but just skip the item if the repeat was {0,0}. */
3495    
3496     else if (*previous == OP_CLASS ||
3497     *previous == OP_NCLASS ||
3498     #ifdef SUPPORT_UTF8
3499     *previous == OP_XCLASS ||
3500     #endif
3501     *previous == OP_REF)
3502     {
3503     if (repeat_max == 0)
3504     {
3505     code = previous;
3506     goto END_REPEAT;
3507     }
3508    
3509     /* All real repeats make it impossible to handle partial matching (maybe
3510     one day we will be able to remove this restriction). */
3511    
3512     if (repeat_max != 1) cd->nopartial = TRUE;
3513    
3514     if (repeat_min == 0 && repeat_max == -1)
3515     *code++ = OP_CRSTAR + repeat_type;
3516     else if (repeat_min == 1 && repeat_max == -1)
3517     *code++ = OP_CRPLUS + repeat_type;
3518     else if (repeat_min == 0 && repeat_max == 1)
3519     *code++ = OP_CRQUERY + repeat_type;
3520     else
3521     {
3522     *code++ = OP_CRRANGE + repeat_type;
3523     PUT2INC(code, 0, repeat_min);
3524     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3525     PUT2INC(code, 0, repeat_max);
3526     }
3527     }
3528    
3529     /* If previous was a bracket group, we may have to replicate it in certain
3530     cases. */
3531    
3532 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3533     *previous == OP_ONCE || *previous == OP_COND)
3534 nigel 77 {
3535     register int i;
3536     int ketoffset = 0;
3537     int len = code - previous;
3538     uschar *bralink = NULL;
3539    
3540 nigel 93 /* Repeating a DEFINE group is pointless */
3541    
3542     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3543     {
3544     *errorcodeptr = ERR55;
3545     goto FAILED;
3546     }
3547    
3548 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3549     by scanning through from the start, and compute the offset back to it
3550     from the current code pointer. There may be an OP_OPT setting following
3551     the final KET, so we can't find the end just by going back from the code
3552     pointer. */
3553    
3554     if (repeat_max == -1)
3555     {
3556     register uschar *ket = previous;
3557     do ket += GET(ket, 1); while (*ket != OP_KET);
3558     ketoffset = code - ket;
3559     }
3560    
3561     /* The case of a zero minimum is special because of the need to stick
3562     OP_BRAZERO in front of it, and because the group appears once in the
3563     data, whereas in other cases it appears the minimum number of times. For
3564     this reason, it is simplest to treat this case separately, as otherwise
3565     the code gets far too messy. There are several special subcases when the
3566     minimum is zero. */
3567    
3568     if (repeat_min == 0)
3569     {
3570     /* If the maximum is also zero, we just omit the group from the output
3571     altogether. */
3572    
3573     if (repeat_max == 0)
3574     {
3575     code = previous;
3576     goto END_REPEAT;
3577     }
3578    
3579     /* If the maximum is 1 or unlimited, we just have to stick in the
3580     BRAZERO and do no more at this point. However, we do need to adjust
3581     any OP_RECURSE calls inside the group that refer to the group itself or
3582 nigel 93 any internal or forward referenced group, because the offset is from
3583     the start of the whole regex. Temporarily terminate the pattern while
3584     doing this. */
3585 nigel 77
3586     if (repeat_max <= 1)
3587     {
3588     *code = OP_END;
3589 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3590 nigel 77 memmove(previous+1, previous, len);
3591     code++;
3592     *previous++ = OP_BRAZERO + repeat_type;
3593     }
3594    
3595     /* If the maximum is greater than 1 and limited, we have to replicate
3596     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3597     The first one has to be handled carefully because it's the original
3598     copy, which has to be moved up. The remainder can be handled by code
3599     that is common with the non-zero minimum case below. We have to
3600     adjust the value of repeat_max, since one less copy is required. Once
3601     again, we may have to adjust any OP_RECURSE calls inside the group. */
3602    
3603     else
3604     {
3605     int offset;
3606     *code = OP_END;
3607 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3608 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3609     code += 2 + LINK_SIZE;
3610     *previous++ = OP_BRAZERO + repeat_type;
3611     *previous++ = OP_BRA;
3612    
3613     /* We chain together the bracket offset fields that have to be
3614     filled in later when the ends of the brackets are reached. */
3615    
3616     offset = (bralink == NULL)? 0 : previous - bralink;
3617     bralink = previous;
3618     PUTINC(previous, 0, offset);
3619     }
3620    
3621     repeat_max--;
3622     }
3623    
3624     /* If the minimum is greater than zero, replicate the group as many
3625     times as necessary, and adjust the maximum to the number of subsequent
3626     copies that we need. If we set a first char from the group, and didn't
3627 nigel 93 set a required char, copy the latter from the former. If there are any
3628     forward reference subroutine calls in the group, there will be entries on
3629     the workspace list; replicate these with an appropriate increment. */
3630 nigel 77
3631     else
3632     {
3633     if (repeat_min > 1)
3634     {
3635 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3636 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3637     potential integer overflow. */
3638 nigel 93
3639     if (lengthptr != NULL)
3640 ph10 202 {
3641     int delta = (repeat_min - 1)*length_prevgroup;
3642     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3643     (double)INT_MAX ||
3644     OFLOW_MAX - *lengthptr < delta)
3645     {
3646     *errorcodeptr = ERR20;
3647     goto FAILED;
3648     }
3649     *lengthptr += delta;
3650     }
3651 nigel 93
3652     /* This is compiling for real */
3653    
3654     else
3655 nigel 77 {
3656 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3657     for (i = 1; i < repeat_min; i++)
3658     {
3659     uschar *hc;
3660     uschar *this_hwm = cd->hwm;
3661     memcpy(code, previous, len);
3662     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3663     {
3664     PUT(cd->hwm, 0, GET(hc, 0) + len);
3665     cd->hwm += LINK_SIZE;
3666     }
3667     save_hwm = this_hwm;
3668     code += len;
3669     }
3670 nigel 77 }
3671     }
3672 nigel 93
3673 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3674     }
3675    
3676     /* This code is common to both the zero and non-zero minimum cases. If
3677     the maximum is limited, it replicates the group in a nested fashion,
3678     remembering the bracket starts on a stack. In the case of a zero minimum,
3679     the first one was set up above. In all cases the repeat_max now specifies
3680 nigel 93 the number of additional copies needed. Again, we must remember to
3681     replicate entries on the forward reference list. */
3682 nigel 77
3683     if (repeat_max >= 0)
3684     {
3685 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3686     just adjust the length as if we had. For each repetition we must add 1
3687     to the length for BRAZERO and for all but the last repetition we must
3688 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3689     paranoid checks to avoid integer overflow. */
3690 nigel 93
3691     if (lengthptr != NULL && repeat_max > 0)
3692 ph10 202 {
3693     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3694     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3695     if ((double)repeat_max *
3696     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3697     > (double)INT_MAX ||
3698     OFLOW_MAX - *lengthptr < delta)
3699     {
3700     *errorcodeptr = ERR20;
3701     goto FAILED;
3702     }
3703     *lengthptr += delta;
3704     }
3705 nigel 93
3706     /* This is compiling for real */
3707    
3708     else for (i = repeat_max - 1; i >= 0; i--)
3709 nigel 77 {
3710 nigel 93 uschar *hc;
3711     uschar *this_hwm = cd->hwm;
3712    
3713 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3714    
3715     /* All but the final copy start a new nesting, maintaining the
3716     chain of brackets outstanding. */
3717    
3718     if (i != 0)
3719     {
3720     int offset;
3721     *code++ = OP_BRA;
3722     offset = (bralink == NULL)? 0 : code - bralink;
3723     bralink = code;
3724     PUTINC(code, 0, offset);
3725     }
3726    
3727     memcpy(code, previous, len);
3728 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3729     {
3730     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3731     cd->hwm += LINK_SIZE;
3732     }
3733     save_hwm = this_hwm;
3734 nigel 77 code += len;
3735     }
3736    
3737     /* Now chain through the pending brackets, and fill in their length
3738     fields (which are holding the chain links pro tem). */
3739    
3740     while (bralink != NULL)
3741     {
3742     int oldlinkoffset;
3743     int offset = code - bralink + 1;
3744     uschar *bra = code - offset;
3745     oldlinkoffset = GET(bra, 1);
3746     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3747     *code++ = OP_KET;
3748     PUTINC(code, 0, offset);
3749     PUT(bra, 1, offset);
3750     }
3751     }
3752    
3753     /* If the maximum is unlimited, set a repeater in the final copy. We
3754     can't just offset backwards from the current code point, because we
3755     don't know if there's been an options resetting after the ket. The
3756 nigel 93 correct offset was computed above.
3757 nigel 77
3758 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3759     this group is a non-atomic one that could match an empty string. If so,
3760     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3761     that runtime checking can be done. [This check is also applied to
3762     atomic groups at runtime, but in a different way.] */
3763    
3764     else
3765     {
3766     uschar *ketcode = code - ketoffset;
3767     uschar *bracode = ketcode - GET(ketcode, 1);
3768     *ketcode = OP_KETRMAX + repeat_type;
3769     if (lengthptr == NULL && *bracode != OP_ONCE)
3770     {
3771     uschar *scode = bracode;
3772     do
3773     {
3774     if (could_be_empty_branch(scode, ketcode, utf8))
3775     {
3776     *bracode += OP_SBRA - OP_BRA;
3777     break;
3778     }
3779     scode += GET(scode, 1);
3780     }
3781     while (*scode == OP_ALT);
3782     }
3783     }
3784 nigel 77 }
3785    
3786     /* Else there's some kind of shambles */
3787    
3788     else
3789     {
3790     *errorcodeptr = ERR11;
3791     goto FAILED;
3792     }
3793    
3794 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3795     tests above succeeded, possessive_quantifier is TRUE. For some of the
3796     simpler opcodes, there is a special alternative opcode for this. For
3797     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3798     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3799     but the special opcodes can optimize it a bit. The repeated item starts at
3800     tempcode, not at previous, which might be the first part of a string whose
3801     (former) last char we repeated.
3802 nigel 77
3803 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3804     an 'upto' may follow. We skip over an 'exact' item, and then test the
3805     length of what remains before proceeding. */
3806    
3807 nigel 77 if (possessive_quantifier)
3808     {
3809 nigel 93 int len;
3810     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3811     *tempcode == OP_NOTEXACT)
3812     tempcode += _pcre_OP_lengths[*tempcode];
3813     len = code - tempcode;
3814     if (len > 0) switch (*tempcode)
3815     {
3816     case OP_STAR: *tempcode = OP_POSSTAR; break;
3817     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3818     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3819     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3820    
3821     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3822     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3823     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3824     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3825    
3826     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3827     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3828     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3829     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3830    
3831     default:
3832     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3833     code += 1 + LINK_SIZE;
3834     len += 1 + LINK_SIZE;
3835     tempcode[0] = OP_ONCE;
3836     *code++ = OP_KET;
3837     PUTINC(code, 0, len);
3838     PUT(tempcode, 1, len);
3839     break;
3840     }
3841 nigel 77 }
3842    
3843     /* In all cases we no longer have a previous item. We also set the
3844     "follows varying string" flag for subsequently encountered reqbytes if
3845     it isn't already set and we have just passed a varying length item. */
3846    
3847     END_REPEAT:
3848     previous = NULL;
3849     cd->req_varyopt |= reqvary;
3850     break;
3851    
3852    
3853 nigel 93 /* ===================================================================*/
3854     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3855     lookbehind or option setting or condition or all the other extended
3856     parenthesis forms. First deal with the specials; all are introduced by ?,
3857     and the appearance of any of them means that this is not a capturing
3858     group. */
3859 nigel 77
3860     case '(':
3861     newoptions = options;
3862     skipbytes = 0;
3863 nigel 93 bravalue = OP_CBRA;
3864     save_hwm = cd->hwm;
3865 ph10 180 reset_bracount = FALSE;
3866 nigel 77
3867     if (*(++ptr) == '?')
3868     {
3869 nigel 93 int i, set, unset, namelen;
3870 nigel 77 int *optset;
3871 nigel 93 const uschar *name;
3872     uschar *slot;
3873 nigel 77
3874     switch (*(++ptr))
3875     {
3876     case '#': /* Comment; skip to ket */
3877     ptr++;
3878 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3879     if (*ptr == 0)
3880     {
3881     *errorcodeptr = ERR18;
3882     goto FAILED;
3883     }
3884 nigel 77 continue;
3885    
3886 nigel 93
3887     /* ------------------------------------------------------------ */
3888 ph10 175 case '|': /* Reset capture count for each branch */
3889     reset_bracount = TRUE;
3890 ph10 180 /* Fall through */
3891 ph10 175
3892     /* ------------------------------------------------------------ */
3893 nigel 93 case ':': /* Non-capturing bracket */
3894 nigel 77 bravalue = OP_BRA;
3895     ptr++;
3896     break;
3897    
3898 nigel 93
3899     /* ------------------------------------------------------------ */
3900 nigel 77 case '(':
3901     bravalue = OP_COND; /* Conditional group */
3902    
3903 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3904     group), a name (referring to a named group), or 'R', referring to
3905     recursion. R<digits> and R&name are also permitted for recursion tests.
3906 nigel 77
3907 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3908     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3909    
3910     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3911     be the recursive thing or the name 'R' (and similarly for 'R' followed
3912     by digits), and (b) a number could be a name that consists of digits.
3913     In both cases, we look for a name first; if not found, we try the other
3914     cases. */
3915    
3916     /* For conditions that are assertions, check the syntax, and then exit
3917     the switch. This will take control down to where bracketed groups,
3918     including assertions, are processed. */
3919    
3920     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3921     break;
3922    
3923     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3924     below), and all need to skip 3 bytes at the start of the group. */
3925    
3926     code[1+LINK_SIZE] = OP_CREF;
3927     skipbytes = 3;
3928 ph10 172 refsign = -1;
3929 nigel 93
3930     /* Check for a test for recursion in a named group. */
3931    
3932     if (ptr[1] == 'R' && ptr[2] == '&')
3933 nigel 77 {
3934 nigel 93 terminator = -1;
3935     ptr += 2;
3936     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3937     }
3938 nigel 91
3939 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3940     syntax (?(<name>) or (?('name') */
3941 nigel 91
3942 nigel 93 else if (ptr[1] == '<')
3943     {
3944     terminator = '>';
3945     ptr++;
3946     }
3947     else if (ptr[1] == '\'')
3948     {
3949     terminator = '\'';
3950     ptr++;
3951     }
3952 ph10 172 else
3953 ph10 167 {
3954     terminator = 0;
3955 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3956     }
3957 nigel 77
3958 nigel 93 /* We now expect to read a name; anything else is an error */
3959 nigel 77
3960 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3961     {
3962     ptr += 1; /* To get the right offset */
3963     *errorcodeptr = ERR28;
3964     goto FAILED;
3965     }
3966    
3967     /* Read the name, but also get it as a number if it's all digits */
3968    
3969     recno = 0;
3970     name = ++ptr;
3971     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3972     {
3973     if (recno >= 0)
3974     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3975     recno * 10 + *ptr - '0' : -1;
3976 nigel 91 ptr++;
3977 nigel 93 }
3978     namelen = ptr - name;
3979 nigel 91
3980 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3981     {
3982     ptr--; /* Error offset */
3983     *errorcodeptr = ERR26;
3984     goto FAILED;
3985     }
3986 nigel 91
3987 nigel 93 /* Do no further checking in the pre-compile phase. */
3988 nigel 91
3989 nigel 93 if (lengthptr != NULL) break;
3990 nigel 91
3991 nigel 93 /* In the real compile we do the work of looking for the actual
3992 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3993     be digits, in which case recno will be set. */
3994 ph10 172
3995 ph10 167 if (refsign > 0)
3996     {
3997     if (recno <= 0)
3998     {
3999     *errorcodeptr = ERR58;
4000     goto FAILED;
4001 ph10 172 }
4002 ph10 167 if (refsign == '-')
4003     {
4004 ph10 172 recno = cd->bracount - recno + 1;
4005 ph10 167 if (recno <= 0)
4006     {
4007     *errorcodeptr = ERR15;
4008     goto FAILED;
4009 ph10 172 }
4010 ph10 167 }
4011 ph10 172 else recno += cd->bracount;
4012 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
4013     break;
4014 ph10 172 }
4015 nigel 91
4016 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
4017     name. */
4018 ph10 172
4019 nigel 93 slot = cd->name_table;
4020     for (i = 0; i < cd->names_found; i++)
4021     {
4022     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4023     slot += cd->name_entry_size;
4024     }
4025 nigel 91
4026 nigel 93 /* Found a previous named subpattern */
4027 nigel 91
4028 nigel 93 if (i < cd->names_found)
4029     {
4030     recno = GET2(slot, 0);
4031     PUT2(code, 2+LINK_SIZE, recno);
4032     }
4033 nigel 91
4034 nigel 93 /* Search the pattern for a forward reference */
4035 nigel 91
4036 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4037     (options & PCRE_EXTENDED) != 0)) > 0)
4038     {
4039     PUT2(code, 2+LINK_SIZE, i);
4040     }
4041 nigel 91
4042 nigel 93 /* If terminator == 0 it means that the name followed directly after
4043     the opening parenthesis [e.g. (?(abc)...] and in this case there are
4044     some further alternatives to try. For the cases where terminator != 0
4045     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4046     now checked all the possibilities, so give an error. */
4047 nigel 91
4048 nigel 93 else if (terminator != 0)
4049     {
4050     *errorcodeptr = ERR15;
4051     goto FAILED;
4052     }
4053    
4054     /* Check for (?(R) for recursion. Allow digits after R to specify a
4055     specific group number. */
4056    
4057     else if (*name == 'R')
4058     {
4059     recno = 0;
4060     for (i = 1; i < namelen; i++)
4061 nigel 91 {
4062 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
4063     {
4064     *errorcodeptr = ERR15;
4065     goto FAILED;
4066     }
4067     recno = recno * 10 + name[i] - '0';
4068 nigel 77 }
4069 nigel 93 if (recno == 0) recno = RREF_ANY;
4070     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4071     PUT2(code, 2+LINK_SIZE, recno);
4072 nigel 77 }
4073 nigel 91
4074 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
4075     false. */
4076 nigel 91
4077 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4078     {
4079     code[1+LINK_SIZE] = OP_DEF;
4080     skipbytes = 1;
4081     }
4082    
4083     /* Check for the "name" actually being a subpattern number. */
4084    
4085     else if (recno > 0)
4086     {
4087     PUT2(code, 2+LINK_SIZE, recno);
4088     }
4089    
4090     /* Either an unidentified subpattern, or a reference to (?(0) */
4091    
4092     else
4093     {
4094     *errorcodeptr = (recno == 0)? ERR35: ERR15;
4095     goto FAILED;
4096     }
4097 nigel 77 break;
4098    
4099 nigel 93
4100     /* ------------------------------------------------------------ */
4101 nigel 77 case '=': /* Positive lookahead */
4102     bravalue = OP_ASSERT;
4103     ptr++;
4104     break;
4105    
4106 nigel 93
4107     /* ------------------------------------------------------------ */
4108 nigel 77 case '!': /* Negative lookahead */
4109     bravalue = OP_ASSERT_NOT;
4110     ptr++;
4111     break;
4112    
4113 nigel 93
4114     /* ------------------------------------------------------------ */
4115     case '<': /* Lookbehind or named define */
4116     switch (ptr[1])
4117 nigel 77 {
4118     case '=': /* Positive lookbehind */
4119     bravalue = OP_ASSERTBACK;
4120 nigel 93 ptr += 2;
4121