/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 211 - (hide annotations) (download)
Thu Aug 9 09:52:43 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 187286 byte(s)
Update UTF-8 validity check and documentation.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. The bitmap a is treated
as an array of bytes holding 8 bits each; bit number b is set. Both arguments
and the whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) select the intended byte and bit. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
/* Lookup table used by check_escape(). The ASCII table is indexed by
(c - '0') and covers '0' to 'z'; the EBCDIC table is indexed by (c - 0x48) and
each of its rows is labelled with the code point of its first entry. A zero
entry makes check_escape() fall through to its switch for further processing;
any non-zero entry is returned as-is. */
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
164 nigel 77 /* Tables of names of POSIX character classes and their lengths. The list is
165 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
166 nigel 77 as this is assumed for handling case independence. */
167    
/* Names of the POSIX character classes, one string per class. The first three
entries must stay alpha, lower, upper (this is assumed when handling case
independence). */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-level stringification: XSTRING(x) expands x first and then turns the
result into a string literal, so that numeric limits can be spliced into the
messages below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. These are
"char *" because they are passed to the outside world. Do not ever re-use any
error number, because they are documented. Always add a new error instead.
Messages marked DEAD below are no longer used. The array of pointers itself is
const (like posix_names) because nothing modifies the table after
initialization. */

static const char *const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized"
};
287    
288    
289     /* Table to identify digits and hex digits. This is used when compiling
290     patterns. Note that the tables in chartables are dependent on the locale, and
291     may mark arbitrary characters as digits - but the PCRE compiling code expects
292     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
293     a private table here. It costs 256 bytes, but it is a lot faster than doing
294     character value tests (at least in some simple cases I timed), and in some
295     applications one wants PCRE to compile efficiently as well as match
296     efficiently.
297    
298     For convenience, we use the same bit definitions as in chartables:
299    
300     0x04 decimal digit
301     0x08 hexadecimal digit
302    
303     Then we can use ctype_digit and ctype_xdigit in the code. */
304    
/* digitab[] is indexed by character value (256 entries). An entry of 0x0c
marks a decimal digit (0x04 | 0x08, i.e. it is also a hex digit); an entry of
0x08 marks a letter that is a hex digit only (A-F, a-f). All other entries are
zero. The EBCDIC build additionally defines ebcdic_chartab[], which
check_escape() tests with the mask 0x0E to decide whether a character is
alphameric. */
305 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
306 nigel 77 static const unsigned char digitab[] =
307     {
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
309     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
314     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
315     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
316     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
320     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
322     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
334     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
340    
341 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
342 nigel 77 static const unsigned char digitab[] =
343     {
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
355 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
356 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
360     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
361     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
362     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
363     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
368     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
374     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
375     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
376    
377     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
378     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
379     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
380     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
382     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
386     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
387     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
389 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
390 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
391     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
394     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
395     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
396     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
397     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
398     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
399     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
400     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
401     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
402     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
403     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
404     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
405     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
406     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
407     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
408     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
409     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
410     #endif
411    
412    
413     /* Definition to allow mutual recursion */
414    
415     static BOOL
416 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
417 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
418 nigel 77
419    
420    
421     /*************************************************
422     * Handle escapes *
423     *************************************************/
424    
425     /* This function is called when a \ has been encountered. It either returns a
426     positive value for a simple escape such as \n, or a negative value which
427 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
428     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
429     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
430     ptr is pointing at the \. On exit, it is on the final character of the escape
431     sequence.
432 nigel 77
433     Arguments:
434     ptrptr points to the pattern position pointer
435     errorcodeptr points to the errorcode variable
436     bracount number of previous extracting brackets
437     options the options bits
438     isclass TRUE if inside a character class
439    
440     Returns: zero or positive => a data character
441     negative => a special escape sequence
442     on error, *errorcodeptr is set
443     */
444    
/* check_escape: decode the escape sequence whose backslash is at *ptrptr.
Characters outside the alphameric range are returned as literals; alphameric
characters are first looked up in escapes[], and only those that the table maps
to zero reach the switch below. On the normal exit path *ptrptr is updated to
the last character consumed (this also happens for errors that are reported by
just setting *errorcodeptr). The three early "return 0" error exits (ERR57,
ERR15, ERR2) leave *ptrptr unchanged. */
445     static int
446     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
447     int options, BOOL isclass)
448     {
449 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
450     const uschar *ptr = *ptrptr + 1;
451 nigel 77 int c, i;
452    
453 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
454     ptr--; /* Set pointer back to the last byte */
455    
456 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
457    
458     if (c == 0) *errorcodeptr = ERR1;
459    
460     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
461     a table. A non-zero result is something that can be returned immediately.
462     Otherwise further processing may be required. */
463    
464 ph10 97 #ifndef EBCDIC /* ASCII coding */
465 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
466     else if ((i = escapes[c - '0']) != 0) c = i;
467    
468 ph10 97 #else /* EBCDIC coding */
469 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
470     else if ((i = escapes[c - 0x48]) != 0) c = i;
471     #endif
472    
473     /* Escapes that need further processing, or are illegal. */
/* NOTE: this branch is reached only when the table lookup above stored zero in
i. The octal (case '0') and plain \x loops below use "i++ < 2" as their digit
limit and therefore depend on i still being zero when they start; nothing in
between modifies it. */
474    
475     else
476     {
477     const uschar *oldptr;
478 nigel 93 BOOL braced, negated;
479    
480 nigel 77 switch (c)
481     {
482     /* A number of Perl escapes are not handled by PCRE. We give an explicit
483     error. */
484    
485     case 'l':
486     case 'L':
487     case 'N':
488     case 'u':
489     case 'U':
490     *errorcodeptr = ERR37;
491     break;
492    
493 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
494     is an absolute backreference. If negative, it is a relative backreference.
495 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
496     reference to a named group. This is part of Perl's movement towards a
497     unified syntax for back references. As this is synonymous with \k{name}, we
498 ph10 171 fudge it up by pretending it really was \k. */
499 nigel 93
500     case 'g':
501     if (ptr[1] == '{')
502     {
/* Scan ahead: if anything other than '-' or a decimal digit appears before
the closing brace, treat the braced text as a name. */
503 ph10 171 const uschar *p;
504     for (p = ptr+2; *p != 0 && *p != '}'; p++)
505     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
506 ph10 172 if (*p != 0 && *p != '}')
507 ph10 171 {
508     c = -ESC_k;
509     break;
510 ph10 172 }
511 nigel 93 braced = TRUE;
512     ptr++;
513     }
514     else braced = FALSE;
515    
516     if (ptr[1] == '-')
517     {
518     negated = TRUE;
519     ptr++;
520     }
521     else negated = FALSE;
522    
523     c = 0;
524     while ((digitab[ptr[1]] & ctype_digit) != 0)
525     c = c * 10 + *(++ptr) - '0';
526    
/* A zero (or missing) number, or a missing closing brace, is ERR57. */
527     if (c == 0 || (braced && *(++ptr) != '}'))
528     {
529     *errorcodeptr = ERR57;
530     return 0;
531     }
532    
533     if (negated)
534     {
535     if (c > bracount)
536     {
537     *errorcodeptr = ERR15;
538     return 0;
539     }
/* Convert the relative count into an absolute group number: -1 refers to
group number bracount, -2 to bracount - 1, and so on. */
540     c = bracount - (c - 1);
541     }
542    
543     c = -(ESC_REF + c);
544     break;
545    
546 nigel 77 /* The handling of escape sequences consisting of a string of digits
547     starting with one that is not zero is not straightforward. By experiment,
548     the way Perl works seems to be as follows:
549    
550     Outside a character class, the digits are read as a decimal number. If the
551     number is less than 10, or if there are that many previous extracting
552     left brackets, then it is a back reference. Otherwise, up to three octal
553     digits are read to form an escaped byte. Thus \123 is likely to be octal
554     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
555     value is greater than 377, the least significant 8 bits are taken. Inside a
556     character class, \ followed by a digit is always an octal number. */
557    
558     case '1': case '2': case '3': case '4': case '5':
559     case '6': case '7': case '8': case '9':
560    
561     if (!isclass)
562     {
563     oldptr = ptr;
564     c -= '0';
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567     if (c < 10 || c <= bracount)
568     {
569     c = -(ESC_REF + c);
570     break;
571     }
572     ptr = oldptr; /* Put the pointer back and fall through */
573     }
574    
575     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
576     generates a binary zero byte and treats the digit as a following literal.
577     Thus we have to pull back the pointer by one. */
578    
579     if ((c = *ptr) >= '8')
580     {
581     ptr--;
582     c = 0;
583     break;
584     }
585    
586     /* \0 always starts an octal number, but we may drop through to here with a
587 nigel 91 larger first octal digit. The original code used just to take the least
588     significant 8 bits of octal numbers (I think this is what early Perls used
589     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
590     than 3 octal digits. */
591 nigel 77
592     case '0':
593     c -= '0';
594     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
595     c = c * 8 + *(++ptr) - '0';
/* NOTE: outside UTF-8 mode a value above 255 is rejected with ERR51 rather
than being truncated to its low 8 bits. */
596 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
597 nigel 77 break;
598    
599 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
600     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
601     treated as a data character. */
602 nigel 77
603     case 'x':
604 nigel 87 if (ptr[1] == '{')
605 nigel 77 {
606     const uschar *pt = ptr + 2;
607 nigel 87 int count = 0;
608    
609 nigel 77 c = 0;
610     while ((digitab[*pt] & ctype_xdigit) != 0)
611     {
612 nigel 87 register int cc = *pt++;
613     if (c == 0 && cc == '0') continue; /* Leading zeroes */
614 nigel 77 count++;
615 nigel 87
616 ph10 97 #ifndef EBCDIC /* ASCII coding */
617 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
618 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
619 ph10 97 #else /* EBCDIC coding */
620 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
621 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
622 nigel 77 #endif
623     }
624 nigel 87
625 nigel 77 if (*pt == '}')
626     {
/* count is the number of significant hex digits (leading zeroes are skipped
above): at most 8 are accepted in UTF-8 mode and 2 otherwise. The c < 0 test
presumably catches a value that has spilled into the sign bit - TODO confirm. */
627 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
628 nigel 77 ptr = pt;
629     break;
630     }
631 nigel 87
632 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
633     recognize this construct; fall through to the normal \x handling. */
634     }
635    
636 nigel 87 /* Read just a single-byte hex-defined char */
637 nigel 77
638     c = 0;
639     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
640     {
641     int cc; /* Some compilers don't like ++ */
642     cc = *(++ptr); /* in initializers */
643 ph10 97 #ifndef EBCDIC /* ASCII coding */
644 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
645     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
646 ph10 97 #else /* EBCDIC coding */
647 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
648     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
649     #endif
650     }
651     break;
652    
653 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
654     This coding is ASCII-specific, but then the whole concept of \cx is
655     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
656 nigel 77
657     case 'c':
658     c = *(++ptr);
659     if (c == 0)
660     {
661     *errorcodeptr = ERR2;
662     return 0;
663     }
664    
665 ph10 97 #ifndef EBCDIC /* ASCII coding */
666 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
667     c ^= 0x40;
668 ph10 97 #else /* EBCDIC coding */
669 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
670     c ^= 0xC0;
671     #endif
672     break;
673    
674     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
675     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
676     for Perl compatibility, it is a literal. This code looks a bit odd, but
677     there used to be some cases other than the default, and there may be again
678     in future, so I haven't "optimized" it. */
679    
680     default:
681     if ((options & PCRE_EXTRA) != 0) switch(c)
682     {
683     default:
684     *errorcodeptr = ERR3;
685     break;
686     }
687     break;
688     }
689     }
690    
/* Common exit: publish the final pattern position and return the decoded
value (a literal character, or a negative escape code). */
691     *ptrptr = ptr;
692     return c;
693     }
694    
695    
696    
697     #ifdef SUPPORT_UCP
698     /*************************************************
699     * Handle \P and \p *
700     *************************************************/
701    
702     /* This function is called after \P or \p has been encountered, provided that
703     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
704     pointing at the P or p. On exit, it is pointing at the final character of the
705     escape sequence.
706    
707     Argument:
708     ptrptr points to the pattern position pointer
709     negptr points to a boolean that is set TRUE for negation else FALSE
710 nigel 87 dptr points to an int that is set to the detailed property value
711 nigel 77 errorcodeptr points to the error code variable
712    
713 nigel 87 Returns: type value from the _pcre_utt table, or -1 on error (*errorcodeptr is set)
714 nigel 77 */
715    
716     static int
717 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
718 nigel 77 {
719     int c, i, bot, top;
720     const uschar *ptr = *ptrptr;
721 nigel 87 char name[32];
722 nigel 77
723     c = *(++ptr);
724     if (c == 0) goto ERROR_RETURN;
725    
726     *negptr = FALSE;
727    
728 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
729     negation. */
730 nigel 77
731     if (c == '{')
732     {
733     if (ptr[1] == '^')
734     {
735     *negptr = TRUE;
736     ptr++;
737     }
738 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
739 nigel 77 {
740     c = *(++ptr);
741     if (c == 0) goto ERROR_RETURN;
742     if (c == '}') break;
743     name[i] = c;
744     }
745 nigel 87 if (c !='}') goto ERROR_RETURN;
746 nigel 77 name[i] = 0;
747     }
748    
749     /* Otherwise there is just one following character */
750    
751     else
752     {
753     name[0] = c;
754     name[1] = 0;
755     }
756    
757     *ptrptr = ptr;
758    
759     /* Search for a recognized property name using binary chop */
760    
761     bot = 0;
762     top = _pcre_utt_size;
763    
764     while (bot < top)
765     {
766 nigel 87 i = (bot + top) >> 1;
767 nigel 77 c = strcmp(name, _pcre_utt[i].name);
768 nigel 87 if (c == 0)
769     {
770     *dptr = _pcre_utt[i].value;
771     return _pcre_utt[i].type;
772     }
773 nigel 77 if (c > 0) bot = i + 1; else top = i;
774     }
775    
776     *errorcodeptr = ERR47;
777     *ptrptr = ptr;
778     return -1;
779    
780     ERROR_RETURN:
781     *errorcodeptr = ERR46;
782     *ptrptr = ptr;
783     return -1;
784     }
785     #endif
786    
787    
788    
789    
790     /*************************************************
791     * Check for counted repeat *
792     *************************************************/
793    
794     /* This function is called when a '{' is encountered in a place where it might
795     start a quantifier. It looks ahead to see if it really is a quantifier or not.
796     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
797     where the ddds are digits.
798    
799     Arguments:
800     p pointer to the first char after '{'
801    
802     Returns: TRUE or FALSE
803     */
804    
805     static BOOL
806     is_counted_repeat(const uschar *p)
807     {
808     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
809     while ((digitab[*p] & ctype_digit) != 0) p++;
810     if (*p == '}') return TRUE;
811    
812     if (*p++ != ',') return FALSE;
813     if (*p == '}') return TRUE;
814    
815     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
816     while ((digitab[*p] & ctype_digit) != 0) p++;
817    
818     return (*p == '}');
819     }
820    
821    
822    
823     /*************************************************
824     * Read repeat counts *
825     *************************************************/
826    
827     /* Read an item of the form {n,m} and return the values. This is called only
828     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
829     so the syntax is guaranteed to be correct, but we need to check the values.
830    
831     Arguments:
832     p pointer to first char after '{'
833     minp pointer to int for min
834     maxp pointer to int for max
835     returned as -1 if no max
836     errorcodeptr points to error code variable
837    
838     Returns: pointer to '}' on success;
839     current ptr on error, with errorcodeptr set non-zero
840     */
841    
842     static const uschar *
843     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
844     {
845     int min = 0;
846     int max = -1;
847    
848 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
849     an integer overflow. */
850    
851 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
852 nigel 81 if (min < 0 || min > 65535)
853     {
854     *errorcodeptr = ERR5;
855     return p;
856     }
857 nigel 77
858 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
859     Also, max must not be less than min. */
860    
861 nigel 77 if (*p == '}') max = min; else
862     {
863     if (*(++p) != '}')
864     {
865     max = 0;
866     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
867 nigel 81 if (max < 0 || max > 65535)
868     {
869     *errorcodeptr = ERR5;
870     return p;
871     }
872 nigel 77 if (max < min)
873     {
874     *errorcodeptr = ERR4;
875     return p;
876     }
877     }
878     }
879    
880 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
881     '}'. */
882 nigel 77
883 nigel 81 *minp = min;
884     *maxp = max;
885 nigel 77 return p;
886     }
887    
888    
889    
890     /*************************************************
891 nigel 93 * Find forward referenced subpattern *
892 nigel 91 *************************************************/
893    
894 nigel 93 /* This function scans along a pattern's text looking for capturing
895     subpatterns, and counting them. If it finds a named pattern that matches the
896     name it is given, it returns its number. Alternatively, if the name is NULL, it
897     returns when it reaches a given numbered subpattern. This is used for forward
898     references to subpatterns. We know that if (?P< is encountered, the name will
899     be terminated by '>' because that is checked in the first pass.
900 nigel 91
901     Arguments:
902 nigel 93 ptr current position in the pattern
903     count current count of capturing parens so far encountered
904     name name to seek, or NULL if seeking a numbered subpattern
905     lorn name length, or subpattern number if name is NULL
906     xmode TRUE if we are in /x mode
907 nigel 91
908     Returns: the number of the named subpattern, or -1 if not found
909     */
910    
911     static int
912 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
913     BOOL xmode)
914 nigel 91 {
915     const uschar *thisname;
916 nigel 93
917 nigel 91 for (; *ptr != 0; ptr++)
918     {
919 nigel 93 int term;
920    
921     /* Skip over backslashed characters and also entire \Q...\E */
922    
923     if (*ptr == '\\')
924     {
925     if (*(++ptr) == 0) return -1;
926     if (*ptr == 'Q') for (;;)
927     {
928     while (*(++ptr) != 0 && *ptr != '\\');
929     if (*ptr == 0) return -1;
930     if (*(++ptr) == 'E') break;
931     }
932     continue;
933     }
934    
935     /* Skip over character classes */
936    
937     if (*ptr == '[')
938     {
939     while (*(++ptr) != ']')
940     {
941     if (*ptr == '\\')
942     {
943     if (*(++ptr) == 0) return -1;
944     if (*ptr == 'Q') for (;;)
945     {
946     while (*(++ptr) != 0 && *ptr != '\\');
947     if (*ptr == 0) return -1;
948     if (*(++ptr) == 'E') break;
949     }
950     continue;
951     }
952     }
953     continue;
954     }
955    
956     /* Skip comments in /x mode */
957    
958     if (xmode && *ptr == '#')
959     {
960     while (*(++ptr) != 0 && *ptr != '\n');
961     if (*ptr == 0) return -1;
962     continue;
963     }
964    
965     /* An opening parens must now be a real metacharacter */
966    
967 nigel 91 if (*ptr != '(') continue;
968 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
969 nigel 93 {
970     count++;
971     if (name == NULL && count == lorn) return count;
972     continue;
973     }
974    
975     ptr += 2;
976     if (*ptr == 'P') ptr++; /* Allow optional P */
977    
978     /* We have to disambiguate (?<! and (?<= from (?<name> */
979    
980     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
981     *ptr != '\'')
982     continue;
983    
984 nigel 91 count++;
985 nigel 93
986     if (name == NULL && count == lorn) return count;
987     term = *ptr++;
988     if (term == '<') term = '>';
989 nigel 91 thisname = ptr;
990 nigel 93 while (*ptr != term) ptr++;
991     if (name != NULL && lorn == ptr - thisname &&
992     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
993 nigel 91 return count;
994     }
995 nigel 93
996 nigel 91 return -1;
997     }
998    
999    
1000    
1001     /*************************************************
1002 nigel 77 * Find first significant op code *
1003     *************************************************/
1004    
1005     /* This is called by several functions that scan a compiled expression looking
1006     for a fixed first character, or an anchoring op code etc. It skips over things
1007     that do not influence this. For some calls, a change of option is important.
1008     For some calls, it makes sense to skip negative forward and all backward
1009     assertions, and also the \b assertion; for others it does not.
1010    
1011     Arguments:
1012     code pointer to the start of the group
1013     options pointer to external options
1014     optbit the option bit whose changing is significant, or
1015     zero if none are
1016     skipassert TRUE if certain assertions are to be skipped
1017    
1018     Returns: pointer to the first significant opcode
1019     */
1020    
1021     static const uschar*
1022     first_significant_code(const uschar *code, int *options, int optbit,
1023     BOOL skipassert)
1024     {
1025     for (;;)
1026     {
1027     switch ((int)*code)
1028     {
1029     case OP_OPT:
1030     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1031     *options = (int)code[1];
1032     code += 2;
1033     break;
1034    
1035     case OP_ASSERT_NOT:
1036     case OP_ASSERTBACK:
1037     case OP_ASSERTBACK_NOT:
1038     if (!skipassert) return code;
1039     do code += GET(code, 1); while (*code == OP_ALT);
1040     code += _pcre_OP_lengths[*code];
1041     break;
1042    
1043     case OP_WORD_BOUNDARY:
1044     case OP_NOT_WORD_BOUNDARY:
1045     if (!skipassert) return code;
1046     /* Fall through */
1047    
1048     case OP_CALLOUT:
1049     case OP_CREF:
1050 nigel 93 case OP_RREF:
1051     case OP_DEF:
1052 nigel 77 code += _pcre_OP_lengths[*code];
1053     break;
1054    
1055     default:
1056     return code;
1057     }
1058     }
1059     /* Control never reaches here */
1060     }
1061    
1062    
1063    
1064    
1065     /*************************************************
1066     * Find the fixed length of a pattern *
1067     *************************************************/
1068    
1069     /* Scan a pattern and compute the fixed length of subject that will match it,
1070     if the length is fixed. This is needed for dealing with backward assertions.
1071     In UTF8 mode, the result is in characters rather than bytes.
1072    
1073     Arguments:
1074     code points to the start of the pattern (the bracket)
1075     options the compiling options
1076    
1077     Returns: the fixed length, or -1 if there is no fixed length,
1078     or -2 if \C was encountered
1079     */
1080    
1081     static int
1082     find_fixedlength(uschar *code, int options)
1083     {
1084     int length = -1;
1085    
1086     register int branchlength = 0;
1087     register uschar *cc = code + 1 + LINK_SIZE;
1088    
1089     /* Scan along the opcodes for this branch. If we get to the end of the
1090     branch, check the length against that of the other branches. */
1091    
1092     for (;;)
1093     {
1094     int d;
1095     register int op = *cc;
1096    
1097     switch (op)
1098     {
1099 nigel 93 case OP_CBRA:
1100 nigel 77 case OP_BRA:
1101     case OP_ONCE:
1102     case OP_COND:
1103 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1104 nigel 77 if (d < 0) return d;
1105     branchlength += d;
1106     do cc += GET(cc, 1); while (*cc == OP_ALT);
1107     cc += 1 + LINK_SIZE;
1108     break;
1109    
1110     /* Reached end of a branch; if it's a ket it is the end of a nested
1111     call. If it's ALT it is an alternation in a nested call. If it is
1112     END it's the end of the outer call. All can be handled by the same code. */
1113    
1114     case OP_ALT:
1115     case OP_KET:
1116     case OP_KETRMAX:
1117     case OP_KETRMIN:
1118     case OP_END:
1119     if (length < 0) length = branchlength;
1120     else if (length != branchlength) return -1;
1121     if (*cc != OP_ALT) return length;
1122     cc += 1 + LINK_SIZE;
1123     branchlength = 0;
1124     break;
1125    
1126     /* Skip over assertive subpatterns */
1127    
1128     case OP_ASSERT:
1129     case OP_ASSERT_NOT:
1130     case OP_ASSERTBACK:
1131     case OP_ASSERTBACK_NOT:
1132     do cc += GET(cc, 1); while (*cc == OP_ALT);
1133     /* Fall through */
1134    
1135     /* Skip over things that don't match chars */
1136    
1137     case OP_REVERSE:
1138     case OP_CREF:
1139 nigel 93 case OP_RREF:
1140     case OP_DEF:
1141 nigel 77 case OP_OPT:
1142     case OP_CALLOUT:
1143     case OP_SOD:
1144     case OP_SOM:
1145     case OP_EOD:
1146     case OP_EODN:
1147     case OP_CIRC:
1148     case OP_DOLL:
1149     case OP_NOT_WORD_BOUNDARY:
1150     case OP_WORD_BOUNDARY:
1151     cc += _pcre_OP_lengths[*cc];
1152     break;
1153    
1154     /* Handle literal characters */
1155    
1156     case OP_CHAR:
1157     case OP_CHARNC:
1158 nigel 91 case OP_NOT:
1159 nigel 77 branchlength++;
1160     cc += 2;
1161     #ifdef SUPPORT_UTF8
1162     if ((options & PCRE_UTF8) != 0)
1163     {
1164     while ((*cc & 0xc0) == 0x80) cc++;
1165     }
1166     #endif
1167     break;
1168    
1169     /* Handle exact repetitions. The count is already in characters, but we
1170     need to skip over a multibyte character in UTF8 mode. */
1171    
1172     case OP_EXACT:
1173     branchlength += GET2(cc,1);
1174     cc += 4;
1175     #ifdef SUPPORT_UTF8
1176     if ((options & PCRE_UTF8) != 0)
1177     {
1178     while((*cc & 0x80) == 0x80) cc++;
1179     }
1180     #endif
1181     break;
1182    
1183     case OP_TYPEEXACT:
1184     branchlength += GET2(cc,1);
1185     cc += 4;
1186     break;
1187    
1188     /* Handle single-char matchers */
1189    
1190     case OP_PROP:
1191     case OP_NOTPROP:
1192 nigel 87 cc += 2;
1193 nigel 77 /* Fall through */
1194    
1195     case OP_NOT_DIGIT:
1196     case OP_DIGIT:
1197     case OP_NOT_WHITESPACE:
1198     case OP_WHITESPACE:
1199     case OP_NOT_WORDCHAR:
1200     case OP_WORDCHAR:
1201     case OP_ANY:
1202     branchlength++;
1203     cc++;
1204     break;
1205    
1206     /* The single-byte matcher isn't allowed */
1207    
1208     case OP_ANYBYTE:
1209     return -2;
1210    
1211     /* Check a class for variable quantification */
1212    
1213     #ifdef SUPPORT_UTF8
1214     case OP_XCLASS:
1215     cc += GET(cc, 1) - 33;
1216     /* Fall through */
1217     #endif
1218    
1219     case OP_CLASS:
1220     case OP_NCLASS:
1221     cc += 33;
1222    
1223     switch (*cc)
1224     {
1225     case OP_CRSTAR:
1226     case OP_CRMINSTAR:
1227     case OP_CRQUERY:
1228     case OP_CRMINQUERY:
1229     return -1;
1230    
1231     case OP_CRRANGE:
1232     case OP_CRMINRANGE:
1233     if (GET2(cc,1) != GET2(cc,3)) return -1;
1234     branchlength += GET2(cc,1);
1235     cc += 5;
1236     break;
1237    
1238     default:
1239     branchlength++;
1240     }
1241     break;
1242    
1243     /* Anything else is variable length */
1244    
1245     default:
1246     return -1;
1247     }
1248     }
1249     /* Control never gets here */
1250     }
1251    
1252    
1253    
1254    
1255     /*************************************************
1256     * Scan compiled regex for numbered bracket *
1257     *************************************************/
1258    
1259     /* This little function scans through a compiled pattern until it finds a
1260     capturing bracket with the given number.
1261    
1262     Arguments:
1263     code points to start of expression
1264     utf8 TRUE in UTF-8 mode
1265     number the required bracket number
1266    
1267     Returns: pointer to the opcode for the bracket, or NULL if not found
1268     */
1269    
1270     static const uschar *
1271     find_bracket(const uschar *code, BOOL utf8, int number)
1272     {
1273     for (;;)
1274     {
1275     register int c = *code;
1276     if (c == OP_END) return NULL;
1277 nigel 91
1278     /* XCLASS is used for classes that cannot be represented just by a bit
1279     map. This includes negated single high-valued characters. The length in
1280     the table is zero; the actual length is stored in the compiled code. */
1281    
1282     if (c == OP_XCLASS) code += GET(code, 1);
1283    
1284 nigel 93 /* Handle capturing bracket */
1285 nigel 91
1286 nigel 93 else if (c == OP_CBRA)
1287 nigel 77 {
1288 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1289 nigel 77 if (n == number) return (uschar *)code;
1290 nigel 93 code += _pcre_OP_lengths[c];
1291 nigel 77 }
1292 nigel 91
1293 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1294     a multi-byte character. The length in the table is a minimum, so we have to
1295     arrange to skip the extra bytes. */
1296 nigel 91
1297 nigel 77 else
1298     {
1299     code += _pcre_OP_lengths[c];
1300 ph10 107 #ifdef SUPPORT_UTF8
1301 nigel 77 if (utf8) switch(c)
1302     {
1303     case OP_CHAR:
1304     case OP_CHARNC:
1305     case OP_EXACT:
1306     case OP_UPTO:
1307     case OP_MINUPTO:
1308 nigel 93 case OP_POSUPTO:
1309 nigel 77 case OP_STAR:
1310     case OP_MINSTAR:
1311 nigel 93 case OP_POSSTAR:
1312 nigel 77 case OP_PLUS:
1313     case OP_MINPLUS:
1314 nigel 93 case OP_POSPLUS:
1315 nigel 77 case OP_QUERY:
1316     case OP_MINQUERY:
1317 nigel 93 case OP_POSQUERY:
1318     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1319 nigel 77 break;
1320     }
1321 ph10 111 #endif
1322 nigel 77 }
1323     }
1324     }
1325    
1326    
1327    
1328     /*************************************************
1329     * Scan compiled regex for recursion reference *
1330     *************************************************/
1331    
1332     /* This little function scans through a compiled pattern until it finds an
1333     instance of OP_RECURSE.
1334    
1335     Arguments:
1336     code points to start of expression
1337     utf8 TRUE in UTF-8 mode
1338    
1339     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1340     */
1341    
1342     static const uschar *
1343     find_recurse(const uschar *code, BOOL utf8)
1344     {
1345     for (;;)
1346     {
1347     register int c = *code;
1348     if (c == OP_END) return NULL;
1349 nigel 91 if (c == OP_RECURSE) return code;
1350    
1351     /* XCLASS is used for classes that cannot be represented just by a bit
1352     map. This includes negated single high-valued characters. The length in
1353     the table is zero; the actual length is stored in the compiled code. */
1354    
1355     if (c == OP_XCLASS) code += GET(code, 1);
1356    
1357     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1358     that are followed by a character may be followed by a multi-byte character.
1359 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1360     bytes. */
1361 nigel 91
1362 nigel 77 else
1363     {
1364     code += _pcre_OP_lengths[c];
1365 ph10 107 #ifdef SUPPORT_UTF8
1366 nigel 77 if (utf8) switch(c)
1367     {
1368     case OP_CHAR:
1369     case OP_CHARNC:
1370     case OP_EXACT:
1371     case OP_UPTO:
1372     case OP_MINUPTO:
1373 nigel 93 case OP_POSUPTO:
1374 nigel 77 case OP_STAR:
1375     case OP_MINSTAR:
1376 nigel 93 case OP_POSSTAR:
1377 nigel 77 case OP_PLUS:
1378     case OP_MINPLUS:
1379 nigel 93 case OP_POSPLUS:
1380 nigel 77 case OP_QUERY:
1381     case OP_MINQUERY:
1382 nigel 93 case OP_POSQUERY:
1383     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1384 nigel 77 break;
1385     }
1386 ph10 111 #endif
1387 nigel 77 }
1388     }
1389     }
1390    
1391    
1392    
1393     /*************************************************
1394     * Scan compiled branch for non-emptiness *
1395     *************************************************/
1396    
1397     /* This function scans through a branch of a compiled pattern to see whether it
1398 nigel 93 can match the empty string or not. It is called from could_be_empty()
1399     below and from compile_branch() when checking for an unlimited repeat of a
1400     group that can match nothing. Note that first_significant_code() skips over
1401     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1402     struck an inner bracket whose current branch will already have been scanned.
1403 nigel 77
1404     Arguments:
1405     code points to start of search
1406     endcode points to where to stop
1407     utf8 TRUE if in UTF8 mode
1408    
1409     Returns: TRUE if what is matched could be empty
1410     */
1411    
1412     static BOOL
1413     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1414     {
1415     register int c;
1416 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1417 nigel 77 code < endcode;
1418     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1419     {
1420     const uschar *ccode;
1421    
1422     c = *code;
1423 ph10 172
1424 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1425 nigel 77
1426 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1427     {
1428 ph10 172 code += _pcre_OP_lengths[c];
1429 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1430     c = *code;
1431     continue;
1432     }
1433    
1434     /* For other groups, scan the branches. */
1435 ph10 172
1436 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1437 nigel 77 {
1438     BOOL empty_branch;
1439     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1440    
1441     /* Scan a closed bracket */
1442    
1443     empty_branch = FALSE;
1444     do
1445     {
1446     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1447     empty_branch = TRUE;
1448     code += GET(code, 1);
1449     }
1450     while (*code == OP_ALT);
1451     if (!empty_branch) return FALSE; /* All branches are non-empty */
1452 ph10 172 c = *code;
1453 nigel 93 continue;
1454 nigel 77 }
1455    
1456 nigel 93 /* Handle the other opcodes */
1457    
1458     switch (c)
1459 nigel 77 {
1460     /* Check for quantifiers after a class */
1461    
1462     #ifdef SUPPORT_UTF8
1463     case OP_XCLASS:
1464     ccode = code + GET(code, 1);
1465     goto CHECK_CLASS_REPEAT;
1466     #endif
1467    
1468     case OP_CLASS:
1469     case OP_NCLASS:
1470     ccode = code + 33;
1471    
1472     #ifdef SUPPORT_UTF8
1473     CHECK_CLASS_REPEAT:
1474     #endif
1475    
1476     switch (*ccode)
1477     {
1478     case OP_CRSTAR: /* These could be empty; continue */
1479     case OP_CRMINSTAR:
1480     case OP_CRQUERY:
1481     case OP_CRMINQUERY:
1482     break;
1483    
1484     default: /* Non-repeat => class must match */
1485     case OP_CRPLUS: /* These repeats aren't empty */
1486     case OP_CRMINPLUS:
1487     return FALSE;
1488    
1489     case OP_CRRANGE:
1490     case OP_CRMINRANGE:
1491     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1492     break;
1493     }
1494     break;
1495    
1496     /* Opcodes that must match a character */
1497    
1498     case OP_PROP:
1499     case OP_NOTPROP:
1500     case OP_EXTUNI:
1501     case OP_NOT_DIGIT:
1502     case OP_DIGIT:
1503     case OP_NOT_WHITESPACE:
1504     case OP_WHITESPACE:
1505     case OP_NOT_WORDCHAR:
1506     case OP_WORDCHAR:
1507     case OP_ANY:
1508     case OP_ANYBYTE:
1509     case OP_CHAR:
1510     case OP_CHARNC:
1511     case OP_NOT:
1512     case OP_PLUS:
1513     case OP_MINPLUS:
1514 nigel 93 case OP_POSPLUS:
1515 nigel 77 case OP_EXACT:
1516     case OP_NOTPLUS:
1517     case OP_NOTMINPLUS:
1518 nigel 93 case OP_NOTPOSPLUS:
1519 nigel 77 case OP_NOTEXACT:
1520     case OP_TYPEPLUS:
1521     case OP_TYPEMINPLUS:
1522 nigel 93 case OP_TYPEPOSPLUS:
1523 nigel 77 case OP_TYPEEXACT:
1524     return FALSE;
1525    
1526     /* End of branch */
1527    
1528     case OP_KET:
1529     case OP_KETRMAX:
1530     case OP_KETRMIN:
1531     case OP_ALT:
1532     return TRUE;
1533    
1534 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1535     MINUPTO, and POSUPTO may be followed by a multibyte character */
1536 nigel 77
1537     #ifdef SUPPORT_UTF8
1538     case OP_STAR:
1539     case OP_MINSTAR:
1540 nigel 93 case OP_POSSTAR:
1541 nigel 77 case OP_QUERY:
1542     case OP_MINQUERY:
1543 nigel 93 case OP_POSQUERY:
1544 nigel 77 case OP_UPTO:
1545     case OP_MINUPTO:
1546 nigel 93 case OP_POSUPTO:
1547 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1548     break;
1549     #endif
1550     }
1551     }
1552    
1553     return TRUE;
1554     }
1555    
1556    
1557    
1558     /*************************************************
1559     * Scan compiled regex for non-emptiness *
1560     *************************************************/
1561    
1562     /* This function is called to check for left recursive calls. We want to check
1563     the current branch of the current pattern to see if it could match the empty
1564     string. If it could, we must look outwards for branches at other levels,
1565     stopping when we pass beyond the bracket which is the subject of the recursion.
1566    
1567     Arguments:
1568     code points to start of the recursion
1569     endcode points to where to stop (current RECURSE item)
1570     bcptr points to the chain of current (unclosed) branch starts
1571     utf8 TRUE if in UTF-8 mode
1572    
1573     Returns: TRUE if what is matched could be empty
1574     */
1575    
1576     static BOOL
1577     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1578     BOOL utf8)
1579     {
1580     while (bcptr != NULL && bcptr->current >= code)
1581     {
1582     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1583     bcptr = bcptr->outer;
1584     }
1585     return TRUE;
1586     }
1587    
1588    
1589    
1590     /*************************************************
1591     * Check for POSIX class syntax *
1592     *************************************************/
1593    
1594     /* This function is called when the sequence "[:" or "[." or "[=" is
1595     encountered in a character class. It checks whether this is followed by an
1596     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1597     ".]" or "=]".
1598    
1599     Argument:
1600     ptr pointer to the initial [
1601     endptr where to return the end pointer
1602     cd pointer to compile data
1603    
1604     Returns: TRUE or FALSE
1605     */
1606    
1607     static BOOL
1608     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1609     {
1610     int terminator; /* Don't combine these lines; the Solaris cc */
1611     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1612     if (*(++ptr) == '^') ptr++;
1613     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1614     if (*ptr == terminator && ptr[1] == ']')
1615     {
1616     *endptr = ptr;
1617     return TRUE;
1618     }
1619     return FALSE;
1620     }
1621    
1622    
1623    
1624    
1625     /*************************************************
1626     * Check POSIX class name *
1627     *************************************************/
1628    
1629     /* This function is called to check the name given in a POSIX-style class entry
1630     such as [:alnum:].
1631    
1632     Arguments:
1633     ptr points to the first letter
1634     len the length of the name
1635    
1636     Returns: a value representing the name, or -1 if unknown
1637     */
1638    
1639     static int
1640     check_posix_name(const uschar *ptr, int len)
1641     {
1642     register int yield = 0;
1643     while (posix_name_lengths[yield] != 0)
1644     {
1645     if (len == posix_name_lengths[yield] &&
1646     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1647     yield++;
1648     }
1649     return -1;
1650     }
1651    
1652    
1653     /*************************************************
1654     * Adjust OP_RECURSE items in repeated group *
1655     *************************************************/
1656    
1657     /* OP_RECURSE items contain an offset from the start of the regex to the group
1658     that is referenced. This means that groups can be replicated for fixed
1659     repetition simply by copying (because the recursion is allowed to refer to
1660     earlier groups that are outside the current group). However, when a group is
1661     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1662     it, after it has been compiled. This means that any OP_RECURSE items within it
1663     that refer to the group itself or any contained groups have to have their
1664 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1665     the partially compiled regex must be temporarily terminated with OP_END.
1666 nigel 77
1667 nigel 93 This function has been extended with the possibility of forward references for
1668     recursions and subroutine calls. It must also check the list of such references
1669     for the group we are dealing with. If it finds that one of the recursions in
1670     the current group is on this list, it adjusts the offset in the list, not the
1671     value in the reference (which is a group number).
1672    
1673 nigel 77 Arguments:
1674     group points to the start of the group
1675     adjust the amount by which the group is to be moved
1676     utf8 TRUE in UTF-8 mode
1677     cd contains pointers to tables etc.
1678 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1679 nigel 77
1680     Returns: nothing
1681     */
1682    
1683     static void
1684 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1685     uschar *save_hwm)
1686 nigel 77 {
1687     uschar *ptr = group;
1688     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1689     {
1690 nigel 93 int offset;
1691     uschar *hc;
1692    
1693     /* See if this recursion is on the forward reference list. If so, adjust the
1694     reference. */
1695    
1696     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1697     {
1698     offset = GET(hc, 0);
1699     if (cd->start_code + offset == ptr + 1)
1700     {
1701     PUT(hc, 0, offset + adjust);
1702     break;
1703     }
1704     }
1705    
1706     /* Otherwise, adjust the recursion offset if it's after the start of this
1707     group. */
1708    
1709     if (hc >= cd->hwm)
1710     {
1711     offset = GET(ptr, 1);
1712     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1713     }
1714    
1715 nigel 77 ptr += 1 + LINK_SIZE;
1716     }
1717     }
1718    
1719    
1720    
1721     /*************************************************
1722     * Insert an automatic callout point *
1723     *************************************************/
1724    
1725     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1726     callout points before each pattern item.
1727    
1728     Arguments:
1729     code current code pointer
1730     ptr current pattern pointer
1731     cd pointers to tables etc
1732    
1733     Returns: new code pointer
1734     */
1735    
1736     static uschar *
1737     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1738     {
1739     *code++ = OP_CALLOUT;
1740     *code++ = 255;
1741     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1742     PUT(code, LINK_SIZE, 0); /* Default length */
1743     return code + 2*LINK_SIZE;
1744     }
1745    
1746    
1747    
1748     /*************************************************
1749     * Complete a callout item *
1750     *************************************************/
1751    
1752     /* A callout item contains the length of the next item in the pattern, which
1753     we can't fill in till after we have reached the relevant point. This is used
1754     for both automatic and manual callouts.
1755    
1756     Arguments:
1757     previous_callout points to previous callout item
1758     ptr current pattern pointer
1759     cd pointers to tables etc
1760    
1761     Returns: nothing
1762     */
1763    
1764     static void
1765     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1766     {
1767     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1768     PUT(previous_callout, 2 + LINK_SIZE, length);
1769     }
1770    
1771    
1772    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), this
function scans upwards for the next run of characters whose "other case" values
are consecutive. Each call yields one such run and advances the start value so
that a following call continues from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Step over characters that have no other case. */

while (ch <= d && (first_other = _pcre_ucp_othercase(ch)) == NOTACHAR) ch++;

/* Nothing left in the range: *cptr is deliberately left untouched. */

if (ch > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the run for as long as the other-case values stay consecutive. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1818    
1819    
1820 nigel 93
1821 nigel 77 /*************************************************
1822 nigel 93 * Check if auto-possessifying is possible *
1823     *************************************************/
1824    
1825     /* This function is called for unlimited repeats of certain items, to see
1826     whether the next thing could possibly match the repeated item. If not, it makes
1827     sense to automatically possessify the repeated item.
1828    
1829     Arguments:
1830     op_code the repeated op code
1831     this data for this item, depends on the opcode
1832     utf8 TRUE in UTF-8 mode
1833     utf8_char used for utf8 character bytes, NULL if not relevant
1834     ptr next character in pattern
1835     options options bits
1836     cd contains pointers to tables etc.
1837    
1838     Returns: TRUE if possessifying is wanted
1839     */
1840    
1841     static BOOL
1842     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1843     const uschar *ptr, int options, compile_data *cd)
1844     {
1845     int next;
1846    
1847     /* Skip whitespace and comments in extended mode */
1848    
1849     if ((options & PCRE_EXTENDED) != 0)
1850     {
1851     for (;;)
1852     {
1853     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1854     if (*ptr == '#')
1855     {
1856     while (*(++ptr) != 0)
1857     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1858     }
1859     else break;
1860     }
1861     }
1862    
1863     /* If the next item is one that we can handle, get its value. A non-negative
1864     value is a character, a negative value is an escape value. */
1865    
1866     if (*ptr == '\\')
1867     {
1868     int temperrorcode = 0;
1869     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1870     if (temperrorcode != 0) return FALSE;
1871     ptr++; /* Point after the escape sequence */
1872     }
1873    
1874     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1875     {
1876     #ifdef SUPPORT_UTF8
1877     if (utf8) { GETCHARINC(next, ptr); } else
1878     #endif
1879     next = *ptr++;
1880     }
1881    
1882     else return FALSE;
1883    
1884     /* Skip whitespace and comments in extended mode */
1885    
1886     if ((options & PCRE_EXTENDED) != 0)
1887     {
1888     for (;;)
1889     {
1890     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1891     if (*ptr == '#')
1892     {
1893     while (*(++ptr) != 0)
1894     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1895     }
1896     else break;
1897     }
1898     }
1899    
1900     /* If the next thing is itself optional, we have to give up. */
1901    
1902     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1903     return FALSE;
1904    
1905     /* Now compare the next item with the previous opcode. If the previous is a
1906     positive single character match, "item" either contains the character or, if
1907     "item" is greater than 127 in utf8 mode, the character's bytes are in
1908     utf8_char. */
1909    
1910    
1911     /* Handle cases when the next item is a character. */
1912    
1913     if (next >= 0) switch(op_code)
1914     {
1915     case OP_CHAR:
1916     #ifdef SUPPORT_UTF8
1917     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1918     #endif
1919     return item != next;
1920    
1921     /* For CHARNC (caseless character) we must check the other case. If we have
1922     Unicode property support, we can use it to test the other case of
1923     high-valued characters. */
1924    
1925     case OP_CHARNC:
1926     #ifdef SUPPORT_UTF8
1927     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1928     #endif
1929     if (item == next) return FALSE;
1930     #ifdef SUPPORT_UTF8
1931     if (utf8)
1932     {
1933     unsigned int othercase;
1934     if (next < 128) othercase = cd->fcc[next]; else
1935     #ifdef SUPPORT_UCP
1936     othercase = _pcre_ucp_othercase((unsigned int)next);
1937     #else
1938     othercase = NOTACHAR;
1939     #endif
1940     return (unsigned int)item != othercase;
1941     }
1942     else
1943     #endif /* SUPPORT_UTF8 */
1944     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1945    
1946     /* For OP_NOT, "item" must be a single-byte character. */
1947    
1948     case OP_NOT:
1949     if (next < 0) return FALSE; /* Not a character */
1950     if (item == next) return TRUE;
1951     if ((options & PCRE_CASELESS) == 0) return FALSE;
1952     #ifdef SUPPORT_UTF8
1953     if (utf8)
1954     {
1955     unsigned int othercase;
1956     if (next < 128) othercase = cd->fcc[next]; else
1957     #ifdef SUPPORT_UCP
1958     othercase = _pcre_ucp_othercase(next);
1959     #else
1960     othercase = NOTACHAR;
1961     #endif
1962     return (unsigned int)item == othercase;
1963     }
1964     else
1965     #endif /* SUPPORT_UTF8 */
1966     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1967    
1968     case OP_DIGIT:
1969     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1970    
1971     case OP_NOT_DIGIT:
1972     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1973    
1974     case OP_WHITESPACE:
1975     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1976    
1977     case OP_NOT_WHITESPACE:
1978     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1979    
1980     case OP_WORDCHAR:
1981     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1982    
1983     case OP_NOT_WORDCHAR:
1984     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1985    
1986 ph10 180 case OP_HSPACE:
1987     case OP_NOT_HSPACE:
1988     switch(next)
1989     {
1990     case 0x09:
1991     case 0x20:
1992     case 0xa0:
1993     case 0x1680:
1994     case 0x180e:
1995     case 0x2000:
1996     case 0x2001:
1997     case 0x2002:
1998     case 0x2003:
1999     case 0x2004:
2000     case 0x2005:
2001     case 0x2006:
2002     case 0x2007:
2003     case 0x2008:
2004     case 0x2009:
2005     case 0x200A:
2006     case 0x202f:
2007     case 0x205f:
2008     case 0x3000:
2009     return op_code != OP_HSPACE;
2010     default:
2011     return op_code == OP_HSPACE;
2012     }
2013    
2014     case OP_VSPACE:
2015     case OP_NOT_VSPACE:
2016     switch(next)
2017     {
2018     case 0x0a:
2019     case 0x0b:
2020     case 0x0c:
2021     case 0x0d:
2022     case 0x85:
2023     case 0x2028:
2024     case 0x2029:
2025     return op_code != OP_VSPACE;
2026     default:
2027     return op_code == OP_VSPACE;
2028     }
2029    
2030 nigel 93 default:
2031     return FALSE;
2032     }
2033    
2034    
2035     /* Handle the case when the next item is \d, \s, etc. */
2036    
2037     switch(op_code)
2038     {
2039     case OP_CHAR:
2040     case OP_CHARNC:
2041     #ifdef SUPPORT_UTF8
2042     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2043     #endif
2044     switch(-next)
2045     {
2046     case ESC_d:
2047     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2048    
2049     case ESC_D:
2050     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2051    
2052     case ESC_s:
2053     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2054    
2055     case ESC_S:
2056     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2057    
2058     case ESC_w:
2059     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2060    
2061     case ESC_W:
2062     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2063 ph10 182
2064 ph10 180 case ESC_h:
2065     case ESC_H:
2066     switch(item)
2067     {
2068     case 0x09:
2069     case 0x20:
2070     case 0xa0:
2071     case 0x1680:
2072     case 0x180e:
2073     case 0x2000:
2074     case 0x2001:
2075     case 0x2002:
2076     case 0x2003:
2077     case 0x2004:
2078     case 0x2005:
2079     case 0x2006:
2080     case 0x2007:
2081     case 0x2008:
2082     case 0x2009:
2083     case 0x200A:
2084     case 0x202f:
2085     case 0x205f:
2086     case 0x3000:
2087     return -next != ESC_h;
2088     default:
2089     return -next == ESC_h;
2090 ph10 182 }
2091    
2092 ph10 180 case ESC_v:
2093     case ESC_V:
2094     switch(item)
2095     {
2096     case 0x0a:
2097     case 0x0b:
2098     case 0x0c:
2099     case 0x0d:
2100     case 0x85:
2101     case 0x2028:
2102     case 0x2029:
2103     return -next != ESC_v;
2104     default:
2105     return -next == ESC_v;
2106 ph10 182 }
2107 nigel 93
2108     default:
2109     return FALSE;
2110     }
2111    
2112     case OP_DIGIT:
2113 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2114     next == -ESC_h || next == -ESC_v;
2115 nigel 93
2116     case OP_NOT_DIGIT:
2117     return next == -ESC_d;
2118    
2119     case OP_WHITESPACE:
2120     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2121    
2122     case OP_NOT_WHITESPACE:
2123 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2124 nigel 93
2125 ph10 180 case OP_HSPACE:
2126     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2127    
2128     case OP_NOT_HSPACE:
2129     return next == -ESC_h;
2130 ph10 182
2131 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2132 ph10 182 case OP_VSPACE:
2133 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2134    
2135     case OP_NOT_VSPACE:
2136 ph10 182 return next == -ESC_v;
2137 ph10 180
2138 nigel 93 case OP_WORDCHAR:
2139 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2140 nigel 93
2141     case OP_NOT_WORDCHAR:
2142     return next == -ESC_w || next == -ESC_d;
2143 ph10 182
2144 nigel 93 default:
2145     return FALSE;
2146     }
2147    
2148     /* Control does not reach here */
2149     }
2150    
2151    
2152    
2153     /*************************************************
2154 nigel 77 * Compile one branch *
2155     *************************************************/
2156    
2157 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2158 nigel 77 changed during the branch, the pointer is used to change the external options
2159 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2160     to find out the amount of memory needed, as well as during the real compile
2161     phase. The value of lengthptr distinguishes the two phases.
2162 nigel 77
2163     Arguments:
2164     optionsptr pointer to the option bits
2165     codeptr points to the pointer to the current code point
2166     ptrptr points to the current pattern pointer
2167     errorcodeptr points to error code variable
2168     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2169     reqbyteptr set to the last literal character required, else < 0
2170     bcptr points to current branch chain
2171     cd contains pointers to tables etc.
2172 nigel 93 lengthptr NULL during the real compile phase
2173     points to length accumulator during pre-compile phase
2174 nigel 77
2175     Returns: TRUE on success
2176     FALSE, with *errorcodeptr set non-zero on error
2177     */
2178    
2179     static BOOL
2180 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2181     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2182     compile_data *cd, int *lengthptr)
2183 nigel 77 {
2184     int repeat_type, op_type;
2185     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2186     int bravalue = 0;
2187     int greedy_default, greedy_non_default;
2188     int firstbyte, reqbyte;
2189     int zeroreqbyte, zerofirstbyte;
2190     int req_caseopt, reqvary, tempreqvary;
2191     int options = *optionsptr;
2192     int after_manual_callout = 0;
2193 nigel 93 int length_prevgroup = 0;
2194 nigel 77 register int c;
2195     register uschar *code = *codeptr;
2196 nigel 93 uschar *last_code = code;
2197     uschar *orig_code = code;
2198 nigel 77 uschar *tempcode;
2199     BOOL inescq = FALSE;
2200     BOOL groupsetfirstbyte = FALSE;
2201     const uschar *ptr = *ptrptr;
2202     const uschar *tempptr;
2203     uschar *previous = NULL;
2204     uschar *previous_callout = NULL;
2205 nigel 93 uschar *save_hwm = NULL;
2206 nigel 77 uschar classbits[32];
2207    
2208     #ifdef SUPPORT_UTF8
2209     BOOL class_utf8;
2210     BOOL utf8 = (options & PCRE_UTF8) != 0;
2211     uschar *class_utf8data;
2212     uschar utf8_char[6];
2213     #else
2214     BOOL utf8 = FALSE;
2215 nigel 93 uschar *utf8_char = NULL;
2216 nigel 77 #endif
2217    
2218 nigel 93 #ifdef DEBUG
2219     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2220     #endif
2221    
2222 nigel 77 /* Set up the default and non-default settings for greediness */
2223    
2224     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2225     greedy_non_default = greedy_default ^ 1;
2226    
2227     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2228     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2229     matches a non-fixed char first char; reqbyte just remains unset if we never
2230     find one.
2231    
2232     When we hit a repeat whose minimum is zero, we may have to adjust these values
2233     to take the zero repeat into account. This is implemented by setting them to
2234     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2235     item types that can be repeated set these backoff variables appropriately. */
2236    
2237     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2238    
2239     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2240     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2241     value > 255. It is added into the firstbyte or reqbyte variables to record the
2242     case status of the value. This is used only for ASCII characters. */
2243    
2244     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2245    
2246     /* Switch on next character until the end of the branch */
2247    
2248     for (;; ptr++)
2249     {
2250     BOOL negate_class;
2251     BOOL possessive_quantifier;
2252     BOOL is_quantifier;
2253 nigel 93 BOOL is_recurse;
2254 ph10 180 BOOL reset_bracount;
2255 nigel 77 int class_charcount;
2256     int class_lastchar;
2257     int newoptions;
2258     int recno;
2259 ph10 172 int refsign;
2260 nigel 77 int skipbytes;
2261     int subreqbyte;
2262     int subfirstbyte;
2263 nigel 93 int terminator;
2264 nigel 77 int mclength;
2265     uschar mcbuffer[8];
2266    
2267 nigel 93 /* Get next byte in the pattern */
2268 nigel 77
2269     c = *ptr;
2270    
2271 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2272     previous cycle of this loop. */
2273    
2274     if (lengthptr != NULL)
2275     {
2276     #ifdef DEBUG
2277     if (code > cd->hwm) cd->hwm = code; /* High water info */
2278     #endif
2279     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2280     {
2281     *errorcodeptr = ERR52;
2282     goto FAILED;
2283     }
2284    
2285     /* There is at least one situation where code goes backwards: this is the
2286     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2287     the class is simply eliminated. However, it is created first, so we have to
2288     allow memory for it. Therefore, don't ever reduce the length at this point.
2289     */
2290    
2291     if (code < last_code) code = last_code;
2292 ph10 202
2293     /* Paranoid check for integer overflow */
2294    
2295     if (OFLOW_MAX - *lengthptr < code - last_code)
2296     {
2297     *errorcodeptr = ERR20;
2298     goto FAILED;
2299     }
2300    
2301 nigel 93 *lengthptr += code - last_code;
2302     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2303    
2304     /* If "previous" is set and it is not at the start of the work space, move
2305     it back to there, in order to avoid filling up the work space. Otherwise,
2306     if "previous" is NULL, reset the current code pointer to the start. */
2307    
2308     if (previous != NULL)
2309     {
2310     if (previous > orig_code)
2311     {
2312     memmove(orig_code, previous, code - previous);
2313     code -= previous - orig_code;
2314     previous = orig_code;
2315     }
2316     }
2317     else code = orig_code;
2318    
2319     /* Remember where this code item starts so we can pick up the length
2320     next time round. */
2321    
2322     last_code = code;
2323     }
2324    
2325     /* In the real compile phase, just check the workspace used by the forward
2326     reference list. */
2327    
2328     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2329     {
2330     *errorcodeptr = ERR52;
2331     goto FAILED;
2332     }
2333    
2334 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2335    
2336     if (inescq && c != 0)
2337     {
2338     if (c == '\\' && ptr[1] == 'E')
2339     {
2340     inescq = FALSE;
2341     ptr++;
2342     continue;
2343     }
2344     else
2345     {
2346     if (previous_callout != NULL)
2347     {
2348 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2349     complete_callout(previous_callout, ptr, cd);
2350 nigel 77 previous_callout = NULL;
2351     }
2352     if ((options & PCRE_AUTO_CALLOUT) != 0)
2353     {
2354     previous_callout = code;
2355     code = auto_callout(code, ptr, cd);
2356     }
2357     goto NORMAL_CHAR;
2358     }
2359     }
2360    
2361     /* Fill in length of a previous callout, except when the next thing is
2362     a quantifier. */
2363    
2364     is_quantifier = c == '*' || c == '+' || c == '?' ||
2365     (c == '{' && is_counted_repeat(ptr+1));
2366    
2367     if (!is_quantifier && previous_callout != NULL &&
2368     after_manual_callout-- <= 0)
2369     {
2370 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2371     complete_callout(previous_callout, ptr, cd);
2372 nigel 77 previous_callout = NULL;
2373     }
2374    
2375     /* In extended mode, skip white space and comments */
2376    
2377     if ((options & PCRE_EXTENDED) != 0)
2378     {
2379     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2380     if (c == '#')
2381     {
2382 nigel 93 while (*(++ptr) != 0)
2383 nigel 91 {
2384 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2385 nigel 91 }
2386 nigel 93 if (*ptr != 0) continue;
2387    
2388 nigel 91 /* Else fall through to handle end of string */
2389     c = 0;
2390 nigel 77 }
2391     }
2392    
2393     /* No auto callout for quantifiers. */
2394    
2395     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2396     {
2397     previous_callout = code;
2398     code = auto_callout(code, ptr, cd);
2399     }
2400    
2401     switch(c)
2402     {
2403 nigel 93 /* ===================================================================*/
2404     case 0: /* The branch terminates at string end */
2405     case '|': /* or | or ) */
2406 nigel 77 case ')':
2407     *firstbyteptr = firstbyte;
2408     *reqbyteptr = reqbyte;
2409     *codeptr = code;
2410     *ptrptr = ptr;
2411 nigel 93 if (lengthptr != NULL)
2412     {
2413 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2414     {
2415     *errorcodeptr = ERR20;
2416     goto FAILED;
2417     }
2418 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2419     DPRINTF((">> end branch\n"));
2420     }
2421 nigel 77 return TRUE;
2422    
2423 nigel 93
2424     /* ===================================================================*/
2425 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2426     the setting of any following char as a first character. */
2427    
2428     case '^':
2429     if ((options & PCRE_MULTILINE) != 0)
2430     {
2431     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2432     }
2433     previous = NULL;
2434     *code++ = OP_CIRC;
2435     break;
2436    
2437     case '$':
2438     previous = NULL;
2439     *code++ = OP_DOLL;
2440     break;
2441    
2442     /* There can never be a first char if '.' is first, whatever happens about
2443     repeats. The value of reqbyte doesn't change either. */
2444    
2445     case '.':
2446     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2447     zerofirstbyte = firstbyte;
2448     zeroreqbyte = reqbyte;
2449     previous = code;
2450     *code++ = OP_ANY;
2451     break;
2452    
2453 nigel 93
2454     /* ===================================================================*/
2455 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2456     32-byte bitmap of the permitted characters, except in the special case
2457     where there is only one such character. For negated classes, we build the
2458     map as usual, then invert it at the end. However, we use a different opcode
2459     so that data characters > 255 can be handled correctly.
2460 nigel 77
2461     If the class contains characters outside the 0-255 range, a different
2462     opcode is compiled. It may optionally have a bit map for characters < 256,
2463     but those above are explicitly listed afterwards. A flag byte tells
2464     whether the bitmap is present, and whether this is a negated class or not.
2465     */
2466    
2467     case '[':
2468     previous = code;
2469    
2470     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2471     they are encountered at the top level, so we'll do that too. */
2472    
2473     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2474     check_posix_syntax(ptr, &tempptr, cd))
2475     {
2476     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2477     goto FAILED;
2478     }
2479    
2480 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2481 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2482 ph10 205 skip them too. This makes for compatibility with Perl. */
2483 ph10 208
2484 ph10 205 negate_class = FALSE;
2485     for (;;)
2486 nigel 77 {
2487     c = *(++ptr);
2488 ph10 205 if (c == '\\')
2489     {
2490 ph10 208 if (ptr[1] == 'E') ptr++;
2491 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2492 ph10 208 else break;
2493 ph10 205 }
2494     else if (!negate_class && c == '^')
2495     negate_class = TRUE;
2496     else break;
2497 ph10 208 }
2498 nigel 77
2499     /* Keep a count of chars with values < 256 so that we can optimize the case
2500 nigel 93 of just a single character (as long as it's < 256). However, For higher
2501     valued UTF-8 characters, we don't yet do any optimization. */
2502 nigel 77
2503     class_charcount = 0;
2504     class_lastchar = -1;
2505    
2506 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2507     temporary bit of memory, in case the class contains only 1 character (less
2508     than 256), because in that case the compiled code doesn't use the bit map.
2509     */
2510    
2511     memset(classbits, 0, 32 * sizeof(uschar));
2512    
2513 nigel 77 #ifdef SUPPORT_UTF8
2514     class_utf8 = FALSE; /* No chars >= 256 */
2515 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2516 nigel 77 #endif
2517    
2518     /* Process characters until ] is reached. By writing this as a "do" it
2519 nigel 93 means that an initial ] is taken as a data character. At the start of the
2520     loop, c contains the first byte of the character. */
2521 nigel 77
2522 nigel 93 if (c != 0) do
2523 nigel 77 {
2524 nigel 93 const uschar *oldptr;
2525    
2526 nigel 77 #ifdef SUPPORT_UTF8
2527     if (utf8 && c > 127)
2528     { /* Braces are required because the */
2529     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2530     }
2531     #endif
2532    
2533     /* Inside \Q...\E everything is literal except \E */
2534    
2535     if (inescq)
2536     {
2537 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2538 nigel 77 {
2539 nigel 93 inescq = FALSE; /* Reset literal state */
2540     ptr++; /* Skip the 'E' */
2541     continue; /* Carry on with next */
2542 nigel 77 }
2543 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2544 nigel 77 }
2545    
2546     /* Handle POSIX class names. Perl allows a negation extension of the
2547     form [:^name:]. A square bracket that doesn't match the syntax is
2548     treated as a literal. We also recognize the POSIX constructions
2549     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2550     5.6 and 5.8 do. */
2551    
2552     if (c == '[' &&
2553     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2554     check_posix_syntax(ptr, &tempptr, cd))
2555     {
2556     BOOL local_negate = FALSE;
2557 nigel 87 int posix_class, taboffset, tabopt;
2558 nigel 77 register const uschar *cbits = cd->cbits;
2559 nigel 87 uschar pbits[32];
2560 nigel 77
2561     if (ptr[1] != ':')
2562     {
2563     *errorcodeptr = ERR31;
2564     goto FAILED;
2565     }
2566    
2567     ptr += 2;
2568     if (*ptr == '^')
2569     {
2570     local_negate = TRUE;
2571     ptr++;
2572     }
2573    
2574     posix_class = check_posix_name(ptr, tempptr - ptr);
2575     if (posix_class < 0)
2576     {
2577     *errorcodeptr = ERR30;
2578     goto FAILED;
2579     }
2580    
2581     /* If matching is caseless, upper and lower are converted to
2582     alpha. This relies on the fact that the class table starts with
2583     alpha, lower, upper as the first 3 entries. */
2584    
2585     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2586     posix_class = 0;
2587    
2588 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2589     because we may be adding and subtracting from it, and we don't want to
2590     subtract bits that may be in the main map already. At the end we or the
2591     result into the bit map that is being built. */
2592 nigel 77
2593     posix_class *= 3;
2594 nigel 87
2595     /* Copy in the first table (always present) */
2596    
2597     memcpy(pbits, cbits + posix_class_maps[posix_class],
2598     32 * sizeof(uschar));
2599    
2600     /* If there is a second table, add or remove it as required. */
2601    
2602     taboffset = posix_class_maps[posix_class + 1];
2603     tabopt = posix_class_maps[posix_class + 2];
2604    
2605     if (taboffset >= 0)
2606 nigel 77 {
2607 nigel 87 if (tabopt >= 0)
2608     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2609 nigel 77 else
2610 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2611 nigel 77 }
2612    
2613 nigel 87 /* Now see if we need to remove any special characters. An option
2614     value of 1 removes vertical space and 2 removes underscore. */
2615    
2616     if (tabopt < 0) tabopt = -tabopt;
2617     if (tabopt == 1) pbits[1] &= ~0x3c;
2618     else if (tabopt == 2) pbits[11] &= 0x7f;
2619    
2620     /* Add the POSIX table or its complement into the main table that is
2621     being built and we are done. */
2622    
2623     if (local_negate)
2624     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2625     else
2626     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2627    
2628 nigel 77 ptr = tempptr + 1;
2629     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2630     continue; /* End of POSIX syntax handling */
2631     }
2632    
2633     /* Backslash may introduce a single character, or it may introduce one
2634 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2635     case. Inside a class (and only there) it is treated as backspace.
2636     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2637 ph10 205 to 'or' into the one we are building. We assume they have more than one
2638 nigel 77 character in them, so set class_charcount bigger than one. */
2639    
2640     if (c == '\\')
2641     {
2642 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2643     if (*errorcodeptr != 0) goto FAILED;
2644 nigel 77
2645     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2646     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2647 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2648 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2649     {
2650     if (ptr[1] == '\\' && ptr[2] == 'E')
2651     {
2652     ptr += 2; /* avoid empty string */
2653     }
2654     else inescq = TRUE;
2655     continue;
2656     }
2657    
2658     if (c < 0)
2659     {
2660     register const uschar *cbits = cd->cbits;
2661     class_charcount += 2; /* Greater than 1 is what matters */
2662 nigel 93
2663     /* Save time by not doing this in the pre-compile phase. */
2664    
2665     if (lengthptr == NULL) switch (-c)
2666 nigel 77 {
2667     case ESC_d:
2668     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2669     continue;
2670    
2671     case ESC_D:
2672     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2673     continue;
2674    
2675     case ESC_w:
2676     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2677     continue;
2678    
2679     case ESC_W:
2680     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2681     continue;
2682    
2683     case ESC_s:
2684     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2685     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2686     continue;
2687    
2688     case ESC_S:
2689     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2690     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2691     continue;
2692    
2693 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2694     continue;
2695 ph10 180
2696 nigel 93 default: /* Not recognized; fall through */
2697     break; /* Need "default" setting to stop compiler warning. */
2698     }
2699    
2700     /* In the pre-compile phase, just do the recognition. */
2701    
2702     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2703     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2704 ph10 180
2705 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2706     they use extra memory. */
2707 ph10 180
2708 ph10 178 if (-c == ESC_h)
2709     {
2710     SETBIT(classbits, 0x09); /* HT */
2711     SETBIT(classbits, 0x20); /* SPACE */
2712 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2713 ph10 178 #ifdef SUPPORT_UTF8
2714     if (utf8)
2715 ph10 180 {
2716 ph10 178 class_utf8 = TRUE;
2717     *class_utf8data++ = XCL_SINGLE;
2718 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2719 ph10 178 *class_utf8data++ = XCL_SINGLE;
2720 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2721     *class_utf8data++ = XCL_RANGE;
2722     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2723     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2724 ph10 178 *class_utf8data++ = XCL_SINGLE;
2725 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2726 ph10 178 *class_utf8data++ = XCL_SINGLE;
2727 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2728 ph10 178 *class_utf8data++ = XCL_SINGLE;
2729 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2730     }
2731     #endif
2732     continue;
2733     }
2734 nigel 93
2735 ph10 178 if (-c == ESC_H)
2736     {
2737     for (c = 0; c < 32; c++)
2738     {
2739     int x = 0xff;
2740     switch (c)
2741 ph10 180 {
2742 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2743     case 0x20/8: x ^= 1 << (0x20%8); break;
2744     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2745     default: break;
2746     }
2747     classbits[c] |= x;
2748 ph10 180 }
2749    
2750 ph10 178 #ifdef SUPPORT_UTF8
2751     if (utf8)
2752 ph10 180 {
2753 ph10 178 class_utf8 = TRUE;
2754 ph10 180 *class_utf8data++ = XCL_RANGE;
2755     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2756     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2757     *class_utf8data++ = XCL_RANGE;
2758     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2759     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2760     *class_utf8data++ = XCL_RANGE;
2761     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2762     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2763     *class_utf8data++ = XCL_RANGE;
2764     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2765     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2766     *class_utf8data++ = XCL_RANGE;
2767     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2768     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2769     *class_utf8data++ = XCL_RANGE;
2770     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2771     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2772     *class_utf8data++ = XCL_RANGE;
2773     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2774     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775     }
2776     #endif
2777     continue;
2778     }
2779 ph10 178
2780     if (-c == ESC_v)
2781     {
2782     SETBIT(classbits, 0x0a); /* LF */
2783     SETBIT(classbits, 0x0b); /* VT */
2784 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2785     SETBIT(classbits, 0x0d); /* CR */
2786     SETBIT(classbits, 0x85); /* NEL */
2787 ph10 178 #ifdef SUPPORT_UTF8
2788     if (utf8)
2789 ph10 180 {
2790 ph10 178 class_utf8 = TRUE;
2791 ph10 180 *class_utf8data++ = XCL_RANGE;
2792     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2793     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2794     }
2795     #endif
2796     continue;
2797     }
2798 ph10 178
2799     if (-c == ESC_V)
2800     {
2801     for (c = 0; c < 32; c++)
2802     {
2803     int x = 0xff;
2804     switch (c)
2805 ph10 180 {
2806 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2807     x ^= 1 << (0x0b%8);
2808     x ^= 1 << (0x0c%8);
2809 ph10 180 x ^= 1 << (0x0d%8);
2810 ph10 178 break;
2811     case 0x85/8: x ^= 1 << (0x85%8); break;
2812     default: break;
2813     }
2814     classbits[c] |= x;
2815 ph10 180 }
2816    
2817 ph10 178 #ifdef SUPPORT_UTF8
2818     if (utf8)
2819 ph10 180 {
2820 ph10 178 class_utf8 = TRUE;
2821 ph10 180 *class_utf8data++ = XCL_RANGE;
2822     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2823     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2824     *class_utf8data++ = XCL_RANGE;
2825     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2826     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2827     }
2828     #endif
2829     continue;
2830     }
2831 ph10 178
2832 nigel 93 /* We need to deal with \P and \p in both phases. */
2833    
2834 nigel 77 #ifdef SUPPORT_UCP
2835 nigel 93 if (-c == ESC_p || -c == ESC_P)
2836     {
2837     BOOL negated;
2838     int pdata;
2839     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2840     if (ptype < 0) goto FAILED;
2841     class_utf8 = TRUE;
2842     *class_utf8data++ = ((-c == ESC_p) != negated)?
2843     XCL_PROP : XCL_NOTPROP;
2844     *class_utf8data++ = ptype;
2845     *class_utf8data++ = pdata;
2846     class_charcount -= 2; /* Not a < 256 character */
2847 nigel 77 continue;
2848 nigel 93 }
2849 nigel 77 #endif
2850 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2851     strict mode. By default, for compatibility with Perl, they are
2852     treated as literals. */
2853 nigel 77
2854 nigel 93 if ((options & PCRE_EXTRA) != 0)
2855     {
2856     *errorcodeptr = ERR7;
2857     goto FAILED;
2858     }
2859 nigel 77
2860 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2861     c = *ptr; /* Get the final character and fall through */
2862 nigel 77 }
2863    
2864     /* Fall through if we have a single character (c >= 0). This may be
2865 nigel 93 greater than 256 in UTF-8 mode. */
2866 nigel 77
2867     } /* End of backslash handling */
2868    
2869     /* A single character may be followed by '-' to form a range. However,
2870     Perl does not permit ']' to be the end of the range. A '-' character
2871 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2872     entirely. The code for handling \Q and \E is messy. */
2873 nigel 77
2874 nigel 93 CHECK_RANGE:
2875     while (ptr[1] == '\\' && ptr[2] == 'E')
2876 nigel 77 {
2877 nigel 93 inescq = FALSE;
2878     ptr += 2;
2879     }
2880    
2881     oldptr = ptr;
2882    
2883     if (!inescq && ptr[1] == '-')
2884     {
2885 nigel 77 int d;
2886     ptr += 2;
2887 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2888 nigel 77
2889 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2890     mode. */
2891    
2892     while (*ptr == '\\' && ptr[1] == 'Q')
2893     {
2894     ptr += 2;
2895     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2896     inescq = TRUE;
2897     break;
2898     }
2899    
2900     if (*ptr == 0 || (!inescq && *ptr == ']'))
2901     {
2902     ptr = oldptr;
2903     goto LONE_SINGLE_CHARACTER;
2904     }
2905    
2906 nigel 77 #ifdef SUPPORT_UTF8
2907     if (utf8)
2908     { /* Braces are required because the */
2909     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2910     }
2911     else
2912     #endif
2913     d = *ptr; /* Not UTF-8 mode */
2914    
2915     /* The second part of a range can be a single-character escape, but
2916     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2917     in such circumstances. */
2918    
2919 nigel 93 if (!inescq && d == '\\')
2920 nigel 77 {
2921 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2922     if (*errorcodeptr != 0) goto FAILED;
2923 nigel 77
2924 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2925     special means the '-' was literal */
2926 nigel 77
2927     if (d < 0)
2928     {
2929     if (d == -ESC_b) d = '\b';
2930 nigel 93 else if (d == -ESC_X) d = 'X';
2931     else if (d == -ESC_R) d = 'R'; else
2932 nigel 77 {
2933 nigel 93 ptr = oldptr;
2934 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2935     }
2936     }
2937     }
2938    
2939 nigel 93 /* Check that the two values are in the correct order. Optimize
2940     one-character ranges */
2941 nigel 77
2942 nigel 93 if (d < c)
2943     {
2944     *errorcodeptr = ERR8;
2945     goto FAILED;
2946     }
2947    
2948 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2949    
2950     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2951     matching, we have to use an XCLASS with extra data items. Caseless
2952     matching for characters > 127 is available only if UCP support is
2953     available. */
2954    
2955     #ifdef SUPPORT_UTF8
2956     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2957     {
2958     class_utf8 = TRUE;
2959    
2960     /* With UCP support, we can find the other case equivalents of
2961     the relevant characters. There may be several ranges. Optimize how
2962     they fit with the basic range. */
2963    
2964     #ifdef SUPPORT_UCP
2965     if ((options & PCRE_CASELESS) != 0)
2966     {
2967 nigel 93 unsigned int occ, ocd;
2968     unsigned int cc = c;
2969     unsigned int origd = d;
2970 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2971     {
2972 ph10 180 if (occ >= (unsigned int)c &&
2973     ocd <= (unsigned int)d)
2974 ph10 176 continue; /* Skip embedded ranges */
2975 nigel 77
2976 ph10 180 if (occ < (unsigned int)c &&
2977 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2978 nigel 77 { /* if there is overlap, */
2979     c = occ; /* noting that if occ < c */
2980     continue; /* we can't have ocd > d */
2981     } /* because a subrange is */
2982 ph10 180 if (ocd > (unsigned int)d &&
2983 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2984 nigel 77 { /* the basic range. */
2985     d = ocd;
2986     continue;
2987     }
2988    
2989     if (occ == ocd)
2990     {
2991     *class_utf8data++ = XCL_SINGLE;
2992     }
2993     else
2994     {
2995     *class_utf8data++ = XCL_RANGE;
2996     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2997     }
2998     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2999     }
3000     }
3001     #endif /* SUPPORT_UCP */
3002    
3003     /* Now record the original range, possibly modified for UCP caseless
3004     overlapping ranges. */
3005    
3006     *class_utf8data++ = XCL_RANGE;
3007     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3008     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3009    
3010     /* With UCP support, we are done. Without UCP support, there is no
3011     caseless matching for UTF-8 characters > 127; we can use the bit map
3012     for the smaller ones. */
3013    
3014     #ifdef SUPPORT_UCP
3015     continue; /* With next character in the class */
3016     #else
3017     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3018    
3019     /* Adjust upper limit and fall through to set up the map */
3020    
3021     d = 127;
3022    
3023     #endif /* SUPPORT_UCP */
3024     }
3025     #endif /* SUPPORT_UTF8 */
3026    
3027     /* We use the bit map for all cases when not in UTF-8 mode; else
3028     ranges that lie entirely within 0-127 when there is UCP support; else
3029     for partial ranges without UCP support. */
3030    
3031 nigel 93 class_charcount += d - c + 1;
3032     class_lastchar = d;
3033    
3034     /* We can save a bit of time by skipping this in the pre-compile. */
3035    
3036     if (lengthptr == NULL) for (; c <= d; c++)
3037 nigel 77 {
3038     classbits[c/8] |= (1 << (c&7));
3039     if ((options & PCRE_CASELESS) != 0)
3040     {
3041     int uc = cd->fcc[c]; /* flip case */
3042     classbits[uc/8] |= (1 << (uc&7));
3043     }
3044     }
3045    
3046     continue; /* Go get the next char in the class */
3047     }
3048    
3049     /* Handle a lone single character - we can get here for a normal
3050     non-escape char, or after \ that introduces a single character or for an
3051     apparent range that isn't. */
3052    
3053     LONE_SINGLE_CHARACTER:
3054    
3055     /* Handle a character that cannot go in the bit map */
3056    
3057     #ifdef SUPPORT_UTF8
3058     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3059     {
3060     class_utf8 = TRUE;
3061     *class_utf8data++ = XCL_SINGLE;
3062     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3063    
3064     #ifdef SUPPORT_UCP
3065     if ((options & PCRE_CASELESS) != 0)
3066     {
3067 nigel 93 unsigned int othercase;
3068     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3069 nigel 77 {
3070     *class_utf8data++ = XCL_SINGLE;
3071     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3072     }
3073     }
3074     #endif /* SUPPORT_UCP */
3075    
3076     }
3077     else
3078     #endif /* SUPPORT_UTF8 */
3079    
3080     /* Handle a single-byte character */
3081     {
3082     classbits[c/8] |= (1 << (c&7));
3083     if ((options & PCRE_CASELESS) != 0)
3084     {
3085     c = cd->fcc[c]; /* flip case */
3086     classbits[c/8] |= (1 << (c&7));
3087     }
3088     class_charcount++;
3089     class_lastchar = c;
3090     }
3091     }
3092    
3093 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3094 nigel 77
3095 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3096 nigel 77
3097 nigel 93 if (c == 0) /* Missing terminating ']' */
3098     {
3099     *errorcodeptr = ERR6;
3100     goto FAILED;
3101     }
3102 ph10 208
3103 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3104     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3105     can optimize the negative case only if there were no characters >= 128
3106     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3107     single-bytes only. This is an historical hangover. Maybe one day we can
3108     tidy these opcodes to handle multi-byte characters.
3109    
3110     The optimization throws away the bit map. We turn the item into a
3111     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3112     that OP_NOT does not support multibyte characters. In the positive case, it
3113     can cause firstbyte to be set. Otherwise, there can be no first char if
3114     this item is first, whatever repeat count may follow. In the case of
3115     reqbyte, save the previous value for reinstating. */
3116    
3117     #ifdef SUPPORT_UTF8
3118     if (class_charcount == 1 &&
3119     (!utf8 ||
3120     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3121    
3122     #else
3123     if (class_charcount == 1)
3124     #endif
3125     {
3126     zeroreqbyte = reqbyte;
3127    
3128     /* The OP_NOT opcode works on one-byte characters only. */
3129    
3130     if (negate_class)
3131     {
3132     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3133     zerofirstbyte = firstbyte;
3134     *code++ = OP_NOT;
3135     *code++ = class_lastchar;
3136     break;
3137     }
3138    
3139     /* For a single, positive character, get the value into mcbuffer, and
3140     then we can handle this with the normal one-character code. */
3141    
3142     #ifdef SUPPORT_UTF8
3143     if (utf8 && class_lastchar > 127)
3144     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3145     else
3146     #endif
3147     {
3148     mcbuffer[0] = class_lastchar;
3149     mclength = 1;
3150     }
3151     goto ONE_CHAR;
3152     } /* End of 1-char optimization */
3153    
3154     /* The general case - not the one-char optimization. If this is the first
3155     thing in the branch, there can be no first char setting, whatever the
3156     repeat count. Any reqbyte setting must remain unchanged after any kind of
3157     repeat. */
3158    
3159     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3160     zerofirstbyte = firstbyte;
3161     zeroreqbyte = reqbyte;
3162    
3163     /* If there are characters with values > 255, we have to compile an
3164     extended class, with its own opcode. If there are no characters < 256,
3165 nigel 93 we can omit the bitmap in the actual compiled code. */
3166 nigel 77
3167     #ifdef SUPPORT_UTF8
3168     if (class_utf8)
3169     {
3170     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3171     *code++ = OP_XCLASS;
3172     code += LINK_SIZE;
3173     *code = negate_class? XCL_NOT : 0;
3174    
3175 nigel 93 /* If the map is required, move up the extra data to make room for it;
3176     otherwise just move the code pointer to the end of the extra data. */
3177 nigel 77
3178     if (class_charcount > 0)
3179     {
3180     *code++ |= XCL_MAP;
3181 nigel 93 memmove(code + 32, code, class_utf8data - code);
3182 nigel 77 memcpy(code, classbits, 32);
3183 nigel 93 code = class_utf8data + 32;
3184 nigel 77 }
3185 nigel 93 else code = class_utf8data;
3186 nigel 77
3187     /* Now fill in the complete length of the item */
3188    
3189     PUT(previous, 1, code - previous);
3190     break; /* End of class handling */
3191     }
3192     #endif
3193    
3194     /* If there are no characters > 255, negate the 32-byte map if necessary,
3195     and copy it into the code vector. If this is the first thing in the branch,
3196     there can be no first char setting, whatever the repeat count. Any reqbyte
3197     setting must remain unchanged after any kind of repeat. */
3198    
3199     if (negate_class)
3200     {
3201     *code++ = OP_NCLASS;
3202 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3203     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3204 nigel 77 }
3205     else
3206     {
3207     *code++ = OP_CLASS;
3208     memcpy(code, classbits, 32);
3209     }
3210     code += 32;
3211     break;
3212    
3213 nigel 93
3214     /* ===================================================================*/
3215 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3216     has been tested above. */
3217    
3218     case '{':
3219     if (!is_quantifier) goto NORMAL_CHAR;
3220     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3221     if (*errorcodeptr != 0) goto FAILED;
3222     goto REPEAT;
3223    
3224     case '*':
3225     repeat_min = 0;
3226     repeat_max = -1;
3227     goto REPEAT;
3228    
3229     case '+':
3230     repeat_min = 1;
3231     repeat_max = -1;
3232     goto REPEAT;
3233    
3234     case '?':
3235     repeat_min = 0;
3236     repeat_max = 1;
3237    
3238     REPEAT:
3239     if (previous == NULL)
3240     {
3241     *errorcodeptr = ERR9;
3242     goto FAILED;
3243     }
3244    
3245     if (repeat_min == 0)
3246     {
3247     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3248     reqbyte = zeroreqbyte; /* Ditto */
3249     }
3250    
3251     /* Remember whether this is a variable length repeat */
3252    
3253     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3254    
3255     op_type = 0; /* Default single-char op codes */
3256     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3257    
3258     /* Save start of previous item, in case we have to move it up to make space
3259     for an inserted OP_ONCE for the additional '+' extension. */
3260    
3261     tempcode = previous;
3262    
3263     /* If the next character is '+', we have a possessive quantifier. This
3264     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3265     If the next character is '?' this is a minimizing repeat, by default,
3266     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3267     repeat type to the non-default. */
3268    
3269     if (ptr[1] == '+')
3270     {
3271     repeat_type = 0; /* Force greedy */
3272     possessive_quantifier = TRUE;
3273     ptr++;
3274     }
3275     else if (ptr[1] == '?')
3276     {
3277     repeat_type = greedy_non_default;
3278     ptr++;
3279     }
3280     else repeat_type = greedy_default;
3281    
3282     /* If previous was a character match, abolish the item and generate a
2283     repeat item instead. If a char item has a minimum of more than one, ensure
3284     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3285     the first thing in a branch because the x will have gone into firstbyte
3286     instead. */
3287    
3288     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3289     {
3290     /* Deal with UTF-8 characters that take up more than one byte. It's
3291     easier to write this out separately than try to macrify it. Use c to
3292     hold the length of the character in bytes, plus 0x80 to flag that it's a
3293     length rather than a small character. */
3294    
3295     #ifdef SUPPORT_UTF8
3296     if (utf8 && (code[-1] & 0x80) != 0)
3297     {
3298     uschar *lastchar = code - 1;
3299     while((*lastchar & 0xc0) == 0x80) lastchar--;
3300     c = code - lastchar; /* Length of UTF-8 character */
3301     memcpy(utf8_char, lastchar, c); /* Save the char */
3302     c |= 0x80; /* Flag c as a length */
3303     }
3304     else
3305     #endif
3306    
3307     /* Handle the case of a single byte - either with no UTF8 support, or
3308     with UTF-8 disabled, or for a UTF-8 character < 128. */
3309    
3310     {
3311     c = code[-1];
3312     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3313     }
3314    
3315 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3316     the line is something that cannot possibly match this character. If so,
3317     automatically possessifying this item gains some performance in the case
3318     where the match fails. */
3319    
3320     if (!possessive_quantifier &&
3321     repeat_max < 0 &&
3322     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3323     options, cd))
3324     {
3325     repeat_type = 0; /* Force greedy */
3326     possessive_quantifier = TRUE;
3327     }
3328    
3329 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3330     }
3331    
3332     /* If previous was a single negated character ([^a] or similar), we use
3333     one of the special opcodes, replacing it. The code is shared with single-
3334     character repeats by setting op_type to add a suitable offset into
3335 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3336     currently used only for single-byte chars. */
3337 nigel 77
3338     else if (*previous == OP_NOT)
3339     {
3340     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3341     c = previous[1];
3342 nigel 93 if (!possessive_quantifier &&
3343     repeat_max < 0 &&
3344     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3345     {
3346     repeat_type = 0; /* Force greedy */
3347     possessive_quantifier = TRUE;
3348     }
3349 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3350     }
3351    
3352     /* If previous was a character type match (\d or similar), abolish it and
3353     create a suitable repeat item. The code is shared with single-character
3354     repeats by setting op_type to add a suitable offset into repeat_type. Note
3355     that the Unicode property types will be present only when SUPPORT_UCP is
3356     defined, but we don't wrap the little bits of code here because it just
3357     makes it horribly messy. */
3358    
3359     else if (*previous < OP_EODN)
3360     {
3361     uschar *oldcode;
3362 nigel 87 int prop_type, prop_value;
3363 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3364     c = *previous;
3365    
3366 nigel 93 if (!possessive_quantifier &&
3367     repeat_max < 0 &&
3368     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3369     {
3370     repeat_type = 0; /* Force greedy */
3371     possessive_quantifier = TRUE;
3372     }
3373    
3374 nigel 77 OUTPUT_SINGLE_REPEAT:
3375 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3376     {
3377     prop_type = previous[1];
3378     prop_value = previous[2];
3379     }
3380     else prop_type = prop_value = -1;
3381 nigel 77
3382     oldcode = code;
3383     code = previous; /* Usually overwrite previous item */
3384    
3385     /* If the maximum is zero then the minimum must also be zero; Perl allows
3386     this case, so we do too - by simply omitting the item altogether. */
3387    
3388     if (repeat_max == 0) goto END_REPEAT;
3389    
3390     /* All real repeats make it impossible to handle partial matching (maybe
3391     one day we will be able to remove this restriction). */
3392    
3393     if (repeat_max != 1) cd->nopartial = TRUE;
3394    
3395     /* Combine the op_type with the repeat_type */
3396    
3397     repeat_type += op_type;
3398    
3399     /* A minimum of zero is handled either as the special case * or ?, or as
3400     an UPTO, with the maximum given. */
3401    
3402     if (repeat_min == 0)
3403     {
3404     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3405     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3406     else
3407     {
3408     *code++ = OP_UPTO + repeat_type;
3409     PUT2INC(code, 0, repeat_max);
3410     }
3411     }
3412    
3413     /* A repeat minimum of 1 is optimized into some special cases. If the
3414 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3415 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3416     one less than the maximum. */
3417    
3418     else if (repeat_min == 1)
3419     {
3420     if (repeat_max == -1)
3421     *code++ = OP_PLUS + repeat_type;
3422     else
3423     {
3424     code = oldcode; /* leave previous item in place */
3425     if (repeat_max == 1) goto END_REPEAT;
3426     *code++ = OP_UPTO + repeat_type;
3427     PUT2INC(code, 0, repeat_max - 1);
3428     }
3429     }
3430    
3431     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3432     handled as an EXACT followed by an UPTO. */
3433    
3434     else
3435     {
3436     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3437     PUT2INC(code, 0, repeat_min);
3438    
3439     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3440     we have to insert the character for the previous code. For a repeated
3441 nigel 87 Unicode property match, there are two extra bytes that define the
3442 nigel 77 required property. In UTF-8 mode, long characters have their length in
3443     c, with the 0x80 bit as a flag. */
3444    
3445     if (repeat_max < 0)
3446     {
3447     #ifdef SUPPORT_UTF8
3448     if (utf8 && c >= 128)
3449     {
3450     memcpy(code, utf8_char, c & 7);
3451     code += c & 7;
3452     }
3453     else
3454     #endif
3455     {
3456     *code++ = c;
3457 nigel 87 if (prop_type >= 0)
3458     {
3459     *code++ = prop_type;
3460     *code++ = prop_value;
3461     }
3462 nigel 77 }
3463     *code++ = OP_STAR + repeat_type;
3464     }
3465    
3466     /* Else insert an UPTO if the max is greater than the min, again
3467 nigel 93 preceded by the character, for the previously inserted code. If the
3468     UPTO is just for 1 instance, we can use QUERY instead. */
3469 nigel 77
3470     else if (repeat_max != repeat_min)
3471     {
3472     #ifdef SUPPORT_UTF8
3473     if (utf8 && c >= 128)
3474     {
3475     memcpy(code, utf8_char, c & 7);
3476     code += c & 7;
3477     }
3478     else
3479     #endif
3480     *code++ = c;
3481 nigel 87 if (prop_type >= 0)
3482     {
3483     *code++ = prop_type;
3484     *code++ = prop_value;
3485     }
3486 nigel 77 repeat_max -= repeat_min;
3487 nigel 93
3488     if (repeat_max == 1)
3489     {
3490     *code++ = OP_QUERY + repeat_type;
3491     }
3492     else
3493     {
3494     *code++ = OP_UPTO + repeat_type;
3495     PUT2INC(code, 0, repeat_max);
3496     }
3497 nigel 77 }
3498     }
3499    
3500     /* The character or character type itself comes last in all cases. */
3501    
3502     #ifdef SUPPORT_UTF8
3503     if (utf8 && c >= 128)
3504     {
3505     memcpy(code, utf8_char, c & 7);
3506     code += c & 7;
3507     }
3508     else
3509     #endif
3510     *code++ = c;
3511    
3512 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3513     define the required property. */
3514 nigel 77
3515     #ifdef SUPPORT_UCP
3516 nigel 87 if (prop_type >= 0)
3517     {
3518     *code++ = prop_type;
3519     *code++ = prop_value;
3520     }
3521 nigel 77 #endif
3522     }
3523    
3524     /* If previous was a character class or a back reference, we put the repeat
3525     stuff after it, but just skip the item if the repeat was {0,0}. */
3526    
3527     else if (*previous == OP_CLASS ||
3528     *previous == OP_NCLASS ||
3529     #ifdef SUPPORT_UTF8
3530     *previous == OP_XCLASS ||
3531     #endif
3532     *previous == OP_REF)
3533     {
3534     if (repeat_max == 0)
3535     {
3536     code = previous;
3537     goto END_REPEAT;
3538     }
3539    
3540     /* All real repeats make it impossible to handle partial matching (maybe
3541     one day we will be able to remove this restriction). */
3542    
3543     if (repeat_max != 1) cd->nopartial = TRUE;
3544    
3545     if (repeat_min == 0 && repeat_max == -1)
3546     *code++ = OP_CRSTAR + repeat_type;
3547     else if (repeat_min == 1 && repeat_max == -1)
3548     *code++ = OP_CRPLUS + repeat_type;
3549     else if (repeat_min == 0 && repeat_max == 1)
3550     *code++ = OP_CRQUERY + repeat_type;
3551     else
3552     {
3553     *code++ = OP_CRRANGE + repeat_type;
3554     PUT2INC(code, 0, repeat_min);
3555     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3556     PUT2INC(code, 0, repeat_max);
3557     }
3558     }
3559    
3560     /* If previous was a bracket group, we may have to replicate it in certain
3561     cases. */
3562    
3563 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3564     *previous == OP_ONCE || *previous == OP_COND)
3565 nigel 77 {
3566     register int i;
3567     int ketoffset = 0;
3568     int len = code - previous;
3569     uschar *bralink = NULL;
3570    
3571 nigel 93 /* Repeating a DEFINE group is pointless */
3572    
3573     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3574     {
3575     *errorcodeptr = ERR55;
3576     goto FAILED;
3577     }
3578    
3579 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3580     by scanning through from the start, and compute the offset back to it
3581     from the current code pointer. There may be an OP_OPT setting following
3582     the final KET, so we can't find the end just by going back from the code
3583     pointer. */
3584    
3585     if (repeat_max == -1)
3586     {
3587     register uschar *ket = previous;
3588     do ket += GET(ket, 1); while (*ket != OP_KET);
3589     ketoffset = code - ket;
3590     }
3591    
3592     /* The case of a zero minimum is special because of the need to stick
3593     OP_BRAZERO in front of it, and because the group appears once in the
3594     data, whereas in other cases it appears the minimum number of times. For
3595     this reason, it is simplest to treat this case separately, as otherwise
3596     the code gets far too messy. There are several special subcases when the
3597     minimum is zero. */
3598    
3599     if (repeat_min == 0)
3600     {
3601     /* If the maximum is also zero, we just omit the group from the output
3602     altogether. */
3603    
3604     if (repeat_max == 0)
3605     {
3606     code = previous;
3607     goto END_REPEAT;
3608     }
3609    
3610     /* If the maximum is 1 or unlimited, we just have to stick in the
3611     BRAZERO and do no more at this point. However, we do need to adjust
3612     any OP_RECURSE calls inside the group that refer to the group itself or
3613 nigel 93 any internal or forward referenced group, because the offset is from
3614     the start of the whole regex. Temporarily terminate the pattern while
3615     doing this. */
3616 nigel 77
3617     if (repeat_max <= 1)
3618     {
3619     *code = OP_END;
3620 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3621 nigel 77 memmove(previous+1, previous, len);
3622     code++;
3623     *previous++ = OP_BRAZERO + repeat_type;
3624     }
3625    
3626     /* If the maximum is greater than 1 and limited, we have to replicate
3627     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3628     The first one has to be handled carefully because it's the original
3629     copy, which has to be moved up. The remainder can be handled by code
3630     that is common with the non-zero minimum case below. We have to
3631     adjust the value of repeat_max, since one less copy is required. Once
3632     again, we may have to adjust any OP_RECURSE calls inside the group. */
3633    
3634     else
3635     {
3636     int offset;
3637     *code = OP_END;
3638 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3639 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3640     code += 2 + LINK_SIZE;
3641     *previous++ = OP_BRAZERO + repeat_type;
3642     *previous++ = OP_BRA;
3643    
3644     /* We chain together the bracket offset fields that have to be
3645     filled in later when the ends of the brackets are reached. */
3646    
3647     offset = (bralink == NULL)? 0 : previous - bralink;
3648     bralink = previous;
3649     PUTINC(previous, 0, offset);
3650     }
3651    
3652     repeat_max--;
3653     }
3654    
3655     /* If the minimum is greater than zero, replicate the group as many
3656     times as necessary, and adjust the maximum to the number of subsequent
3657     copies that we need. If we set a first char from the group, and didn't
3658 nigel 93 set a required char, copy the latter from the former. If there are any
3659     forward reference subroutine calls in the group, there will be entries on
3660     the workspace list; replicate these with an appropriate increment. */
3661 nigel 77
3662     else
3663     {
3664     if (repeat_min > 1)
3665     {
3666 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3667 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3668     potential integer overflow. */
3669 nigel 93
3670     if (lengthptr != NULL)
3671 ph10 202 {
3672     int delta = (repeat_min - 1)*length_prevgroup;
3673     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3674     (double)INT_MAX ||
3675     OFLOW_MAX - *lengthptr < delta)
3676     {
3677     *errorcodeptr = ERR20;
3678     goto FAILED;
3679     }
3680     *lengthptr += delta;
3681     }
3682 nigel 93
3683     /* This is compiling for real */
3684    
3685     else
3686 nigel 77 {
3687 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3688     for (i = 1; i < repeat_min; i++)
3689     {
3690     uschar *hc;
3691     uschar *this_hwm = cd->hwm;
3692     memcpy(code, previous, len);
3693     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3694     {
3695     PUT(cd->hwm, 0, GET(hc, 0) + len);
3696     cd->hwm += LINK_SIZE;
3697     }
3698     save_hwm = this_hwm;
3699     code += len;
3700     }
3701 nigel 77 }
3702     }
3703 nigel 93
3704 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3705     }
3706    
3707     /* This code is common to both the zero and non-zero minimum cases. If
3708     the maximum is limited, it replicates the group in a nested fashion,
3709     remembering the bracket starts on a stack. In the case of a zero minimum,
3710     the first one was set up above. In all cases the repeat_max now specifies
3711 nigel 93 the number of additional copies needed. Again, we must remember to
3712     replicate entries on the forward reference list. */
3713 nigel 77
3714     if (repeat_max >= 0)
3715     {
3716 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3717     just adjust the length as if we had. For each repetition we must add 1
3718     to the length for BRAZERO and for all but the last repetition we must
3719 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3720     paranoid checks to avoid integer overflow. */
3721 nigel 93
3722     if (lengthptr != NULL && repeat_max > 0)
3723 ph10 202 {
3724     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3725     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3726     if ((double)repeat_max *
3727     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3728     > (double)INT_MAX ||
3729     OFLOW_MAX - *lengthptr < delta)
3730     {
3731     *errorcodeptr = ERR20;
3732     goto FAILED;
3733     }
3734     *lengthptr += delta;
3735     }
3736 nigel 93
3737     /* This is compiling for real */
3738    
3739     else for (i = repeat_max - 1; i >= 0; i--)
3740 nigel 77 {
3741 nigel 93 uschar *hc;
3742     uschar *this_hwm = cd->hwm;
3743    
3744 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3745    
3746     /* All but the final copy start a new nesting, maintaining the
3747     chain of brackets outstanding. */
3748    
3749     if (i != 0)
3750     {
3751     int offset;
3752     *code++ = OP_BRA;
3753     offset = (bralink == NULL)? 0 : code - bralink;
3754     bralink = code;
3755     PUTINC(code, 0, offset);
3756     }
3757    
3758     memcpy(code, previous, len);
3759 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3760     {
3761     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3762     cd->hwm += LINK_SIZE;
3763     }
3764     save_hwm = this_hwm;
3765 nigel 77 code += len;
3766     }
3767    
3768     /* Now chain through the pending brackets, and fill in their length
3769     fields (which are holding the chain links pro tem). */
3770    
3771     while (bralink != NULL)
3772     {
3773     int oldlinkoffset;
3774     int offset = code - bralink + 1;
3775     uschar *bra = code - offset;
3776     oldlinkoffset = GET(bra, 1);
3777     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3778     *code++ = OP_KET;
3779     PUTINC(code, 0, offset);
3780     PUT(bra, 1, offset);
3781     }
3782     }
3783    
3784     /* If the maximum is unlimited, set a repeater in the final copy. We
3785     can't just offset backwards from the current code point, because we
3786     don't know if there's been an options resetting after the ket. The
3787 nigel 93 correct offset was computed above.
3788 nigel 77
3789 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3790     this group is a non-atomic one that could match an empty string. If so,
3791     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3792     that runtime checking can be done. [This check is also applied to
3793     atomic groups at runtime, but in a different way.] */
3794    
3795     else
3796     {
3797     uschar *ketcode = code - ketoffset;
3798     uschar *bracode = ketcode - GET(ketcode, 1);
3799     *ketcode = OP_KETRMAX + repeat_type;
3800     if (lengthptr == NULL && *bracode != OP_ONCE)
3801     {
3802     uschar *scode = bracode;
3803     do
3804     {
3805     if (could_be_empty_branch(scode, ketcode, utf8))
3806     {
3807     *bracode += OP_SBRA - OP_BRA;
3808     break;
3809     }
3810     scode += GET(scode, 1);
3811     }
3812     while (*scode == OP_ALT);
3813     }
3814     }
3815 nigel 77 }
3816    
3817     /* Else there's some kind of shambles */
3818    
3819     else
3820     {
3821     *errorcodeptr = ERR11;
3822     goto FAILED;
3823     }
3824    
3825 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3826     tests above succeeded, possessive_quantifier is TRUE. For some of the
3827     simpler opcodes, there is a special alternative opcode for this. For
3828     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3829     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3830     but the special opcodes can optimize it a bit. The repeated item starts at
3831     tempcode, not at previous, which might be the first part of a string whose
3832     (former) last char we repeated.
3833 nigel 77
3834 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3835     an 'upto' may follow. We skip over an 'exact' item, and then test the
3836     length of what remains before proceeding. */
3837    
3838 nigel 77 if (possessive_quantifier)
3839     {
3840 nigel 93 int len;
3841     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3842     *tempcode == OP_NOTEXACT)
3843     tempcode += _pcre_OP_lengths[*tempcode];
3844     len = code - tempcode;
3845     if (len > 0) switch (*tempcode)
3846     {
3847     case OP_STAR: *tempcode = OP_POSSTAR; break;
3848     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3849     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3850     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3851    
3852     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3853     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3854     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3855     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3856    
3857     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3858     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3859     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3860     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3861    
3862     default:
3863     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3864     code += 1 + LINK_SIZE;
3865     len += 1 + LINK_SIZE;
3866     tempcode[0] = OP_ONCE;
3867     *code++ = OP_KET;
3868     PUTINC(code, 0, len);
3869     PUT(tempcode, 1, len);
3870     break;
3871     }
3872 nigel 77 }
3873    
3874     /* In all cases we no longer have a previous item. We also set the
3875     "follows varying string" flag for subsequently encountered reqbytes if
3876     it isn't already set and we have just passed a varying length item. */
3877    
3878     END_REPEAT:
3879     previous = NULL;
3880     cd->req_varyopt |= reqvary;
3881     break;
3882    
3883    
3884 nigel 93 /* ===================================================================*/
3885     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3886     lookbehind or option setting or condition or all the other extended
3887 ph10 210 parenthesis forms. */
3888 nigel 77
3889     case '(':
3890     newoptions = options;
3891     skipbytes = 0;
3892 nigel 93 bravalue = OP_CBRA;
3893     save_hwm = cd->hwm;
3894 ph10 180 reset_bracount = FALSE;
3895 ph10 211
3896 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3897 ph10 211
3898 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3899     {
3900 ph10 211 int i, namelen;
3901 ph10 210 const uschar *name = ++ptr;
3902     previous = NULL;
3903     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3904     if (*ptr == ':')
3905     {
3906     *errorcodeptr = ERR59; /* Not supported */
3907 ph10 211 goto FAILED;
3908     }
3909 ph10 210 if (*ptr != ')')
3910     {
3911     *errorcodeptr = ERR60;
3912     goto FAILED;
3913     }
3914 ph10 211 namelen = ptr - name;
3915 ph10 210 for (i = 0; i < verbcount; i++)
3916 ph10 211 {
3917 ph10 210 if (namelen == verbs[i].len &&
3918     strncmp((char *)name, verbs[i].name, namelen) == 0)
3919     {
3920     *code = verbs[i].op;
3921     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3922     break;
3923 ph10 211 }
3924     }
3925     if (i < verbcount) continue;
3926 ph10 210 *errorcodeptr = ERR60;
3927 ph10 211 goto FAILED;
3928     }
3929    
3930 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
3931     appearance of any of them means that this is not a capturing group. */
3932 nigel 77
3933 ph10 210 else if (*ptr == '?')
3934 nigel 77 {
3935 nigel 93 int i, set, unset, namelen;
3936 nigel 77 int *optset;
3937 nigel 93 const uschar *name;
3938     uschar *slot;
3939 nigel 77
3940     switch (*(++ptr))
3941     {
3942     case '#': /* Comment; skip to ket */
3943     ptr++;
3944 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3945     if (*ptr == 0)
3946     {
3947     *errorcodeptr = ERR18;
3948     goto FAILED;
3949     }
3950 nigel 77 continue;
3951    
3952 nigel 93
3953     /* ------------------------------------------------------------ */
3954 ph10 175 case '|': /* Reset capture count for each branch */
3955     reset_bracount = TRUE;
3956 ph10 180 /* Fall through */
3957 ph10 175
3958     /* ------------------------------------------------------------ */
3959 nigel 93 case ':': /* Non-capturing bracket */
3960 nigel 77 bravalue = OP_BRA;
3961     ptr++;
3962     break;
3963    
3964 nigel 93
3965     /* ------------------------------------------------------------ */
3966 nigel 77 case '(':
3967     bravalue = OP_COND; /* Conditional group */
3968    
3969 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3970     group), a name (referring to a named group), or 'R', referring to
3971     recursion. R<digits> and R&name are also permitted for recursion tests.
3972 nigel 77
3973 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3974     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3975    
3976     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3977     be the recursive thing or the name 'R' (and similarly for 'R' followed
3978     by digits), and (b) a number could be a name that consists of digits.
3979     In both cases, we look for a name first; if not found, we try the other
3980     cases. */
3981    
3982     /* For conditions that are assertions, check the syntax, and then exit
3983     the switch. This will take control down to where bracketed groups,
3984     including assertions, are processed. */
3985    
3986     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3987     break;
3988    
3989     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3990     below), and all need to skip 3 bytes at the start of the group. */
3991    
3992     code[1+LINK_SIZE] = OP_CREF;
3993     skipbytes = 3;
3994 ph10 172 refsign = -1;
3995 nigel 93
3996     /* Check for a test for recursion in a named group. */
3997    
3998     if (ptr[1] == 'R' && ptr[2] == '&')
3999 nigel 77 {
4000 nigel 93 terminator = -1;
4001     ptr += 2;
4002     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4003     }
4004 nigel 91
4005 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4006     syntax (?(<name>) or (?('name') */
4007 nigel 91
4008 nigel 93 else if (ptr[1] == '<')
4009     {
4010     terminator = '>';
4011     ptr++;
4012     }
4013     else if (ptr[1] == '\'')
4014     {
4015     terminator = '\'';
4016     ptr++;
4017     }
4018 ph10 172 else
4019 ph10 167 {
4020     terminator = 0;
4021 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4022     }
4023 nigel 77
4024 nigel 93 /* We now expect to read a name; anything else is an error */
4025 nigel 77
4026 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4027     {
4028     ptr += 1; /* To get the right offset */
4029     *errorcodeptr = ERR28;
4030     goto FAILED;
4031     }
4032    
4033     /* Read the name, but also get it as a number if it's all digits */
4034    
4035     recno = 0;
4036     name = ++ptr;
4037     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4038     {
4039     if (recno >= 0)
4040     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4041     recno * 10 + *ptr - '0' : -1;
4042 nigel 91 ptr++;
4043 nigel 93 }
4044     namelen = ptr - name;
4045 nigel 91
4046 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4047     {
4048     ptr--; /* Error offset */
4049     *errorcodeptr = ERR26;
4050     goto FAILED;
4051     }
4052 nigel 91
4053 nigel 93 /* Do no further checking in the pre-compile phase. */
4054 nigel 91
4055 nigel 93 if (lengthptr != NULL) break;
4056 nigel 91
4057 nigel 93 /* In the real compile we do the work of looking for the actual
4058 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4059     be digits, in which case recno will be set. */
4060 ph10 172
4061 ph10 167 if (refsign > 0)
4062     {
4063     if (recno <= 0)
4064     {
4065     *errorcodeptr = ERR58;
4066     goto FAILED;
4067 ph10 172 }
4068 ph10 167 if (refsign == '-')
4069     {
4070 ph10 172 recno = cd->bracount - recno + 1;
4071 ph10 167 if (recno <= 0)
4072     {
4073     *errorcodeptr = ERR15;
4074     goto FAILED;
4075 ph10 172 }
4076 ph10 167 }
4077 ph10 172 else recno += cd->bracount;
4078 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
4079     break;
4080 ph10 172 }
4081 nigel 91
4082 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
4083     name. */
4084 ph10 172
4085 nigel 93 slot = cd->name_table;
4086     for (i = 0; i < cd->names_found; i++)
4087     {
4088     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4089     slot += cd->name_entry_size;
4090     }
4091 nigel 91
4092 nigel 93 /* Found a previous named subpattern */
4093 nigel 91
4094 nigel 93 if (i < cd->names_found)
4095     {
4096     recno = GET2(slot, 0);
4097     PUT2(code, 2+LINK_SIZE, recno);
4098     }
4099 nigel 91
4100 nigel 93 /* Search the pattern for a forward reference */
4101 nigel 91
4102 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4103     (options & PCRE_EXTENDED) != 0)) > 0)
4104     {
4105     PUT2(code, 2+LINK_SIZE, i);
4106     }
4107 nigel 91
4108 nigel 93 /* If terminator == 0 it means that the name followed directly after
4109     the opening parenthesis [e.g. (?(abc)...] and in this case there are
4110     some further alternatives to try. For the cases where terminator != 0
4111     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4112     now checked all the possibilities, so give an error. */
4113 nigel 91
4114 nigel 93 else if (terminator != 0)
4115     {
4116     *errorcodeptr = ERR15;
4117     goto FAILED;
4118     }
4119