/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 216 - (hide annotations) (download)
Wed Aug 15 14:35:57 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 187780 byte(s)
Fixed compile-time loop for patterns like (?:[\PPa*]*){8,} (extended class 
inside group with unlimited repeat).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: a is the bitmap (an
array of bytes) and b is the bit number. Each argument is parenthesized in the
expansion, and so is the expansion itself, so that an expression argument such
as SETBIT(map, c + 1) selects the intended byte and bit. Note that b is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 210 /* Table of special "verbs" like (*PRUNE) */
144    
145     typedef struct verbitem {
146     const char *name;
147     int len;
148     int op;
149 ph10 211 } verbitem;
150 ph10 210
151     static verbitem verbs[] = {
152     { "ACCEPT", 6, OP_ACCEPT },
153     { "COMMIT", 6, OP_COMMIT },
154     { "F", 1, OP_FAIL },
155 ph10 211 { "FAIL", 4, OP_FAIL },
156 ph10 210 { "PRUNE", 5, OP_PRUNE },
157     { "SKIP", 4, OP_SKIP },
158     { "THEN", 4, OP_THEN }
159     };
160    
161     static int verbcount = sizeof(verbs)/sizeof(verbitem);
162    
163    
/* Names of the POSIX character classes. A companion table holds the length of
each name, in the same order, followed by a terminating zero length. The first
three names must be alpha, lower, upper, because this is assumed for handling
case independence. */

static const char *const posix_names[] = {
  "alpha",
  "lower",
  "upper",
  "alnum",
  "ascii",
  "blank",
  "cntrl",
  "digit",
  "graph",
  "print",
  "punct",
  "space",
  "word",
  "xdigit"
};
172    
173     static const uschar posix_name_lengths[] = {
174     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
175    
176 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
177     base map, with an optional addition or removal of another map. Then, for some
178     classes, there is some additional tweaking: for [:blank:] the vertical space
179     characters are removed, and for [:alpha:] and [:alnum:] the underscore
180     character is removed. The triples in the table consist of the base map offset,
181     second map offset or -1 if no second map, and a non-negative value for map
182     addition or a negative value for map subtraction (if there are two maps). The
183     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
184     remove vertical space characters, 2 => remove underscore. */
185 nigel 77
186     static const int posix_class_maps[] = {
187 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
188     cbit_lower, -1, 0, /* lower */
189     cbit_upper, -1, 0, /* upper */
190     cbit_word, -1, 2, /* alnum - word without underscore */
191     cbit_print, cbit_cntrl, 0, /* ascii */
192     cbit_space, -1, 1, /* blank - a GNU extension */
193     cbit_cntrl, -1, 0, /* cntrl */
194     cbit_digit, -1, 0, /* digit */
195     cbit_graph, -1, 0, /* graph */
196     cbit_print, -1, 0, /* print */
197     cbit_punct, -1, 0, /* punct */
198     cbit_space, -1, 0, /* space */
199     cbit_word, -1, 0, /* word - a Perl extension */
200     cbit_xdigit,-1, 0 /* xdigit */
201 nigel 77 };
202    
203    
/* Two-level stringification, so that a macro argument is expanded before it
is turned into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. These
are "char *" because they are passed to the outside world. Do not ever re-use
any error number, because they are documented. Always add a new error instead.
Messages marked DEAD below are no longer used. The array of pointers is itself
const, as well as the strings, because the table is never modified. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression is too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
  "(*VERB) with an argument is not supported",
  /* 60 */
  "(*VERB) not recognized",
  "number is too big"
};
288    
289    
290     /* Table to identify digits and hex digits. This is used when compiling
291     patterns. Note that the tables in chartables are dependent on the locale, and
292     may mark arbitrary characters as digits - but the PCRE compiling code expects
293     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
294     a private table here. It costs 256 bytes, but it is a lot faster than doing
295     character value tests (at least in some simple cases I timed), and in some
296     applications one wants PCRE to compile efficiently as well as match
297     efficiently.
298    
299     For convenience, we use the same bit definitions as in chartables:
300    
301     0x04 decimal digit
302     0x08 hexadecimal digit
303    
304     Then we can use ctype_digit and ctype_xdigit in the code. */
305    
#ifndef EBCDIC

/* The "normal" case, for ASCII systems. Sixteen entries per row; the comment
gives the hexadecimal code of the first entry in the row. Decimal digits have
both bits set (0x0c); the letters a-f and A-F have only the hexadecimal digit
bit (0x08). */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20: space - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 30: 0 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40: @ - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50: P - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60: ` - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70: p - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00  /* F0 */
  };

#else

/* The "abnormal" case, for EBCDIC systems. Sixteen entries per row; the
comment gives the hexadecimal code of the first entry in the row. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 80: a - f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C0: A - F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00  /* F0: 0 - 9 */
  };

/* Partial duplicate of the character type table, for EBCDIC systems only. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 80 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 90 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* A0 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* B0 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* C0 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* D0 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* E0 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00  /* F0 */
  };

#endif
412    
413    
414     /* Definition to allow mutual recursion */
415    
416     static BOOL
417 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
418 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
419 nigel 77
420    
421    
422     /*************************************************
423     * Handle escapes *
424     *************************************************/
425    
426     /* This function is called when a \ has been encountered. It either returns a
427     positive value for a simple escape such as \n, or a negative value which
428 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
429     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
430     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
431     ptr is pointing at the \. On exit, it is on the final character of the escape
432     sequence.
433 nigel 77
434     Arguments:
435     ptrptr points to the pattern position pointer
436     errorcodeptr points to the errorcode variable
437     bracount number of previous extracting brackets
438     options the options bits
439     isclass TRUE if inside a character class
440    
441     Returns: zero or positive => a data character
442     negative => a special escape sequence
443 ph10 213 on error, errorcodeptr is set
444 nigel 77 */
445    
446     static int
447     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
448     int options, BOOL isclass)
449     {
450 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
451     const uschar *ptr = *ptrptr + 1;
452 nigel 77 int c, i;
453    
454 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
455     ptr--; /* Set pointer back to the last byte */
456    
457 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
458    
459     if (c == 0) *errorcodeptr = ERR1;
460    
461     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
462     a table. A non-zero result is something that can be returned immediately.
463     Otherwise further processing may be required. */
464    
465 ph10 97 #ifndef EBCDIC /* ASCII coding */
466 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
467     else if ((i = escapes[c - '0']) != 0) c = i;
468    
469 ph10 97 #else /* EBCDIC coding */
470 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
471     else if ((i = escapes[c - 0x48]) != 0) c = i;
472     #endif
473    
474     /* Escapes that need further processing, or are illegal. */
475    
476     else
477     {
478     const uschar *oldptr;
479 nigel 93 BOOL braced, negated;
480    
481 nigel 77 switch (c)
482     {
483     /* A number of Perl escapes are not handled by PCRE. We give an explicit
484     error. */
485    
486     case 'l':
487     case 'L':
488     case 'N':
489     case 'u':
490     case 'U':
491     *errorcodeptr = ERR37;
492     break;
493    
494 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
495     is an absolute backreference. If negative, it is a relative backreference.
496 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
497     reference to a named group. This is part of Perl's movement towards a
498     unified syntax for back references. As this is synonymous with \k{name}, we
499 ph10 171 fudge it up by pretending it really was \k. */
500 nigel 93
501     case 'g':
502     if (ptr[1] == '{')
503     {
504 ph10 171 const uschar *p;
505     for (p = ptr+2; *p != 0 && *p != '}'; p++)
506     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
507 ph10 172 if (*p != 0 && *p != '}')
508 ph10 171 {
509     c = -ESC_k;
510     break;
511 ph10 172 }
512 nigel 93 braced = TRUE;
513     ptr++;
514     }
515     else braced = FALSE;
516    
517     if (ptr[1] == '-')
518     {
519     negated = TRUE;
520     ptr++;
521     }
522     else negated = FALSE;
523    
524     c = 0;
525     while ((digitab[ptr[1]] & ctype_digit) != 0)
526     c = c * 10 + *(++ptr) - '0';
527 ph10 213
528     if (c < 0)
529     {
530     *errorcodeptr = ERR61;
531     break;
532     }
533 nigel 93
534     if (c == 0 || (braced && *(++ptr) != '}'))
535     {
536     *errorcodeptr = ERR57;
537 ph10 213 break;
538 nigel 93 }
539    
540     if (negated)
541     {
542     if (c > bracount)
543     {
544     *errorcodeptr = ERR15;
545 ph10 213 break;
546 nigel 93 }
547     c = bracount - (c - 1);
548     }
549    
550     c = -(ESC_REF + c);
551     break;
552    
553 nigel 77 /* The handling of escape sequences consisting of a string of digits
554     starting with one that is not zero is not straightforward. By experiment,
555     the way Perl works seems to be as follows:
556    
557     Outside a character class, the digits are read as a decimal number. If the
558     number is less than 10, or if there are that many previous extracting
559     left brackets, then it is a back reference. Otherwise, up to three octal
560     digits are read to form an escaped byte. Thus \123 is likely to be octal
561     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
562     value is greater than 377, the least significant 8 bits are taken. Inside a
563     character class, \ followed by a digit is always an octal number. */
564    
565     case '1': case '2': case '3': case '4': case '5':
566     case '6': case '7': case '8': case '9':
567    
568     if (!isclass)
569     {
570     oldptr = ptr;
571     c -= '0';
572     while ((digitab[ptr[1]] & ctype_digit) != 0)
573     c = c * 10 + *(++ptr) - '0';
574 ph10 213 if (c < 0)
575     {
576     *errorcodeptr = ERR61;
577     break;
578     }
579 nigel 77 if (c < 10 || c <= bracount)
580     {
581     c = -(ESC_REF + c);
582     break;
583     }
584     ptr = oldptr; /* Put the pointer back and fall through */
585     }
586    
587     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
588     generates a binary zero byte and treats the digit as a following literal.
589     Thus we have to pull back the pointer by one. */
590    
591     if ((c = *ptr) >= '8')
592     {
593     ptr--;
594     c = 0;
595     break;
596     }
597    
598     /* \0 always starts an octal number, but we may drop through to here with a
599 nigel 91 larger first octal digit. The original code used just to take the least
600     significant 8 bits of octal numbers (I think this is what early Perls used
601     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
602     than 3 octal digits. */
603 nigel 77
604     case '0':
605     c -= '0';
606     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
607     c = c * 8 + *(++ptr) - '0';
608 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
609 nigel 77 break;
610    
611 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
612     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
613     treated as a data character. */
614 nigel 77
615     case 'x':
616 nigel 87 if (ptr[1] == '{')
617 nigel 77 {
618     const uschar *pt = ptr + 2;
619 nigel 87 int count = 0;
620    
621 nigel 77 c = 0;
622     while ((digitab[*pt] & ctype_xdigit) != 0)
623     {
624 nigel 87 register int cc = *pt++;
625     if (c == 0 && cc == '0') continue; /* Leading zeroes */
626 nigel 77 count++;
627 nigel 87
628 ph10 97 #ifndef EBCDIC /* ASCII coding */
629 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
630 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
631 ph10 97 #else /* EBCDIC coding */
632 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
633 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
634 nigel 77 #endif
635     }
636 nigel 87
637 nigel 77 if (*pt == '}')
638     {
639 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
640 nigel 77 ptr = pt;
641     break;
642     }
643 nigel 87
644 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
645     recognize this construct; fall through to the normal \x handling. */
646     }
647    
648 nigel 87 /* Read just a single-byte hex-defined char */
649 nigel 77
650     c = 0;
651     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
652     {
653     int cc; /* Some compilers don't like ++ */
654     cc = *(++ptr); /* in initializers */
655 ph10 97 #ifndef EBCDIC /* ASCII coding */
656 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
657     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
658 ph10 97 #else /* EBCDIC coding */
659 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
660     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
661     #endif
662     }
663     break;
664    
665 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
666     This coding is ASCII-specific, but then the whole concept of \cx is
667     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
668 nigel 77
669     case 'c':
670     c = *(++ptr);
671     if (c == 0)
672     {
673     *errorcodeptr = ERR2;
674 ph10 213 break;
675 nigel 77 }
676    
677 ph10 97 #ifndef EBCDIC /* ASCII coding */
678 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
679     c ^= 0x40;
680 ph10 97 #else /* EBCDIC coding */
681 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
682     c ^= 0xC0;
683     #endif
684     break;
685    
686     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
687     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
688     for Perl compatibility, it is a literal. This code looks a bit odd, but
689     there used to be some cases other than the default, and there may be again
690     in future, so I haven't "optimized" it. */
691    
692     default:
693     if ((options & PCRE_EXTRA) != 0) switch(c)
694     {
695     default:
696     *errorcodeptr = ERR3;
697     break;
698     }
699     break;
700     }
701     }
702    
703     *ptrptr = ptr;
704     return c;
705     }
706    
707    
708    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Process the property name that follows \P or \p. This function exists only
when PCRE is compiled with support for Unicode properties. The name is either
a single character, or a sequence enclosed in {}, which may start with ^ to
request negation. On entry, the pattern pointer is at the P or p; on exit it
is at the last character that was examined.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
                   (it is not set when the pattern ends straight after the
                   P or p)
  dptr           points to an int that is set to the detailed property value
                   when the name is recognized
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and to ERR47 for an unknown name

Returns:         type value from the property table, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char propname[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The simple case is a single character acting as the whole name. */

if (ch != '{')
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* Otherwise collect the name from between the braces, after noting an
optional leading ^. Running out of pattern, or out of room in the name buffer,
before the closing brace is seen makes the sequence malformed. */

else
  {
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(propname) - 1) goto MALFORMED;
    ch = *(++p);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    propname[len++] = ch;
    }
  propname[len] = 0;
  }

*ptrptr = p;

/* Look the name up in the property table by binary chop. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(propname, _pcre_utt[mid].name);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well formed, but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = p;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
798    
799    
800    
801    
802     /*************************************************
803     * Check for counted repeat *
804     *************************************************/
805    
806     /* This function is called when a '{' is encountered in a place where it might
807     start a quantifier. It looks ahead to see if it really is a quantifier or not.
808     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
809     where the ddds are digits.
810    
811     Arguments:
812     p pointer to the first char after '{'
813    
814     Returns: TRUE or FALSE
815     */
816    
817     static BOOL
818     is_counted_repeat(const uschar *p)
819     {
820     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
821     while ((digitab[*p] & ctype_digit) != 0) p++;
822     if (*p == '}') return TRUE;
823    
824     if (*p++ != ',') return FALSE;
825     if (*p == '}') return TRUE;
826    
827     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
828     while ((digitab[*p] & ctype_digit) != 0) p++;
829    
830     return (*p == '}');
831     }
832    
833    
834    
835     /*************************************************
836     * Read repeat counts *
837     *************************************************/
838    
839     /* Read an item of the form {n,m} and return the values. This is called only
840     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
841     so the syntax is guaranteed to be correct, but we need to check the values.
842    
843     Arguments:
844     p pointer to first char after '{'
845     minp pointer to int for min
846     maxp pointer to int for max
847     returned as -1 if no max
848     errorcodeptr points to error code variable
849    
850     Returns: pointer to '}' on success;
851     current ptr on error, with errorcodeptr set non-zero
852     */
853    
854     static const uschar *
855     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
856     {
857     int min = 0;
858     int max = -1;
859    
860 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
861     an integer overflow. */
862    
863 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
864 nigel 81 if (min < 0 || min > 65535)
865     {
866     *errorcodeptr = ERR5;
867     return p;
868     }
869 nigel 77
870 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
871     Also, max must not be less than min. */
872    
873 nigel 77 if (*p == '}') max = min; else
874     {
875     if (*(++p) != '}')
876     {
877     max = 0;
878     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
879 nigel 81 if (max < 0 || max > 65535)
880     {
881     *errorcodeptr = ERR5;
882     return p;
883     }
884 nigel 77 if (max < min)
885     {
886     *errorcodeptr = ERR4;
887     return p;
888     }
889     }
890     }
891    
892 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
893     '}'. */
894 nigel 77
895 nigel 81 *minp = min;
896     *maxp = max;
897 nigel 77 return p;
898     }
899    
900    
901    
902     /*************************************************
903 nigel 93 * Find forward referenced subpattern *
904 nigel 91 *************************************************/
905    
906 nigel 93 /* This function scans along a pattern's text looking for capturing
907     subpatterns, and counting them. If it finds a named pattern that matches the
908     name it is given, it returns its number. Alternatively, if the name is NULL, it
909     returns when it reaches a given numbered subpattern. This is used for forward
910     references to subpatterns. We know that if (?P< is encountered, the name will
911     be terminated by '>' because that is checked in the first pass.
912 nigel 91
913     Arguments:
914 nigel 93 ptr current position in the pattern
915     count current count of capturing parens so far encountered
916     name name to seek, or NULL if seeking a numbered subpattern
917     lorn name length, or subpattern number if name is NULL
918     xmode TRUE if we are in /x mode
919 nigel 91
920     Returns: the number of the named subpattern, or -1 if not found
921     */
922    
923     static int
924 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
925     BOOL xmode)
926 nigel 91 {
927     const uschar *thisname;
928 nigel 93
929 nigel 91 for (; *ptr != 0; ptr++)
930     {
931 nigel 93 int term;
932    
933     /* Skip over backslashed characters and also entire \Q...\E */
934    
935     if (*ptr == '\\')
936     {
937     if (*(++ptr) == 0) return -1;
938     if (*ptr == 'Q') for (;;)
939     {
940     while (*(++ptr) != 0 && *ptr != '\\');
941     if (*ptr == 0) return -1;
942     if (*(++ptr) == 'E') break;
943     }
944     continue;
945     }
946    
947     /* Skip over character classes */
948    
949     if (*ptr == '[')
950     {
951     while (*(++ptr) != ']')
952     {
953 ph10 215 if (*ptr == 0) return -1;
954 nigel 93 if (*ptr == '\\')
955     {
956     if (*(++ptr) == 0) return -1;
957     if (*ptr == 'Q') for (;;)
958     {
959     while (*(++ptr) != 0 && *ptr != '\\');
960     if (*ptr == 0) return -1;
961     if (*(++ptr) == 'E') break;
962     }
963     continue;
964     }
965     }
966     continue;
967     }
968    
969     /* Skip comments in /x mode */
970    
971     if (xmode && *ptr == '#')
972     {
973     while (*(++ptr) != 0 && *ptr != '\n');
974     if (*ptr == 0) return -1;
975     continue;
976     }
977    
978     /* An opening parens must now be a real metacharacter */
979    
980 nigel 91 if (*ptr != '(') continue;
981 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
982 nigel 93 {
983     count++;
984     if (name == NULL && count == lorn) return count;
985     continue;
986     }
987    
988     ptr += 2;
989     if (*ptr == 'P') ptr++; /* Allow optional P */
990    
991     /* We have to disambiguate (?<! and (?<= from (?<name> */
992    
993     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
994     *ptr != '\'')
995     continue;
996    
997 nigel 91 count++;
998 nigel 93
999     if (name == NULL && count == lorn) return count;
1000     term = *ptr++;
1001     if (term == '<') term = '>';
1002 nigel 91 thisname = ptr;
1003 nigel 93 while (*ptr != term) ptr++;
1004     if (name != NULL && lorn == ptr - thisname &&
1005     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1006 nigel 91 return count;
1007     }
1008 nigel 93
1009 nigel 91 return -1;
1010     }
1011    
1012    
1013    
1014     /*************************************************
1015 nigel 77 * Find first significant op code *
1016     *************************************************/
1017    
1018     /* This is called by several functions that scan a compiled expression looking
1019     for a fixed first character, or an anchoring op code etc. It skips over things
1020     that do not influence this. For some calls, a change of option is important.
1021     For some calls, it makes sense to skip negative forward and all backward
1022     assertions, and also the \b assertion; for others it does not.
1023    
1024     Arguments:
1025     code pointer to the start of the group
1026     options pointer to external options
1027     optbit the option bit whose changing is significant, or
1028     zero if none are
1029     skipassert TRUE if certain assertions are to be skipped
1030    
1031     Returns: pointer to the first significant opcode
1032     */
1033    
1034     static const uschar*
1035     first_significant_code(const uschar *code, int *options, int optbit,
1036     BOOL skipassert)
1037     {
1038     for (;;)
1039     {
1040     switch ((int)*code)
1041     {
1042     case OP_OPT:
1043     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1044     *options = (int)code[1];
1045     code += 2;
1046     break;
1047    
1048     case OP_ASSERT_NOT:
1049     case OP_ASSERTBACK:
1050     case OP_ASSERTBACK_NOT:
1051     if (!skipassert) return code;
1052     do code += GET(code, 1); while (*code == OP_ALT);
1053     code += _pcre_OP_lengths[*code];
1054     break;
1055    
1056     case OP_WORD_BOUNDARY:
1057     case OP_NOT_WORD_BOUNDARY:
1058     if (!skipassert) return code;
1059     /* Fall through */
1060    
1061     case OP_CALLOUT:
1062     case OP_CREF:
1063 nigel 93 case OP_RREF:
1064     case OP_DEF:
1065 nigel 77 code += _pcre_OP_lengths[*code];
1066     break;
1067    
1068     default:
1069     return code;
1070     }
1071     }
1072     /* Control never reaches here */
1073     }
1074    
1075    
1076    
1077    
1078     /*************************************************
1079     * Find the fixed length of a pattern *
1080     *************************************************/
1081    
1082     /* Scan a pattern and compute the fixed length of subject that will match it,
1083     if the length is fixed. This is needed for dealing with backward assertions.
1084     In UTF8 mode, the result is in characters rather than bytes.
1085    
1086     Arguments:
1087     code points to the start of the pattern (the bracket)
1088     options the compiling options
1089    
1090     Returns: the fixed length, or -1 if there is no fixed length,
1091     or -2 if \C was encountered
1092     */
1093    
1094     static int
1095     find_fixedlength(uschar *code, int options)
1096     {
1097     int length = -1;
1098    
1099     register int branchlength = 0;
1100     register uschar *cc = code + 1 + LINK_SIZE;
1101    
1102     /* Scan along the opcodes for this branch. If we get to the end of the
1103     branch, check the length against that of the other branches. */
1104    
1105     for (;;)
1106     {
1107     int d;
1108     register int op = *cc;
1109    
1110     switch (op)
1111     {
1112 nigel 93 case OP_CBRA:
1113 nigel 77 case OP_BRA:
1114     case OP_ONCE:
1115     case OP_COND:
1116 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1117 nigel 77 if (d < 0) return d;
1118     branchlength += d;
1119     do cc += GET(cc, 1); while (*cc == OP_ALT);
1120     cc += 1 + LINK_SIZE;
1121     break;
1122    
1123     /* Reached end of a branch; if it's a ket it is the end of a nested
1124     call. If it's ALT it is an alternation in a nested call. If it is
1125     END it's the end of the outer call. All can be handled by the same code. */
1126    
1127     case OP_ALT:
1128     case OP_KET:
1129     case OP_KETRMAX:
1130     case OP_KETRMIN:
1131     case OP_END:
1132     if (length < 0) length = branchlength;
1133     else if (length != branchlength) return -1;
1134     if (*cc != OP_ALT) return length;
1135     cc += 1 + LINK_SIZE;
1136     branchlength = 0;
1137     break;
1138    
1139     /* Skip over assertive subpatterns */
1140    
1141     case OP_ASSERT:
1142     case OP_ASSERT_NOT:
1143     case OP_ASSERTBACK:
1144     case OP_ASSERTBACK_NOT:
1145     do cc += GET(cc, 1); while (*cc == OP_ALT);
1146     /* Fall through */
1147    
1148     /* Skip over things that don't match chars */
1149    
1150     case OP_REVERSE:
1151     case OP_CREF:
1152 nigel 93 case OP_RREF:
1153     case OP_DEF:
1154 nigel 77 case OP_OPT:
1155     case OP_CALLOUT:
1156     case OP_SOD:
1157     case OP_SOM:
1158     case OP_EOD:
1159     case OP_EODN:
1160     case OP_CIRC:
1161     case OP_DOLL:
1162     case OP_NOT_WORD_BOUNDARY:
1163     case OP_WORD_BOUNDARY:
1164     cc += _pcre_OP_lengths[*cc];
1165     break;
1166    
1167     /* Handle literal characters */
1168    
1169     case OP_CHAR:
1170     case OP_CHARNC:
1171 nigel 91 case OP_NOT:
1172 nigel 77 branchlength++;
1173     cc += 2;
1174     #ifdef SUPPORT_UTF8
1175     if ((options & PCRE_UTF8) != 0)
1176     {
1177     while ((*cc & 0xc0) == 0x80) cc++;
1178     }
1179     #endif
1180     break;
1181    
1182     /* Handle exact repetitions. The count is already in characters, but we
1183     need to skip over a multibyte character in UTF8 mode. */
1184    
1185     case OP_EXACT:
1186     branchlength += GET2(cc,1);
1187     cc += 4;
1188     #ifdef SUPPORT_UTF8
1189     if ((options & PCRE_UTF8) != 0)
1190     {
1191     while((*cc & 0x80) == 0x80) cc++;
1192     }
1193     #endif
1194     break;
1195    
1196     case OP_TYPEEXACT:
1197     branchlength += GET2(cc,1);
1198     cc += 4;
1199     break;
1200    
1201     /* Handle single-char matchers */
1202    
1203     case OP_PROP:
1204     case OP_NOTPROP:
1205 nigel 87 cc += 2;
1206 nigel 77 /* Fall through */
1207    
1208     case OP_NOT_DIGIT:
1209     case OP_DIGIT:
1210     case OP_NOT_WHITESPACE:
1211     case OP_WHITESPACE:
1212     case OP_NOT_WORDCHAR:
1213     case OP_WORDCHAR:
1214     case OP_ANY:
1215     branchlength++;
1216     cc++;
1217     break;
1218    
1219     /* The single-byte matcher isn't allowed */
1220    
1221     case OP_ANYBYTE:
1222     return -2;
1223    
1224     /* Check a class for variable quantification */
1225    
1226     #ifdef SUPPORT_UTF8
1227     case OP_XCLASS:
1228     cc += GET(cc, 1) - 33;
1229     /* Fall through */
1230     #endif
1231    
1232     case OP_CLASS:
1233     case OP_NCLASS:
1234     cc += 33;
1235    
1236     switch (*cc)
1237     {
1238     case OP_CRSTAR:
1239     case OP_CRMINSTAR:
1240     case OP_CRQUERY:
1241     case OP_CRMINQUERY:
1242     return -1;
1243    
1244     case OP_CRRANGE:
1245     case OP_CRMINRANGE:
1246     if (GET2(cc,1) != GET2(cc,3)) return -1;
1247     branchlength += GET2(cc,1);
1248     cc += 5;
1249     break;
1250    
1251     default:
1252     branchlength++;
1253     }
1254     break;
1255    
1256     /* Anything else is variable length */
1257    
1258     default:
1259     return -1;
1260     }
1261     }
1262     /* Control never gets here */
1263     }
1264    
1265    
1266    
1267    
1268     /*************************************************
1269     * Scan compiled regex for numbered bracket *
1270     *************************************************/
1271    
1272     /* This little function scans through a compiled pattern until it finds a
1273     capturing bracket with the given number.
1274    
1275     Arguments:
1276     code points to start of expression
1277     utf8 TRUE in UTF-8 mode
1278     number the required bracket number
1279    
1280     Returns: pointer to the opcode for the bracket, or NULL if not found
1281     */
1282    
1283     static const uschar *
1284     find_bracket(const uschar *code, BOOL utf8, int number)
1285     {
1286     for (;;)
1287     {
1288     register int c = *code;
1289     if (c == OP_END) return NULL;
1290 nigel 91
1291     /* XCLASS is used for classes that cannot be represented just by a bit
1292     map. This includes negated single high-valued characters. The length in
1293     the table is zero; the actual length is stored in the compiled code. */
1294    
1295     if (c == OP_XCLASS) code += GET(code, 1);
1296    
1297 nigel 93 /* Handle capturing bracket */
1298 nigel 91
1299 nigel 93 else if (c == OP_CBRA)
1300 nigel 77 {
1301 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1302 nigel 77 if (n == number) return (uschar *)code;
1303 nigel 93 code += _pcre_OP_lengths[c];
1304 nigel 77 }
1305 nigel 91
1306 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1307     a multi-byte character. The length in the table is a minimum, so we have to
1308     arrange to skip the extra bytes. */
1309 nigel 91
1310 nigel 77 else
1311     {
1312     code += _pcre_OP_lengths[c];
1313 ph10 107 #ifdef SUPPORT_UTF8
1314 nigel 77 if (utf8) switch(c)
1315     {
1316     case OP_CHAR:
1317     case OP_CHARNC:
1318     case OP_EXACT:
1319     case OP_UPTO:
1320     case OP_MINUPTO:
1321 nigel 93 case OP_POSUPTO:
1322 nigel 77 case OP_STAR:
1323     case OP_MINSTAR:
1324 nigel 93 case OP_POSSTAR:
1325 nigel 77 case OP_PLUS:
1326     case OP_MINPLUS:
1327 nigel 93 case OP_POSPLUS:
1328 nigel 77 case OP_QUERY:
1329     case OP_MINQUERY:
1330 nigel 93 case OP_POSQUERY:
1331     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332 nigel 77 break;
1333     }
1334 ph10 111 #endif
1335 nigel 77 }
1336     }
1337     }
1338    
1339    
1340    
1341     /*************************************************
1342     * Scan compiled regex for recursion reference *
1343     *************************************************/
1344    
1345     /* This little function scans through a compiled pattern until it finds an
1346     instance of OP_RECURSE.
1347    
1348     Arguments:
1349     code points to start of expression
1350     utf8 TRUE in UTF-8 mode
1351    
1352     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1353     */
1354    
1355     static const uschar *
1356     find_recurse(const uschar *code, BOOL utf8)
1357     {
1358     for (;;)
1359     {
1360     register int c = *code;
1361     if (c == OP_END) return NULL;
1362 nigel 91 if (c == OP_RECURSE) return code;
1363    
1364     /* XCLASS is used for classes that cannot be represented just by a bit
1365     map. This includes negated single high-valued characters. The length in
1366     the table is zero; the actual length is stored in the compiled code. */
1367    
1368     if (c == OP_XCLASS) code += GET(code, 1);
1369    
1370     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1371     that are followed by a character may be followed by a multi-byte character.
1372 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1373     bytes. */
1374 nigel 91
1375 nigel 77 else
1376     {
1377     code += _pcre_OP_lengths[c];
1378 ph10 107 #ifdef SUPPORT_UTF8
1379 nigel 77 if (utf8) switch(c)
1380     {
1381     case OP_CHAR:
1382     case OP_CHARNC:
1383     case OP_EXACT:
1384     case OP_UPTO:
1385     case OP_MINUPTO:
1386 nigel 93 case OP_POSUPTO:
1387 nigel 77 case OP_STAR:
1388     case OP_MINSTAR:
1389 nigel 93 case OP_POSSTAR:
1390 nigel 77 case OP_PLUS:
1391     case OP_MINPLUS:
1392 nigel 93 case OP_POSPLUS:
1393 nigel 77 case OP_QUERY:
1394     case OP_MINQUERY:
1395 nigel 93 case OP_POSQUERY:
1396     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1397 nigel 77 break;
1398     }
1399 ph10 111 #endif
1400 nigel 77 }
1401     }
1402     }
1403    
1404    
1405    
1406     /*************************************************
1407     * Scan compiled branch for non-emptiness *
1408     *************************************************/
1409    
1410     /* This function scans through a branch of a compiled pattern to see whether it
1411 nigel 93 can match the empty string or not. It is called from could_be_empty()
1412     below and from compile_branch() when checking for an unlimited repeat of a
1413     group that can match nothing. Note that first_significant_code() skips over
1414     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1415     struck an inner bracket whose current branch will already have been scanned.
1416 nigel 77
1417     Arguments:
1418     code points to start of search
1419     endcode points to where to stop
1420     utf8 TRUE if in UTF8 mode
1421    
1422     Returns: TRUE if what is matched could be empty
1423     */
1424    
1425     static BOOL
1426     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1427     {
1428     register int c;
1429 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1430 nigel 77 code < endcode;
1431     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1432     {
1433     const uschar *ccode;
1434    
1435     c = *code;
1436 ph10 172
1437 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1438 nigel 77
1439 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1440     {
1441 ph10 172 code += _pcre_OP_lengths[c];
1442 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1443     c = *code;
1444     continue;
1445     }
1446    
1447     /* For other groups, scan the branches. */
1448 ph10 172
1449 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1450 nigel 77 {
1451     BOOL empty_branch;
1452     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1453    
1454     /* Scan a closed bracket */
1455    
1456     empty_branch = FALSE;
1457     do
1458     {
1459     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1460     empty_branch = TRUE;
1461     code += GET(code, 1);
1462     }
1463     while (*code == OP_ALT);
1464     if (!empty_branch) return FALSE; /* All branches are non-empty */
1465 ph10 172 c = *code;
1466 nigel 93 continue;
1467 nigel 77 }
1468    
1469 nigel 93 /* Handle the other opcodes */
1470    
1471     switch (c)
1472 nigel 77 {
1473 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1474     cannot be represented just by a bit map. This includes negated single
1475     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1476     actual length is stored in the compiled code, so we must update "code"
1477     here. */
1478 nigel 77
1479     #ifdef SUPPORT_UTF8
1480     case OP_XCLASS:
1481 ph10 216 ccode = code += GET(code, 1);
1482 nigel 77 goto CHECK_CLASS_REPEAT;
1483     #endif
1484    
1485     case OP_CLASS:
1486     case OP_NCLASS:
1487     ccode = code + 33;
1488    
1489     #ifdef SUPPORT_UTF8
1490     CHECK_CLASS_REPEAT:
1491     #endif
1492    
1493     switch (*ccode)
1494     {
1495     case OP_CRSTAR: /* These could be empty; continue */
1496     case OP_CRMINSTAR:
1497     case OP_CRQUERY:
1498     case OP_CRMINQUERY:
1499     break;
1500    
1501     default: /* Non-repeat => class must match */
1502     case OP_CRPLUS: /* These repeats aren't empty */
1503     case OP_CRMINPLUS:
1504     return FALSE;
1505    
1506     case OP_CRRANGE:
1507     case OP_CRMINRANGE:
1508     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1509     break;
1510     }
1511     break;
1512    
1513     /* Opcodes that must match a character */
1514    
1515     case OP_PROP:
1516     case OP_NOTPROP:
1517     case OP_EXTUNI:
1518     case OP_NOT_DIGIT:
1519     case OP_DIGIT:
1520     case OP_NOT_WHITESPACE:
1521     case OP_WHITESPACE:
1522     case OP_NOT_WORDCHAR:
1523     case OP_WORDCHAR:
1524     case OP_ANY:
1525     case OP_ANYBYTE:
1526     case OP_CHAR:
1527     case OP_CHARNC:
1528     case OP_NOT:
1529     case OP_PLUS:
1530     case OP_MINPLUS:
1531 nigel 93 case OP_POSPLUS:
1532 nigel 77 case OP_EXACT:
1533     case OP_NOTPLUS:
1534     case OP_NOTMINPLUS:
1535 nigel 93 case OP_NOTPOSPLUS:
1536 nigel 77 case OP_NOTEXACT:
1537     case OP_TYPEPLUS:
1538     case OP_TYPEMINPLUS:
1539 nigel 93 case OP_TYPEPOSPLUS:
1540 nigel 77 case OP_TYPEEXACT:
1541     return FALSE;
1542    
1543     /* End of branch */
1544    
1545     case OP_KET:
1546     case OP_KETRMAX:
1547     case OP_KETRMIN:
1548     case OP_ALT:
1549     return TRUE;
1550    
1551 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1552     MINUPTO, and POSUPTO may be followed by a multibyte character */
1553 nigel 77
1554     #ifdef SUPPORT_UTF8
1555     case OP_STAR:
1556     case OP_MINSTAR:
1557 nigel 93 case OP_POSSTAR:
1558 nigel 77 case OP_QUERY:
1559     case OP_MINQUERY:
1560 nigel 93 case OP_POSQUERY:
1561 nigel 77 case OP_UPTO:
1562     case OP_MINUPTO:
1563 nigel 93 case OP_POSUPTO:
1564 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1565     break;
1566     #endif
1567     }
1568     }
1569    
1570     return TRUE;
1571     }
1572    
1573    
1574    
1575     /*************************************************
1576     * Scan compiled regex for non-emptiness *
1577     *************************************************/
1578    
1579     /* This function is called to check for left recursive calls. We want to check
1580     the current branch of the current pattern to see if it could match the empty
1581     string. If it could, we must look outwards for branches at other levels,
1582     stopping when we pass beyond the bracket which is the subject of the recursion.
1583    
1584     Arguments:
1585     code points to start of the recursion
1586     endcode points to where to stop (current RECURSE item)
1587     bcptr points to the chain of current (unclosed) branch starts
1588     utf8 TRUE if in UTF-8 mode
1589    
1590     Returns: TRUE if what is matched could be empty
1591     */
1592    
1593     static BOOL
1594     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1595     BOOL utf8)
1596     {
1597     while (bcptr != NULL && bcptr->current >= code)
1598     {
1599     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1600     bcptr = bcptr->outer;
1601     }
1602     return TRUE;
1603     }
1604    
1605    
1606    
1607     /*************************************************
1608     * Check for POSIX class syntax *
1609     *************************************************/
1610    
1611     /* This function is called when the sequence "[:" or "[." or "[=" is
1612     encountered in a character class. It checks whether this is followed by an
1613     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1614     ".]" or "=]".
1615    
1616     Argument:
1617     ptr pointer to the initial [
1618     endptr where to return the end pointer
1619     cd pointer to compile data
1620    
1621     Returns: TRUE or FALSE
1622     */
1623    
1624     static BOOL
1625     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1626     {
1627     int terminator; /* Don't combine these lines; the Solaris cc */
1628     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1629     if (*(++ptr) == '^') ptr++;
1630     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1631     if (*ptr == terminator && ptr[1] == ']')
1632     {
1633     *endptr = ptr;
1634     return TRUE;
1635     }
1636     return FALSE;
1637     }
1638    
1639    
1640    
1641    
1642     /*************************************************
1643     * Check POSIX class name *
1644     *************************************************/
1645    
1646     /* This function is called to check the name given in a POSIX-style class entry
1647     such as [:alnum:].
1648    
1649     Arguments:
1650     ptr points to the first letter
1651     len the length of the name
1652    
1653     Returns: a value representing the name, or -1 if unknown
1654     */
1655    
1656     static int
1657     check_posix_name(const uschar *ptr, int len)
1658     {
1659     register int yield = 0;
1660     while (posix_name_lengths[yield] != 0)
1661     {
1662     if (len == posix_name_lengths[yield] &&
1663     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1664     yield++;
1665     }
1666     return -1;
1667     }
1668    
1669    
1670     /*************************************************
1671     * Adjust OP_RECURSE items in repeated group *
1672     *************************************************/
1673    
1674     /* OP_RECURSE items contain an offset from the start of the regex to the group
1675     that is referenced. This means that groups can be replicated for fixed
1676     repetition simply by copying (because the recursion is allowed to refer to
1677     earlier groups that are outside the current group). However, when a group is
1678     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1679     it, after it has been compiled. This means that any OP_RECURSE items within it
1680     that refer to the group itself or any contained groups have to have their
1681 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1682     the partially compiled regex must be temporarily terminated with OP_END.
1683 nigel 77
1684 nigel 93 This function has been extended with the possibility of forward references for
1685     recursions and subroutine calls. It must also check the list of such references
1686     for the group we are dealing with. If it finds that one of the recursions in
1687     the current group is on this list, it adjusts the offset in the list, not the
1688     value in the reference (which is a group number).
1689    
1690 nigel 77 Arguments:
1691     group points to the start of the group
1692     adjust the amount by which the group is to be moved
1693     utf8 TRUE in UTF-8 mode
1694     cd contains pointers to tables etc.
1695 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1696 nigel 77
1697     Returns: nothing
1698     */
1699    
1700     static void
1701 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1702     uschar *save_hwm)
1703 nigel 77 {
1704     uschar *ptr = group;
1705     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1706     {
1707 nigel 93 int offset;
1708     uschar *hc;
1709    
1710     /* See if this recursion is on the forward reference list. If so, adjust the
1711     reference. */
1712    
1713     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1714     {
1715     offset = GET(hc, 0);
1716     if (cd->start_code + offset == ptr + 1)
1717     {
1718     PUT(hc, 0, offset + adjust);
1719     break;
1720     }
1721     }
1722    
1723     /* Otherwise, adjust the recursion offset if it's after the start of this
1724     group. */
1725    
1726     if (hc >= cd->hwm)
1727     {
1728     offset = GET(ptr, 1);
1729     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1730     }
1731    
1732 nigel 77 ptr += 1 + LINK_SIZE;
1733     }
1734     }
1735    
1736    
1737    
1738     /*************************************************
1739     * Insert an automatic callout point *
1740     *************************************************/
1741    
1742     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1743     callout points before each pattern item.
1744    
1745     Arguments:
1746     code current code pointer
1747     ptr current pattern pointer
1748     cd pointers to tables etc
1749    
1750     Returns: new code pointer
1751     */
1752    
1753     static uschar *
1754     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1755     {
1756     *code++ = OP_CALLOUT;
1757     *code++ = 255;
1758     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1759     PUT(code, LINK_SIZE, 0); /* Default length */
1760     return code + 2*LINK_SIZE;
1761     }
1762    
1763    
1764    
1765     /*************************************************
1766     * Complete a callout item *
1767     *************************************************/
1768    
1769     /* A callout item contains the length of the next item in the pattern, which
1770     we can't fill in till after we have reached the relevant point. This is used
1771     for both automatic and manual callouts.
1772    
1773     Arguments:
1774     previous_callout points to previous callout item
1775     ptr current pattern pointer
1776     cd pointers to tables etc
1777    
1778     Returns: nothing
1779     */
1780    
1781     static void
1782     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1783     {
1784     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1785     PUT(previous_callout, 2 + LINK_SIZE, length);
1786     }
1787    
1788    
1789    
1790     #ifdef SUPPORT_UCP
1791     /*************************************************
1792     * Get othercase range *
1793     *************************************************/
1794    
1795     /* This function is passed the start and end of a class range, in UTF-8 mode
1796     with UCP support. It searches up the characters, looking for internal ranges of
1797     characters in the "other" case. Each call returns the next one, updating the
1798     start address.
1799    
1800     Arguments:
1801     cptr points to starting character value; updated
1802     d end value
1803     ocptr where to put start of othercase range
1804     odptr where to put end of othercase range
1805    
1806     Yield: TRUE when range returned; FALSE when no more
1807     */
1808    
1809     static BOOL
1810 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1811     unsigned int *odptr)
1812 nigel 77 {
1813 nigel 93 unsigned int c, othercase, next;
1814 nigel 77
1815     for (c = *cptr; c <= d; c++)
1816 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1817 nigel 77
1818     if (c > d) return FALSE;
1819    
1820     *ocptr = othercase;
1821     next = othercase + 1;
1822    
1823     for (++c; c <= d; c++)
1824     {
1825 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1826 nigel 77 next++;
1827     }
1828    
1829     *odptr = next - 1;
1830     *cptr = c;
1831    
1832     return TRUE;
1833     }
1834     #endif /* SUPPORT_UCP */
1835    
1836    
1837 nigel 93
1838 nigel 77 /*************************************************
1839 nigel 93 * Check if auto-possessifying is possible *
1840     *************************************************/
1841    
1842     /* This function is called for unlimited repeats of certain items, to see
1843     whether the next thing could possibly match the repeated item. If not, it makes
1844     sense to automatically possessify the repeated item.
1845    
1846     Arguments:
1847     op_code the repeated op code
1848     this data for this item, depends on the opcode
1849     utf8 TRUE in UTF-8 mode
1850     utf8_char used for utf8 character bytes, NULL if not relevant
1851     ptr next character in pattern
1852     options options bits
1853     cd contains pointers to tables etc.
1854    
1855     Returns: TRUE if possessifying is wanted
1856     */
1857    
1858     static BOOL
1859     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1860     const uschar *ptr, int options, compile_data *cd)
1861     {
1862     int next;
1863    
1864     /* Skip whitespace and comments in extended mode */
1865    
1866     if ((options & PCRE_EXTENDED) != 0)
1867     {
1868     for (;;)
1869     {
1870     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1871     if (*ptr == '#')
1872     {
1873     while (*(++ptr) != 0)
1874     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1875     }
1876     else break;
1877     }
1878     }
1879    
1880     /* If the next item is one that we can handle, get its value. A non-negative
1881     value is a character, a negative value is an escape value. */
1882    
1883     if (*ptr == '\\')
1884     {
1885     int temperrorcode = 0;
1886     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1887     if (temperrorcode != 0) return FALSE;
1888     ptr++; /* Point after the escape sequence */
1889     }
1890    
1891     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1892     {
1893     #ifdef SUPPORT_UTF8
1894     if (utf8) { GETCHARINC(next, ptr); } else
1895     #endif
1896     next = *ptr++;
1897     }
1898    
1899     else return FALSE;
1900    
1901     /* Skip whitespace and comments in extended mode */
1902    
1903     if ((options & PCRE_EXTENDED) != 0)
1904     {
1905     for (;;)
1906     {
1907     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1908     if (*ptr == '#')
1909     {
1910     while (*(++ptr) != 0)
1911     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1912     }
1913     else break;
1914     }
1915     }
1916    
1917     /* If the next thing is itself optional, we have to give up. */
1918    
1919     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1920     return FALSE;
1921    
1922     /* Now compare the next item with the previous opcode. If the previous is a
1923     positive single character match, "item" either contains the character or, if
1924     "item" is greater than 127 in utf8 mode, the character's bytes are in
1925     utf8_char. */
1926    
1927    
1928     /* Handle cases when the next item is a character. */
1929    
1930     if (next >= 0) switch(op_code)
1931     {
1932     case OP_CHAR:
1933     #ifdef SUPPORT_UTF8
1934     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1935     #endif
1936     return item != next;
1937    
1938     /* For CHARNC (caseless character) we must check the other case. If we have
1939     Unicode property support, we can use it to test the other case of
1940     high-valued characters. */
1941    
1942     case OP_CHARNC:
1943     #ifdef SUPPORT_UTF8
1944     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1945     #endif
1946     if (item == next) return FALSE;
1947     #ifdef SUPPORT_UTF8
1948     if (utf8)
1949     {
1950     unsigned int othercase;
1951     if (next < 128) othercase = cd->fcc[next]; else
1952     #ifdef SUPPORT_UCP
1953     othercase = _pcre_ucp_othercase((unsigned int)next);
1954     #else
1955     othercase = NOTACHAR;
1956     #endif
1957     return (unsigned int)item != othercase;
1958     }
1959     else
1960     #endif /* SUPPORT_UTF8 */
1961     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1962    
1963     /* For OP_NOT, "item" must be a single-byte character. */
1964    
1965     case OP_NOT:
1966     if (next < 0) return FALSE; /* Not a character */
1967     if (item == next) return TRUE;
1968     if ((options & PCRE_CASELESS) == 0) return FALSE;
1969     #ifdef SUPPORT_UTF8
1970     if (utf8)
1971     {
1972     unsigned int othercase;
1973     if (next < 128) othercase = cd->fcc[next]; else
1974     #ifdef SUPPORT_UCP
1975     othercase = _pcre_ucp_othercase(next);
1976     #else
1977     othercase = NOTACHAR;
1978     #endif
1979     return (unsigned int)item == othercase;
1980     }
1981     else
1982     #endif /* SUPPORT_UTF8 */
1983     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1984    
1985     case OP_DIGIT:
1986     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1987    
1988     case OP_NOT_DIGIT:
1989     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1990    
1991     case OP_WHITESPACE:
1992     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1993    
1994     case OP_NOT_WHITESPACE:
1995     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1996    
1997     case OP_WORDCHAR:
1998     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1999    
2000     case OP_NOT_WORDCHAR:
2001     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2002    
2003 ph10 180 case OP_HSPACE:
2004     case OP_NOT_HSPACE:
2005     switch(next)
2006     {
2007     case 0x09:
2008     case 0x20:
2009     case 0xa0:
2010     case 0x1680:
2011     case 0x180e:
2012     case 0x2000:
2013     case 0x2001:
2014     case 0x2002:
2015     case 0x2003:
2016     case 0x2004:
2017     case 0x2005:
2018     case 0x2006:
2019     case 0x2007:
2020     case 0x2008:
2021     case 0x2009:
2022     case 0x200A:
2023     case 0x202f:
2024     case 0x205f:
2025     case 0x3000:
2026     return op_code != OP_HSPACE;
2027     default:
2028     return op_code == OP_HSPACE;
2029     }
2030    
2031     case OP_VSPACE:
2032     case OP_NOT_VSPACE:
2033     switch(next)
2034     {
2035     case 0x0a:
2036     case 0x0b:
2037     case 0x0c:
2038     case 0x0d:
2039     case 0x85:
2040     case 0x2028:
2041     case 0x2029:
2042     return op_code != OP_VSPACE;
2043     default:
2044     return op_code == OP_VSPACE;
2045     }
2046    
2047 nigel 93 default:
2048     return FALSE;
2049     }
2050    
2051    
2052     /* Handle the case when the next item is \d, \s, etc. */
2053    
2054     switch(op_code)
2055     {
2056     case OP_CHAR:
2057     case OP_CHARNC:
2058     #ifdef SUPPORT_UTF8
2059     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2060     #endif
2061     switch(-next)
2062     {
2063     case ESC_d:
2064     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2065    
2066     case ESC_D:
2067     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2068    
2069     case ESC_s:
2070     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2071    
2072     case ESC_S:
2073     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2074    
2075     case ESC_w:
2076     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2077    
2078     case ESC_W:
2079     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2080 ph10 182
2081 ph10 180 case ESC_h:
2082     case ESC_H:
2083     switch(item)
2084     {
2085     case 0x09:
2086     case 0x20:
2087     case 0xa0:
2088     case 0x1680:
2089     case 0x180e:
2090     case 0x2000:
2091     case 0x2001:
2092     case 0x2002:
2093     case 0x2003:
2094     case 0x2004:
2095     case 0x2005:
2096     case 0x2006:
2097     case 0x2007:
2098     case 0x2008:
2099     case 0x2009:
2100     case 0x200A:
2101     case 0x202f:
2102     case 0x205f:
2103     case 0x3000:
2104     return -next != ESC_h;
2105     default:
2106     return -next == ESC_h;
2107 ph10 182 }
2108    
2109 ph10 180 case ESC_v:
2110     case ESC_V:
2111     switch(item)
2112     {
2113     case 0x0a:
2114     case 0x0b:
2115     case 0x0c:
2116     case 0x0d:
2117     case 0x85:
2118     case 0x2028:
2119     case 0x2029:
2120     return -next != ESC_v;
2121     default:
2122     return -next == ESC_v;
2123 ph10 182 }
2124 nigel 93
2125     default:
2126     return FALSE;
2127     }
2128    
2129     case OP_DIGIT:
2130 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2131     next == -ESC_h || next == -ESC_v;
2132 nigel 93
2133     case OP_NOT_DIGIT:
2134     return next == -ESC_d;
2135    
2136     case OP_WHITESPACE:
2137     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2138    
2139     case OP_NOT_WHITESPACE:
2140 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2141 nigel 93
2142 ph10 180 case OP_HSPACE:
2143     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2144    
2145     case OP_NOT_HSPACE:
2146     return next == -ESC_h;
2147 ph10 182
2148 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2149 ph10 182 case OP_VSPACE:
2150 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2151    
2152     case OP_NOT_VSPACE:
2153 ph10 182 return next == -ESC_v;
2154 ph10 180
2155 nigel 93 case OP_WORDCHAR:
2156 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2157 nigel 93
2158     case OP_NOT_WORDCHAR:
2159     return next == -ESC_w || next == -ESC_d;
2160 ph10 182
2161 nigel 93 default:
2162     return FALSE;
2163     }
2164    
2165     /* Control does not reach here */
2166     }
2167    
2168    
2169    
2170     /*************************************************
2171 nigel 77 * Compile one branch *
2172     *************************************************/
2173    
2174 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2175 nigel 77 changed during the branch, the pointer is used to change the external options
2176 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2177     to find out the amount of memory needed, as well as during the real compile
2178     phase. The value of lengthptr distinguishes the two phases.
2179 nigel 77
2180     Arguments:
2181     optionsptr pointer to the option bits
2182     codeptr points to the pointer to the current code point
2183     ptrptr points to the current pattern pointer
2184     errorcodeptr points to error code variable
2185     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2186     reqbyteptr set to the last literal character required, else < 0
2187     bcptr points to current branch chain
2188     cd contains pointers to tables etc.
2189 nigel 93 lengthptr NULL during the real compile phase
2190     points to length accumulator during pre-compile phase
2191 nigel 77
2192     Returns: TRUE on success
2193     FALSE, with *errorcodeptr set non-zero on error
2194     */
2195    
2196     static BOOL
2197 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2198     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2199     compile_data *cd, int *lengthptr)
2200 nigel 77 {
2201     int repeat_type, op_type;
2202     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2203     int bravalue = 0;
2204     int greedy_default, greedy_non_default;
2205     int firstbyte, reqbyte;
2206     int zeroreqbyte, zerofirstbyte;
2207     int req_caseopt, reqvary, tempreqvary;
2208     int options = *optionsptr;
2209     int after_manual_callout = 0;
2210 nigel 93 int length_prevgroup = 0;
2211 nigel 77 register int c;
2212     register uschar *code = *codeptr;
2213 nigel 93 uschar *last_code = code;
2214     uschar *orig_code = code;
2215 nigel 77 uschar *tempcode;
2216     BOOL inescq = FALSE;
2217     BOOL groupsetfirstbyte = FALSE;
2218     const uschar *ptr = *ptrptr;
2219     const uschar *tempptr;
2220     uschar *previous = NULL;
2221     uschar *previous_callout = NULL;
2222 nigel 93 uschar *save_hwm = NULL;
2223 nigel 77 uschar classbits[32];
2224    
2225     #ifdef SUPPORT_UTF8
2226     BOOL class_utf8;
2227     BOOL utf8 = (options & PCRE_UTF8) != 0;
2228     uschar *class_utf8data;
2229     uschar utf8_char[6];
2230     #else
2231     BOOL utf8 = FALSE;
2232 nigel 93 uschar *utf8_char = NULL;
2233 nigel 77 #endif
2234    
2235 nigel 93 #ifdef DEBUG
2236     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2237     #endif
2238    
2239 nigel 77 /* Set up the default and non-default settings for greediness */
2240    
2241     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2242     greedy_non_default = greedy_default ^ 1;
2243    
2244     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2245     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2246     matches a non-fixed char first char; reqbyte just remains unset if we never
2247     find one.
2248    
2249     When we hit a repeat whose minimum is zero, we may have to adjust these values
2250     to take the zero repeat into account. This is implemented by setting them to
2251     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2252     item types that can be repeated set these backoff variables appropriately. */
2253    
2254     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2255    
2256     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2257     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2258     value > 255. It is added into the firstbyte or reqbyte variables to record the
2259     case status of the value. This is used only for ASCII characters. */
2260    
2261     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2262    
2263     /* Switch on next character until the end of the branch */
2264    
2265     for (;; ptr++)
2266     {
2267     BOOL negate_class;
2268     BOOL possessive_quantifier;
2269     BOOL is_quantifier;
2270 nigel 93 BOOL is_recurse;
2271 ph10 180 BOOL reset_bracount;
2272 nigel 77 int class_charcount;
2273     int class_lastchar;
2274     int newoptions;
2275     int recno;
2276 ph10 172 int refsign;
2277 nigel 77 int skipbytes;
2278     int subreqbyte;
2279     int subfirstbyte;
2280 nigel 93 int terminator;
2281 nigel 77 int mclength;
2282     uschar mcbuffer[8];
2283    
2284 nigel 93 /* Get next byte in the pattern */
2285 nigel 77
2286     c = *ptr;
2287    
2288 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2289     previous cycle of this loop. */
2290    
2291     if (lengthptr != NULL)
2292     {
2293     #ifdef DEBUG
2294     if (code > cd->hwm) cd->hwm = code; /* High water info */
2295     #endif
2296     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2297     {
2298     *errorcodeptr = ERR52;
2299     goto FAILED;
2300     }
2301    
2302     /* There is at least one situation where code goes backwards: this is the
2303     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2304     the class is simply eliminated. However, it is created first, so we have to
2305     allow memory for it. Therefore, don't ever reduce the length at this point.
2306     */
2307    
2308     if (code < last_code) code = last_code;
2309 ph10 202
2310     /* Paranoid check for integer overflow */
2311    
2312     if (OFLOW_MAX - *lengthptr < code - last_code)
2313     {
2314     *errorcodeptr = ERR20;
2315     goto FAILED;
2316     }
2317    
2318 nigel 93 *lengthptr += code - last_code;
2319     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2320    
2321     /* If "previous" is set and it is not at the start of the work space, move
2322     it back to there, in order to avoid filling up the work space. Otherwise,
2323     if "previous" is NULL, reset the current code pointer to the start. */
2324    
2325     if (previous != NULL)
2326     {
2327     if (previous > orig_code)
2328     {
2329     memmove(orig_code, previous, code - previous);
2330     code -= previous - orig_code;
2331     previous = orig_code;
2332     }
2333     }
2334     else code = orig_code;
2335    
2336     /* Remember where this code item starts so we can pick up the length
2337     next time round. */
2338    
2339     last_code = code;
2340     }
2341    
2342     /* In the real compile phase, just check the workspace used by the forward
2343     reference list. */
2344    
2345     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2346     {
2347     *errorcodeptr = ERR52;
2348     goto FAILED;
2349     }
2350    
2351 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2352    
2353     if (inescq && c != 0)
2354     {
2355     if (c == '\\' && ptr[1] == 'E')
2356     {
2357     inescq = FALSE;
2358     ptr++;
2359     continue;
2360     }
2361     else
2362     {
2363     if (previous_callout != NULL)
2364     {
2365 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2366     complete_callout(previous_callout, ptr, cd);
2367 nigel 77 previous_callout = NULL;
2368     }
2369     if ((options & PCRE_AUTO_CALLOUT) != 0)
2370     {
2371     previous_callout = code;
2372     code = auto_callout(code, ptr, cd);
2373     }
2374     goto NORMAL_CHAR;
2375     }
2376     }
2377    
2378     /* Fill in length of a previous callout, except when the next thing is
2379     a quantifier. */
2380    
2381     is_quantifier = c == '*' || c == '+' || c == '?' ||
2382     (c == '{' && is_counted_repeat(ptr+1));
2383    
2384     if (!is_quantifier && previous_callout != NULL &&
2385     after_manual_callout-- <= 0)
2386     {
2387 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2388     complete_callout(previous_callout, ptr, cd);
2389 nigel 77 previous_callout = NULL;
2390     }
2391    
2392     /* In extended mode, skip white space and comments */
2393    
2394     if ((options & PCRE_EXTENDED) != 0)
2395     {
2396     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2397     if (c == '#')
2398     {
2399 nigel 93 while (*(++ptr) != 0)
2400 nigel 91 {
2401 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2402 nigel 91 }
2403 nigel 93 if (*ptr != 0) continue;
2404    
2405 nigel 91 /* Else fall through to handle end of string */
2406     c = 0;
2407 nigel 77 }
2408     }
2409    
2410     /* No auto callout for quantifiers. */
2411    
2412     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2413     {
2414     previous_callout = code;
2415     code = auto_callout(code, ptr, cd);
2416     }
2417    
2418     switch(c)
2419     {
2420 nigel 93 /* ===================================================================*/
2421     case 0: /* The branch terminates at string end */
2422     case '|': /* or | or ) */
2423 nigel 77 case ')':
2424     *firstbyteptr = firstbyte;
2425     *reqbyteptr = reqbyte;
2426     *codeptr = code;
2427     *ptrptr = ptr;
2428 nigel 93 if (lengthptr != NULL)
2429     {
2430 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2431     {
2432     *errorcodeptr = ERR20;
2433     goto FAILED;
2434     }
2435 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2436     DPRINTF((">> end branch\n"));
2437     }
2438 nigel 77 return TRUE;
2439    
2440 nigel 93
2441     /* ===================================================================*/
2442 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2443     the setting of any following char as a first character. */
2444    
2445     case '^':
2446     if ((options & PCRE_MULTILINE) != 0)
2447     {
2448     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2449     }
2450     previous = NULL;
2451     *code++ = OP_CIRC;
2452     break;
2453    
2454     case '$':
2455     previous = NULL;
2456     *code++ = OP_DOLL;
2457     break;
2458    
2459     /* There can never be a first char if '.' is first, whatever happens about
2460     repeats. The value of reqbyte doesn't change either. */
2461    
2462     case '.':
2463     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2464     zerofirstbyte = firstbyte;
2465     zeroreqbyte = reqbyte;
2466     previous = code;
2467     *code++ = OP_ANY;
2468     break;
2469    
2470 nigel 93
2471     /* ===================================================================*/
2472 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2473     32-byte bitmap of the permitted characters, except in the special case
2474     where there is only one such character. For negated classes, we build the
2475     map as usual, then invert it at the end. However, we use a different opcode
2476     so that data characters > 255 can be handled correctly.
2477 nigel 77
2478     If the class contains characters outside the 0-255 range, a different
2479     opcode is compiled. It may optionally have a bit map for characters < 256,
2480     but those above are explicitly listed afterwards. A flag byte tells
2481     whether the bitmap is present, and whether this is a negated class or not.
2482     */
2483    
2484     case '[':
2485     previous = code;
2486    
2487     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2488     they are encountered at the top level, so we'll do that too. */
2489    
2490     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2491     check_posix_syntax(ptr, &tempptr, cd))
2492     {
2493     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2494     goto FAILED;
2495     }
2496    
2497 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2498 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2499 ph10 205 skip them too. This makes for compatibility with Perl. */
2500 ph10 208
2501 ph10 205 negate_class = FALSE;
2502     for (;;)
2503 nigel 77 {
2504     c = *(++ptr);
2505 ph10 205 if (c == '\\')
2506     {
2507 ph10 208 if (ptr[1] == 'E') ptr++;
2508 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2509 ph10 208 else break;
2510 ph10 205 }
2511     else if (!negate_class && c == '^')
2512     negate_class = TRUE;
2513     else break;
2514 ph10 208 }
2515 nigel 77
2516     /* Keep a count of chars with values < 256 so that we can optimize the case
2517 nigel 93 of just a single character (as long as it's < 256). However, for higher
2518     valued UTF-8 characters, we don't yet do any optimization. */
2519 nigel 77
2520     class_charcount = 0;
2521     class_lastchar = -1;
2522    
2523 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2524     temporary bit of memory, in case the class contains only 1 character (less
2525     than 256), because in that case the compiled code doesn't use the bit map.
2526     */
2527    
2528     memset(classbits, 0, 32 * sizeof(uschar));
2529    
2530 nigel 77 #ifdef SUPPORT_UTF8
2531     class_utf8 = FALSE; /* No chars >= 256 */
2532 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2533 nigel 77 #endif
2534    
2535     /* Process characters until ] is reached. By writing this as a "do" it
2536 nigel 93 means that an initial ] is taken as a data character. At the start of the
2537     loop, c contains the first byte of the character. */
2538 nigel 77
2539 nigel 93 if (c != 0) do
2540 nigel 77 {
2541 nigel 93 const uschar *oldptr;
2542    
2543 nigel 77 #ifdef SUPPORT_UTF8
2544     if (utf8 && c > 127)
2545     { /* Braces are required because the */
2546     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2547     }
2548     #endif
2549    
2550     /* Inside \Q...\E everything is literal except \E */
2551    
2552     if (inescq)
2553     {
2554 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2555 nigel 77 {
2556 nigel 93 inescq = FALSE; /* Reset literal state */
2557     ptr++; /* Skip the 'E' */
2558     continue; /* Carry on with next */
2559 nigel 77 }
2560 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2561 nigel 77 }
2562    
2563     /* Handle POSIX class names. Perl allows a negation extension of the
2564     form [:^name:]. A square bracket that doesn't match the syntax is
2565     treated as a literal. We also recognize the POSIX constructions
2566     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2567     5.6 and 5.8 do. */
2568    
2569     if (c == '[' &&
2570     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2571     check_posix_syntax(ptr, &tempptr, cd))
2572     {
2573     BOOL local_negate = FALSE;
2574 nigel 87 int posix_class, taboffset, tabopt;
2575 nigel 77 register const uschar *cbits = cd->cbits;
2576 nigel 87 uschar pbits[32];
2577 nigel 77
2578     if (ptr[1] != ':')
2579     {
2580     *errorcodeptr = ERR31;
2581     goto FAILED;
2582     }
2583    
2584     ptr += 2;
2585     if (*ptr == '^')
2586     {
2587     local_negate = TRUE;
2588     ptr++;
2589     }
2590    
2591     posix_class = check_posix_name(ptr, tempptr - ptr);
2592     if (posix_class < 0)
2593     {
2594     *errorcodeptr = ERR30;
2595     goto FAILED;
2596     }
2597    
2598     /* If matching is caseless, upper and lower are converted to
2599     alpha. This relies on the fact that the class table starts with
2600     alpha, lower, upper as the first 3 entries. */
2601    
2602     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2603     posix_class = 0;
2604    
2605 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2606     because we may be adding and subtracting from it, and we don't want to
2607     subtract bits that may be in the main map already. At the end we or the
2608     result into the bit map that is being built. */
2609 nigel 77
2610     posix_class *= 3;
2611 nigel 87
2612     /* Copy in the first table (always present) */
2613    
2614     memcpy(pbits, cbits + posix_class_maps[posix_class],
2615     32 * sizeof(uschar));
2616    
2617     /* If there is a second table, add or remove it as required. */
2618    
2619     taboffset = posix_class_maps[posix_class + 1];
2620     tabopt = posix_class_maps[posix_class + 2];
2621    
2622     if (taboffset >= 0)
2623 nigel 77 {
2624 nigel 87 if (tabopt >= 0)
2625     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2626 nigel 77 else
2627 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2628 nigel 77 }
2629    
2630 nigel 87 /* Now see if we need to remove any special characters. An option
2631     value of 1 removes vertical space and 2 removes underscore. */
2632    
2633     if (tabopt < 0) tabopt = -tabopt;
2634     if (tabopt == 1) pbits[1] &= ~0x3c;
2635     else if (tabopt == 2) pbits[11] &= 0x7f;
2636    
2637     /* Add the POSIX table or its complement into the main table that is
2638     being built and we are done. */
2639    
2640     if (local_negate)
2641     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2642     else
2643     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2644    
2645 nigel 77 ptr = tempptr + 1;
2646     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2647     continue; /* End of POSIX syntax handling */
2648     }
2649    
2650     /* Backslash may introduce a single character, or it may introduce one
2651 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2652     case. Inside a class (and only there) it is treated as backspace.
2653     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2654 ph10 205 to 'or' into the one we are building. We assume they have more than one
2655 nigel 77 character in them, so set class_charcount bigger than one. */
2656    
2657     if (c == '\\')
2658     {
2659 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2660     if (*errorcodeptr != 0) goto FAILED;
2661 nigel 77
2662     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2663     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2664 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2665 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2666     {
2667     if (ptr[1] == '\\' && ptr[2] == 'E')
2668     {
2669     ptr += 2; /* avoid empty string */
2670     }
2671     else inescq = TRUE;
2672     continue;
2673     }
2674    
2675     if (c < 0)
2676     {
2677     register const uschar *cbits = cd->cbits;
2678     class_charcount += 2; /* Greater than 1 is what matters */
2679 nigel 93
2680     /* Save time by not doing this in the pre-compile phase. */
2681    
2682     if (lengthptr == NULL) switch (-c)
2683 nigel 77 {
2684     case ESC_d:
2685     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2686     continue;
2687    
2688     case ESC_D:
2689     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2690     continue;
2691    
2692     case ESC_w:
2693     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2694     continue;
2695    
2696     case ESC_W:
2697     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2698     continue;
2699    
2700     case ESC_s:
2701     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2702     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2703     continue;
2704    
2705     case ESC_S:
2706     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2707     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2708     continue;
2709    
2710 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2711     continue;
2712 ph10 180
2713 nigel 93 default: /* Not recognized; fall through */
2714     break; /* Need "default" setting to stop compiler warning. */
2715     }
2716    
2717     /* In the pre-compile phase, just do the recognition. */
2718    
2719     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2720     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2721 ph10 180
2722 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2723     they use extra memory. */
2724 ph10 180
2725 ph10 178 if (-c == ESC_h)
2726     {
2727     SETBIT(classbits, 0x09); /* VT */
2728     SETBIT(classbits, 0x20); /* SPACE */
2729 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2730 ph10 178 #ifdef SUPPORT_UTF8
2731     if (utf8)
2732 ph10 180 {
2733 ph10 178 class_utf8 = TRUE;
2734     *class_utf8data++ = XCL_SINGLE;
2735 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2736 ph10 178 *class_utf8data++ = XCL_SINGLE;
2737 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2738     *class_utf8data++ = XCL_RANGE;
2739     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2740     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2741 ph10 178 *class_utf8data++ = XCL_SINGLE;
2742 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2743 ph10 178 *class_utf8data++ = XCL_SINGLE;
2744 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2745 ph10 178 *class_utf8data++ = XCL_SINGLE;
2746 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2747     }
2748     #endif
2749     continue;
2750     }
2751 nigel 93
2752 ph10 178 if (-c == ESC_H)
2753     {
2754     for (c = 0; c < 32; c++)
2755     {
2756     int x = 0xff;
2757     switch (c)
2758 ph10 180 {
2759 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2760     case 0x20/8: x ^= 1 << (0x20%8); break;
2761     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2762     default: break;
2763     }
2764     classbits[c] |= x;
2765 ph10 180 }
2766    
2767 ph10 178 #ifdef SUPPORT_UTF8
2768     if (utf8)
2769 ph10 180 {
2770 ph10 178 class_utf8 = TRUE;
2771 ph10 180 *class_utf8data++ = XCL_RANGE;
2772     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2773     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2774     *class_utf8data++ = XCL_RANGE;
2775     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2776     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2777     *class_utf8data++ = XCL_RANGE;
2778     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2779     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2780     *class_utf8data++ = XCL_RANGE;
2781     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2782     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2783     *class_utf8data++ = XCL_RANGE;
2784     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2785     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2786     *class_utf8data++ = XCL_RANGE;
2787     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2788     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2789     *class_utf8data++ = XCL_RANGE;
2790     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2791     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2792     }
2793     #endif
2794     continue;
2795     }
2796 ph10 178
2797     if (-c == ESC_v)
2798     {
2799     SETBIT(classbits, 0x0a); /* LF */
2800     SETBIT(classbits, 0x0b); /* VT */
2801 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2802     SETBIT(classbits, 0x0d); /* CR */
2803     SETBIT(classbits, 0x85); /* NEL */
2804 ph10 178 #ifdef SUPPORT_UTF8
2805     if (utf8)
2806 ph10 180 {
2807 ph10 178 class_utf8 = TRUE;
2808 ph10 180 *class_utf8data++ = XCL_RANGE;
2809     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2810     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2811     }
2812     #endif
2813     continue;
2814     }
2815 ph10 178
2816     if (-c == ESC_V)
2817     {
2818     for (c = 0; c < 32; c++)
2819     {
2820     int x = 0xff;
2821     switch (c)
2822 ph10 180 {
2823 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2824     x ^= 1 << (0x0b%8);
2825     x ^= 1 << (0x0c%8);
2826 ph10 180 x ^= 1 << (0x0d%8);
2827 ph10 178 break;
2828     case 0x85/8: x ^= 1 << (0x85%8); break;
2829     default: break;
2830     }
2831     classbits[c] |= x;
2832 ph10 180 }
2833    
2834 ph10 178 #ifdef SUPPORT_UTF8
2835     if (utf8)
2836 ph10 180 {
2837 ph10 178 class_utf8 = TRUE;
2838 ph10 180 *class_utf8data++ = XCL_RANGE;
2839     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2840     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2841     *class_utf8data++ = XCL_RANGE;
2842     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2843     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2844     }
2845     #endif
2846     continue;
2847     }
2848 ph10 178
2849 nigel 93 /* We need to deal with \P and \p in both phases. */
2850    
2851 nigel 77 #ifdef SUPPORT_UCP
2852 nigel 93 if (-c == ESC_p || -c == ESC_P)
2853     {
2854     BOOL negated;
2855     int pdata;
2856     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2857     if (ptype < 0) goto FAILED;
2858     class_utf8 = TRUE;
2859     *class_utf8data++ = ((-c == ESC_p) != negated)?
2860     XCL_PROP : XCL_NOTPROP;
2861     *class_utf8data++ = ptype;
2862     *class_utf8data++ = pdata;
2863     class_charcount -= 2; /* Not a < 256 character */
2864 nigel 77 continue;
2865 nigel 93 }
2866 nigel 77 #endif
2867 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2868     strict mode. By default, for compatibility with Perl, they are
2869     treated as literals. */
2870 nigel 77
2871 nigel 93 if ((options & PCRE_EXTRA) != 0)
2872     {
2873     *errorcodeptr = ERR7;
2874     goto FAILED;
2875     }
2876 nigel 77
2877 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2878     c = *ptr; /* Get the final character and fall through */
2879 nigel 77 }
2880    
2881     /* Fall through if we have a single character (c >= 0). This may be
2882 nigel 93 greater than 256 in UTF-8 mode. */
2883 nigel 77
2884     } /* End of backslash handling */
2885    
2886     /* A single character may be followed by '-' to form a range. However,
2887     Perl does not permit ']' to be the end of the range. A '-' character
2888 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2889     entirely. The code for handling \Q and \E is messy. */
2890 nigel 77
2891 nigel 93 CHECK_RANGE:
2892     while (ptr[1] == '\\' && ptr[2] == 'E')
2893 nigel 77 {
2894 nigel 93 inescq = FALSE;
2895     ptr += 2;
2896     }
2897    
2898     oldptr = ptr;
2899    
2900     if (!inescq && ptr[1] == '-')
2901     {
2902 nigel 77 int d;
2903     ptr += 2;
2904 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2905 nigel 77
2906 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2907     mode. */
2908    
2909     while (*ptr == '\\' && ptr[1] == 'Q')
2910     {
2911     ptr += 2;
2912     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2913     inescq = TRUE;
2914     break;
2915     }
2916    
2917     if (*ptr == 0 || (!inescq && *ptr == ']'))
2918     {
2919     ptr = oldptr;
2920     goto LONE_SINGLE_CHARACTER;
2921     }
2922    
2923 nigel 77 #ifdef SUPPORT_UTF8
2924     if (utf8)
2925     { /* Braces are required because the */
2926     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2927     }
2928     else
2929     #endif
2930     d = *ptr; /* Not UTF-8 mode */
2931    
2932     /* The second part of a range can be a single-character escape, but
2933     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2934     in such circumstances. */
2935    
2936 nigel 93 if (!inescq && d == '\\')
2937 nigel 77 {
2938 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2939     if (*errorcodeptr != 0) goto FAILED;
2940 nigel 77
2941 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2942     special means the '-' was literal */
2943 nigel 77
2944     if (d < 0)
2945     {
2946     if (d == -ESC_b) d = '\b';
2947 nigel 93 else if (d == -ESC_X) d = 'X';
2948     else if (d == -ESC_R) d = 'R'; else
2949 nigel 77 {
2950 nigel 93 ptr = oldptr;
2951 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2952     }
2953     }
2954     }
2955    
2956 nigel 93 /* Check that the two values are in the correct order. Optimize
2957     one-character ranges */
2958 nigel 77
2959 nigel 93 if (d < c)
2960     {
2961     *errorcodeptr = ERR8;
2962     goto FAILED;
2963     }
2964    
2965 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2966    
2967     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2968     matching, we have to use an XCLASS with extra data items. Caseless
2969     matching for characters > 127 is available only if UCP support is
2970     available. */
2971    
2972     #ifdef SUPPORT_UTF8
2973     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2974     {
2975     class_utf8 = TRUE;
2976    
2977     /* With UCP support, we can find the other case equivalents of
2978     the relevant characters. There may be several ranges. Optimize how
2979     they fit with the basic range. */
2980    
2981     #ifdef SUPPORT_UCP
2982     if ((options & PCRE_CASELESS) != 0)
2983     {
2984 nigel 93 unsigned int occ, ocd;
2985     unsigned int cc = c;
2986     unsigned int origd = d;
2987 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2988     {
2989 ph10 180 if (occ >= (unsigned int)c &&
2990     ocd <= (unsigned int)d)
2991 ph10 176 continue; /* Skip embedded ranges */
2992 nigel 77
2993 ph10 180 if (occ < (unsigned int)c &&
2994 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2995 nigel 77 { /* if there is overlap, */
2996     c = occ; /* noting that if occ < c */
2997     continue; /* we can't have ocd > d */
2998     } /* because a subrange is */
2999 ph10 180 if (ocd > (unsigned int)d &&
3000 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3001 nigel 77 { /* the basic range. */
3002     d = ocd;
3003     continue;
3004     }
3005    
3006     if (occ == ocd)
3007     {
3008     *class_utf8data++ = XCL_SINGLE;
3009     }
3010     else
3011     {
3012     *class_utf8data++ = XCL_RANGE;
3013     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3014     }
3015     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3016     }
3017     }
3018     #endif /* SUPPORT_UCP */
3019    
3020     /* Now record the original range, possibly modified for UCP caseless
3021     overlapping ranges. */
3022    
3023     *class_utf8data++ = XCL_RANGE;
3024     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3025     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3026    
3027     /* With UCP support, we are done. Without UCP support, there is no
3028     caseless matching for UTF-8 characters > 127; we can use the bit map
3029     for the smaller ones. */
3030    
3031     #ifdef SUPPORT_UCP
3032     continue; /* With next character in the class */
3033     #else
3034     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3035    
3036     /* Adjust upper limit and fall through to set up the map */
3037    
3038     d = 127;
3039    
3040     #endif /* SUPPORT_UCP */
3041     }
3042     #endif /* SUPPORT_UTF8 */
3043    
3044     /* We use the bit map for all cases when not in UTF-8 mode; else
3045     ranges that lie entirely within 0-127 when there is UCP support; else
3046     for partial ranges without UCP support. */
3047    
3048 nigel 93 class_charcount += d - c + 1;
3049     class_lastchar = d;
3050    
3051     /* We can save a bit of time by skipping this in the pre-compile. */
3052    
3053     if (lengthptr == NULL) for (; c <= d; c++)
3054 nigel 77 {
3055     classbits[c/8] |= (1 << (c&7));
3056     if ((options & PCRE_CASELESS) != 0)
3057     {
3058     int uc = cd->fcc[c]; /* flip case */
3059     classbits[uc/8] |= (1 << (uc&7));
3060     }
3061     }
3062    
3063     continue; /* Go get the next char in the class */
3064     }
3065    
3066     /* Handle a lone single character - we can get here for a normal
3067     non-escape char, or after \ that introduces a single character or for an
3068     apparent range that isn't. */
3069    
3070     LONE_SINGLE_CHARACTER:
3071    
3072     /* Handle a character that cannot go in the bit map */
3073    
3074     #ifdef SUPPORT_UTF8
3075     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3076     {
3077     class_utf8 = TRUE;
3078     *class_utf8data++ = XCL_SINGLE;
3079     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3080    
3081     #ifdef SUPPORT_UCP
3082     if ((options & PCRE_CASELESS) != 0)
3083     {
3084 nigel 93 unsigned int othercase;
3085     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3086 nigel 77 {
3087     *class_utf8data++ = XCL_SINGLE;
3088     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3089     }
3090     }
3091     #endif /* SUPPORT_UCP */
3092    
3093     }
3094     else
3095     #endif /* SUPPORT_UTF8 */
3096    
3097     /* Handle a single-byte character */
3098     {
3099     classbits[c/8] |= (1 << (c&7));
3100     if ((options & PCRE_CASELESS) != 0)
3101     {
3102     c = cd->fcc[c]; /* flip case */
3103     classbits[c/8] |= (1 << (c&7));
3104     }
3105     class_charcount++;
3106     class_lastchar = c;
3107     }
3108     }
3109    
3110 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3111 nigel 77
3112 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3113 nigel 77
3114 nigel 93 if (c == 0) /* Missing terminating ']' */
3115     {
3116     *errorcodeptr = ERR6;
3117     goto FAILED;
3118     }
3119 ph10 208
3120 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3121     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3122     can optimize the negative case only if there were no characters >= 128
3123     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3124     single-bytes only. This is an historical hangover. Maybe one day we can
3125     tidy these opcodes to handle multi-byte characters.
3126    
3127     The optimization throws away the bit map. We turn the item into a
3128     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3129     that OP_NOT does not support multibyte characters. In the positive case, it
3130     can cause firstbyte to be set. Otherwise, there can be no first char if
3131     this item is first, whatever repeat count may follow. In the case of
3132     reqbyte, save the previous value for reinstating. */
3133    
3134     #ifdef SUPPORT_UTF8
3135     if (class_charcount == 1 &&
3136     (!utf8 ||
3137     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3138    
3139     #else
3140     if (class_charcount == 1)
3141     #endif
3142     {
3143     zeroreqbyte = reqbyte;
3144    
3145     /* The OP_NOT opcode works on one-byte characters only. */
3146    
3147     if (negate_class)
3148     {
3149     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3150     zerofirstbyte = firstbyte;
3151     *code++ = OP_NOT;
3152     *code++ = class_lastchar;
3153     break;
3154     }
3155    
3156     /* For a single, positive character, get the value into mcbuffer, and
3157     then we can handle this with the normal one-character code. */
3158    
3159     #ifdef SUPPORT_UTF8
3160     if (utf8 && class_lastchar > 127)
3161     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3162     else
3163     #endif
3164     {
3165     mcbuffer[0] = class_lastchar;
3166     mclength = 1;
3167     }
3168     goto ONE_CHAR;
3169     } /* End of 1-char optimization */
3170    
3171     /* The general case - not the one-char optimization. If this is the first
3172     thing in the branch, there can be no first char setting, whatever the
3173     repeat count. Any reqbyte setting must remain unchanged after any kind of
3174     repeat. */
3175    
3176     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3177     zerofirstbyte = firstbyte;
3178     zeroreqbyte = reqbyte;
3179    
3180     /* If there are characters with values > 255, we have to compile an
3181     extended class, with its own opcode. If there are no characters < 256,
3182 nigel 93 we can omit the bitmap in the actual compiled code. */
3183 nigel 77
3184     #ifdef SUPPORT_UTF8
3185     if (class_utf8)
3186     {
3187     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3188     *code++ = OP_XCLASS;
3189     code += LINK_SIZE;
3190     *code = negate_class? XCL_NOT : 0;
3191    
3192 nigel 93 /* If the map is required, move up the extra data to make room for it;
3193     otherwise just move the code pointer to the end of the extra data. */
3194 nigel 77
3195     if (class_charcount > 0)
3196     {
3197     *code++ |= XCL_MAP;
3198 nigel 93 memmove(code + 32, code, class_utf8data - code);
3199 nigel 77 memcpy(code, classbits, 32);
3200 nigel 93 code = class_utf8data + 32;
3201 nigel 77 }
3202 nigel 93 else code = class_utf8data;
3203 nigel 77
3204     /* Now fill in the complete length of the item */
3205    
3206     PUT(previous, 1, code - previous);
3207     break; /* End of class handling */
3208     }
3209     #endif
3210    
3211     /* If there are no characters > 255, negate the 32-byte map if necessary,
3212     and copy it into the code vector. If this is the first thing in the branch,
3213     there can be no first char setting, whatever the repeat count. Any reqbyte
3214     setting must remain unchanged after any kind of repeat. */
3215    
3216     if (negate_class)
3217     {
3218     *code++ = OP_NCLASS;
3219 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3220     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3221 nigel 77 }
3222     else
3223     {
3224     *code++ = OP_CLASS;
3225     memcpy(code, classbits, 32);
3226     }
3227     code += 32;
3228     break;
3229    
3230 nigel 93
3231     /* ===================================================================*/
3232 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3233     has been tested above. */
3234    
3235     case '{':
3236     if (!is_quantifier) goto NORMAL_CHAR;
3237     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3238     if (*errorcodeptr != 0) goto FAILED;
3239     goto REPEAT;
3240    
3241     case '*':
3242     repeat_min = 0;
3243     repeat_max = -1;
3244     goto REPEAT;
3245    
3246     case '+':
3247     repeat_min = 1;
3248     repeat_max = -1;
3249     goto REPEAT;
3250    
3251     case '?':
3252     repeat_min = 0;
3253     repeat_max = 1;
3254    
3255     REPEAT:
3256     if (previous == NULL)
3257     {
3258     *errorcodeptr = ERR9;
3259     goto FAILED;
3260     }
3261    
3262     if (repeat_min == 0)
3263     {
3264     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3265     reqbyte = zeroreqbyte; /* Ditto */
3266     }
3267    
3268     /* Remember whether this is a variable length repeat */
3269    
3270     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3271    
3272     op_type = 0; /* Default single-char op codes */
3273     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3274    
3275     /* Save start of previous item, in case we have to move it up to make space
3276     for an inserted OP_ONCE for the additional '+' extension. */
3277    
3278     tempcode = previous;
3279    
3280     /* If the next character is '+', we have a possessive quantifier. This
3281     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3282     If the next character is '?' this is a minimizing repeat, by default,
3283     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3284     repeat type to the non-default. */
3285    
3286     if (ptr[1] == '+')
3287     {
3288     repeat_type = 0; /* Force greedy */
3289     possessive_quantifier = TRUE;
3290     ptr++;
3291     }
3292     else if (ptr[1] == '?')
3293     {
3294     repeat_type = greedy_non_default;
3295     ptr++;
3296     }
3297     else repeat_type = greedy_default;
3298    
3299     /* If previous was a character match, abolish the item and generate a
3300     repeat item instead. If a char item has a minimum of more than one, ensure
3301     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3302     the first thing in a branch because the x will have gone into firstbyte
3303     instead. */
3304    
3305     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3306     {
3307     /* Deal with UTF-8 characters that take up more than one byte. It's
3308     easier to write this out separately than try to macrify it. Use c to
3309     hold the length of the character in bytes, plus 0x80 to flag that it's a
3310     length rather than a small character. */
3311    
3312     #ifdef SUPPORT_UTF8
3313     if (utf8 && (code[-1] & 0x80) != 0)
3314     {
3315     uschar *lastchar = code - 1;
3316     while((*lastchar & 0xc0) == 0x80) lastchar--;
3317     c = code - lastchar; /* Length of UTF-8 character */
3318     memcpy(utf8_char, lastchar, c); /* Save the char */
3319     c |= 0x80; /* Flag c as a length */
3320     }
3321     else
3322     #endif
3323    
3324     /* Handle the case of a single byte - either with no UTF8 support, or
3325     with UTF-8 disabled, or for a UTF-8 character < 128. */
3326    
3327     {
3328     c = code[-1];
3329     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3330     }
3331    
3332 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3333     the line is something that cannot possibly match this character. If so,
3334     automatically possessifying this item gains some performance in the case
3335     where the match fails. */
3336    
3337     if (!possessive_quantifier &&
3338     repeat_max < 0 &&
3339     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3340     options, cd))
3341     {
3342     repeat_type = 0; /* Force greedy */
3343     possessive_quantifier = TRUE;
3344     }
3345    
3346 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3347     }
3348    
3349     /* If previous was a single negated character ([^a] or similar), we use
3350     one of the special opcodes, replacing it. The code is shared with single-
3351     character repeats by setting op_type to add a suitable offset into
3352 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3353     currently used only for single-byte chars. */
3354 nigel 77
3355     else if (*previous == OP_NOT)
3356     {
3357     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3358     c = previous[1];
3359 nigel 93 if (!possessive_quantifier &&
3360     repeat_max < 0 &&
3361     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3362     {
3363     repeat_type = 0; /* Force greedy */
3364     possessive_quantifier = TRUE;
3365     }
3366 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3367     }
3368    
3369     /* If previous was a character type match (\d or similar), abolish it and
3370     create a suitable repeat item. The code is shared with single-character
3371     repeats by setting op_type to add a suitable offset into repeat_type. Note
3372     that the Unicode property types will be present only when SUPPORT_UCP is
3373     defined, but we don't wrap the little bits of code here because it just
3374     makes it horribly messy. */
3375    
3376     else if (*previous < OP_EODN)
3377     {
3378     uschar *oldcode;
3379 nigel 87 int prop_type, prop_value;
3380 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3381     c = *previous;
3382    
3383 nigel 93 if (!possessive_quantifier &&
3384     repeat_max < 0 &&
3385     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3386     {
3387     repeat_type = 0; /* Force greedy */
3388     possessive_quantifier = TRUE;
3389     }
3390    
3391 nigel 77 OUTPUT_SINGLE_REPEAT:
3392 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3393     {
3394     prop_type = previous[1];
3395     prop_value = previous[2];
3396     }
3397     else prop_type = prop_value = -1;
3398 nigel 77
3399     oldcode = code;
3400     code = previous; /* Usually overwrite previous item */
3401    
3402     /* If the maximum is zero then the minimum must also be zero; Perl allows
3403     this case, so we do too - by simply omitting the item altogether. */
3404    
3405     if (repeat_max == 0) goto END_REPEAT;
3406    
3407     /* All real repeats make it impossible to handle partial matching (maybe
3408     one day we will be able to remove this restriction). */
3409    
3410     if (repeat_max != 1) cd->nopartial = TRUE;
3411    
3412     /* Combine the op_type with the repeat_type */
3413    
3414     repeat_type += op_type;
3415    
3416     /* A minimum of zero is handled either as the special case * or ?, or as
3417     an UPTO, with the maximum given. */
3418    
3419     if (repeat_min == 0)
3420     {
3421     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3422     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3423     else
3424     {
3425     *code++ = OP_UPTO + repeat_type;
3426     PUT2INC(code, 0, repeat_max);
3427     }
3428     }
3429    
3430     /* A repeat minimum of 1 is optimized into some special cases. If the
3431 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3432 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3433     one less than the maximum. */
3434    
3435     else if (repeat_min == 1)
3436     {
3437     if (repeat_max == -1)
3438     *code++ = OP_PLUS + repeat_type;
3439     else
3440     {
3441     code = oldcode; /* leave previous item in place */
3442     if (repeat_max == 1) goto END_REPEAT;
3443     *code++ = OP_UPTO + repeat_type;
3444     PUT2INC(code, 0, repeat_max - 1);
3445     }
3446     }
3447    
3448     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3449     handled as an EXACT followed by an UPTO. */
3450    
3451     else
3452     {
3453     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3454     PUT2INC(code, 0, repeat_min);
3455    
3456     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3457     we have to insert the character for the previous code. For a repeated
3458 nigel 87 Unicode property match, there are two extra bytes that define the
3459 nigel 77 required property. In UTF-8 mode, long characters have their length in
3460     c, with the 0x80 bit as a flag. */
3461    
3462     if (repeat_max < 0)
3463     {
3464     #ifdef SUPPORT_UTF8
3465     if (utf8 && c >= 128)
3466     {
3467     memcpy(code, utf8_char, c & 7);
3468     code += c & 7;
3469     }
3470     else
3471     #endif
3472     {
3473     *code++ = c;
3474 nigel 87 if (prop_type >= 0)
3475     {
3476     *code++ = prop_type;
3477     *code++ = prop_value;
3478     }
3479 nigel 77 }
3480     *code++ = OP_STAR + repeat_type;
3481     }
3482    
3483     /* Else insert an UPTO if the max is greater than the min, again
3484 nigel 93 preceded by the character, for the previously inserted code. If the
3485     UPTO is just for 1 instance, we can use QUERY instead. */
3486 nigel 77
3487     else if (repeat_max != repeat_min)
3488     {
3489     #ifdef SUPPORT_UTF8
3490     if (utf8 && c >= 128)
3491     {
3492     memcpy(code, utf8_char, c & 7);
3493     code += c & 7;
3494     }
3495     else
3496     #endif
3497     *code++ = c;
3498 nigel 87 if (prop_type >= 0)
3499     {
3500     *code++ = prop_type;
3501     *code++ = prop_value;
3502     }
3503 nigel 77 repeat_max -= repeat_min;
3504 nigel 93
3505     if (repeat_max == 1)
3506     {
3507     *code++ = OP_QUERY + repeat_type;
3508     }
3509     else
3510     {
3511     *code++ = OP_UPTO + repeat_type;
3512     PUT2INC(code, 0, repeat_max);
3513     }
3514 nigel 77 }
3515     }
3516    
3517     /* The character or character type itself comes last in all cases. */
3518    
3519     #ifdef SUPPORT_UTF8
3520     if (utf8 && c >= 128)
3521     {
3522     memcpy(code, utf8_char, c & 7);
3523     code += c & 7;
3524     }
3525     else
3526     #endif
3527     *code++ = c;
3528    
3529 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3530     define the required property. */
3531 nigel 77
3532     #ifdef SUPPORT_UCP
3533 nigel 87 if (prop_type >= 0)
3534     {
3535     *code++ = prop_type;
3536     *code++ = prop_value;
3537     }
3538 nigel 77 #endif
3539     }
3540    
3541     /* If previous was a character class or a back reference, we put the repeat
3542     stuff after it, but just skip the item if the repeat was {0,0}. */
3543    
3544     else if (*previous == OP_CLASS ||
3545     *previous == OP_NCLASS ||
3546     #ifdef SUPPORT_UTF8
3547     *previous == OP_XCLASS ||
3548     #endif
3549     *previous == OP_REF)
3550     {
3551     if (repeat_max == 0)
3552     {
3553     code = previous;
3554     goto END_REPEAT;
3555     }
3556    
3557     /* All real repeats make it impossible to handle partial matching (maybe
3558     one day we will be able to remove this restriction). */
3559    
3560     if (repeat_max != 1) cd->nopartial = TRUE;
3561    
3562     if (repeat_min == 0 && repeat_max == -1)
3563     *code++ = OP_CRSTAR + repeat_type;
3564     else if (repeat_min == 1 && repeat_max == -1)
3565     *code++ = OP_CRPLUS + repeat_type;
3566     else if (repeat_min == 0 && repeat_max == 1)
3567     *code++ = OP_CRQUERY + repeat_type;
3568     else
3569     {
3570     *code++ = OP_CRRANGE + repeat_type;
3571     PUT2INC(code, 0, repeat_min);
3572     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3573     PUT2INC(code, 0, repeat_max);
3574     }
3575     }
3576    
3577     /* If previous was a bracket group, we may have to replicate it in certain
3578     cases. */
3579    
3580 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3581     *previous == OP_ONCE || *previous == OP_COND)
3582 nigel 77 {
3583     register int i;
3584     int ketoffset = 0;
3585     int len = code - previous;
3586     uschar *bralink = NULL;
3587    
3588 nigel 93 /* Repeating a DEFINE group is pointless */
3589    
3590     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3591     {
3592     *errorcodeptr = ERR55;
3593     goto FAILED;
3594     }
3595    
3596 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3597     by scanning through from the start, and compute the offset back to it
3598     from the current code pointer. There may be an OP_OPT setting following
3599     the final KET, so we can't find the end just by going back from the code
3600     pointer. */
3601    
3602     if (repeat_max == -1)
3603     {
3604     register uschar *ket = previous;
3605     do ket += GET(ket, 1); while (*ket != OP_KET);
3606     ketoffset = code - ket;
3607     }
3608    
3609     /* The case of a zero minimum is special because of the need to stick
3610     OP_BRAZERO in front of it, and because the group appears once in the
3611     data, whereas in other cases it appears the minimum number of times. For
3612     this reason, it is simplest to treat this case separately, as otherwise
3613     the code gets far too messy. There are several special subcases when the
3614     minimum is zero. */
3615    
3616     if (repeat_min == 0)
3617     {
3618     /* If the maximum is also zero, we just omit the group from the output
3619     altogether. */
3620    
3621     if (repeat_max == 0)
3622     {
3623     code = previous;
3624     goto END_REPEAT;
3625     }
3626    
3627     /* If the maximum is 1 or unlimited, we just have to stick in the
3628     BRAZERO and do no more at this point. However, we do need to adjust
3629     any OP_RECURSE calls inside the group that refer to the group itself or
3630 nigel 93 any internal or forward referenced group, because the offset is from
3631     the start of the whole regex. Temporarily terminate the pattern while
3632     doing this. */
3633 nigel 77
3634     if (repeat_max <= 1)
3635     {
3636     *code = OP_END;
3637 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3638 nigel 77 memmove(previous+1, previous, len);
3639     code++;
3640     *previous++ = OP_BRAZERO + repeat_type;
3641     }
3642    
3643     /* If the maximum is greater than 1 and limited, we have to replicate
3644     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3645     The first one has to be handled carefully because it's the original
3646     copy, which has to be moved up. The remainder can be handled by code
3647     that is common with the non-zero minimum case below. We have to
3648     adjust the value of repeat_max, since one less copy is required. Once
3649     again, we may have to adjust any OP_RECURSE calls inside the group. */
3650    
3651     else
3652     {
3653     int offset;
3654     *code = OP_END;
3655 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3656 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3657     code += 2 + LINK_SIZE;
3658     *previous++ = OP_BRAZERO + repeat_type;
3659     *previous++ = OP_BRA;
3660    
3661     /* We chain together the bracket offset fields that have to be
3662     filled in later when the ends of the brackets are reached. */
3663    
3664     offset = (bralink == NULL)? 0 : previous - bralink;
3665     bralink = previous;
3666     PUTINC(previous, 0, offset);
3667     }
3668    
3669     repeat_max--;
3670     }
3671    
3672     /* If the minimum is greater than zero, replicate the group as many
3673     times as necessary, and adjust the maximum to the number of subsequent
3674     copies that we need. If we set a first char from the group, and didn't
3675 nigel 93 set a required char, copy the latter from the former. If there are any
3676     forward reference subroutine calls in the group, there will be entries on
3677     the workspace list; replicate these with an appropriate increment. */
3678 nigel 77
3679     else
3680     {
3681     if (repeat_min > 1)
3682     {
3683 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3684 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3685     potential integer overflow. */
3686 nigel 93
3687     if (lengthptr != NULL)
3688 ph10 202 {
3689     int delta = (repeat_min - 1)*length_prevgroup;
3690     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3691     (double)INT_MAX ||
3692     OFLOW_MAX - *lengthptr < delta)
3693     {
3694     *errorcodeptr = ERR20;
3695     goto FAILED;
3696     }
3697     *lengthptr += delta;
3698     }
3699 nigel 93
3700     /* This is compiling for real */
3701    
3702     else
3703 nigel 77 {
3704 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3705     for (i = 1; i < repeat_min; i++)
3706     {
3707     uschar *hc;
3708     uschar *this_hwm = cd->hwm;
3709     memcpy(code, previous, len);
3710     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3711     {
3712     PUT(cd->hwm, 0, GET(hc, 0) + len);
3713     cd->hwm += LINK_SIZE;
3714     }
3715     save_hwm = this_hwm;
3716     code += len;
3717     }
3718 nigel 77 }
3719     }
3720 nigel 93
3721 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3722     }
3723    
3724     /* This code is common to both the zero and non-zero minimum cases. If
3725     the maximum is limited, it replicates the group in a nested fashion,
3726     remembering the bracket starts on a stack. In the case of a zero minimum,
3727     the first one was set up above. In all cases the repeat_max now specifies
3728 nigel 93 the number of additional copies needed. Again, we must remember to
3729     replicate entries on the forward reference list. */
3730 nigel 77
3731     if (repeat_max >= 0)
3732     {
3733 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3734     just adjust the length as if we had. For each repetition we must add 1
3735     to the length for BRAZERO and for all but the last repetition we must
3736 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3737     paranoid checks to avoid integer overflow. */
3738 nigel 93
3739     if (lengthptr != NULL && repeat_max > 0)
3740 ph10 202 {
3741     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3742     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3743     if ((double)repeat_max *
3744     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3745     > (double)INT_MAX ||
3746     OFLOW_MAX - *lengthptr < delta)
3747     {
3748     *errorcodeptr = ERR20;
3749     goto FAILED;
3750     }
3751     *lengthptr += delta;
3752     }
3753 nigel 93
3754     /* This is compiling for real */
3755    
3756     else for (i = repeat_max - 1; i >= 0; i--)
3757 nigel 77 {
3758 nigel 93 uschar *hc;
3759     uschar *this_hwm = cd->hwm;
3760    
3761 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3762    
3763     /* All but the final copy start a new nesting, maintaining the
3764     chain of brackets outstanding. */
3765    
3766     if (i != 0)
3767     {
3768     int offset;
3769     *code++ = OP_BRA;
3770     offset = (bralink == NULL)? 0 : code - bralink;
3771     bralink = code;
3772     PUTINC(code, 0, offset);
3773     }
3774    
3775     memcpy(code, previous, len);
3776 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3777     {
3778     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3779     cd->hwm += LINK_SIZE;
3780     }
3781     save_hwm = this_hwm;
3782 nigel 77 code += len;
3783     }
3784    
3785     /* Now chain through the pending brackets, and fill in their length
3786     fields (which are holding the chain links pro tem). */
3787    
3788     while (bralink != NULL)
3789     {
3790     int oldlinkoffset;
3791     int offset = code - bralink + 1;
3792     uschar *bra = code - offset;
3793     oldlinkoffset = GET(bra, 1);
3794     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3795     *code++ = OP_KET;
3796     PUTINC(code, 0, offset);
3797     PUT(bra, 1, offset);
3798     }
3799     }
3800    
3801     /* If the maximum is unlimited, set a repeater in the final copy. We
3802     can't just offset backwards from the current code point, because we
3803     don't know if there's been an options resetting after the ket. The
3804 nigel 93 correct offset was computed above.
3805 nigel 77
3806 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3807     this group is a non-atomic one that could match an empty string. If so,
3808     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3809     that runtime checking can be done. [This check is also applied to
3810     atomic groups at runtime, but in a different way.] */
3811    
3812     else
3813     {
3814     uschar *ketcode = code - ketoffset;
3815     uschar *bracode = ketcode - GET(ketcode, 1);
3816     *ketcode = OP_KETRMAX + repeat_type;
3817     if (lengthptr == NULL && *bracode != OP_ONCE)
3818     {
3819     uschar *scode = bracode;
3820     do
3821     {
3822     if (could_be_empty_branch(scode, ketcode, utf8))
3823     {
3824     *bracode += OP_SBRA - OP_BRA;
3825     break;
3826     }
3827     scode += GET(scode, 1);
3828     }
3829     while (*scode == OP_ALT);
3830     }
3831     }
3832 nigel 77 }
3833    
3834     /* Else there's some kind of shambles */
3835    
3836     else
3837     {
3838     *errorcodeptr = ERR11;
3839     goto FAILED;
3840     }
3841    
3842 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3843     tests above succeeded, possessive_quantifier is TRUE. For some of the
3844     simpler opcodes, there is a special alternative opcode for this. For
3845     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3846     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3847     but the special opcodes can optimize it a bit. The repeated item starts at
3848     tempcode, not at previous, which might be the first part of a string whose
3849     (former) last char we repeated.
3850 nigel 77
3851 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3852     an 'upto' may follow. We skip over an 'exact' item, and then test the
3853     length of what remains before proceeding. */
3854    
3855 nigel 77 if (possessive_quantifier)
3856     {
3857 nigel 93 int len;
3858     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3859     *tempcode == OP_NOTEXACT)
3860     tempcode += _pcre_OP_lengths[*tempcode];
3861     len = code - tempcode;
3862     if (len > 0) switch (*tempcode)
3863     {
3864     case OP_STAR: *tempcode = OP_POSSTAR; break;
3865     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3866     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3867     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3868    
3869     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3870     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3871     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3872     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3873    
3874     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3875     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3876     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3877     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3878    
3879     default:
3880     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3881     code += 1 + LINK_SIZE;
3882     len += 1 + LINK_SIZE;
3883     tempcode[0] = OP_ONCE;
3884     *code++ = OP_KET;
3885     PUTINC(code, 0, len);
3886     PUT(tempcode, 1, len);
3887     break;
3888     }
3889 nigel 77 }
3890    
3891     /* In all cases we no longer have a previous item. We also set the
3892     "follows varying string" flag for subsequently encountered reqbytes if
3893     it isn't already set and we have just passed a varying length item. */
3894    
3895     END_REPEAT:
3896     previous = NULL;
3897     cd->req_varyopt |= reqvary;
3898     break;
3899    
3900    
3901 nigel 93 /* ===================================================================*/
3902     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3903     lookbehind or option setting or condition or all the other extended
3904 ph10 210 parenthesis forms. */
3905 nigel 77
3906     case '(':
3907     newoptions = options;
3908     skipbytes = 0;
3909 nigel 93 bravalue = OP_CBRA;
3910     save_hwm = cd->hwm;
3911 ph10 180 reset_bracount = FALSE;
3912 ph10 211
3913 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
3914 ph10 211
3915 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
3916     {
3917 ph10 211 int i, namelen;
3918 ph10 210 const uschar *name = ++ptr;
3919     previous = NULL;
3920     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
3921     if (*ptr == ':')
3922     {
3923     *errorcodeptr = ERR59; /* Not supported */
3924 ph10 211 goto FAILED;
3925     }
3926 ph10 210 if (*ptr != ')')
3927     {
3928     *errorcodeptr = ERR60;
3929     goto FAILED;
3930     }
3931 ph10 211 namelen = ptr - name;
3932 ph10 210 for (i = 0; i < verbcount; i++)
3933 ph10 211 {
3934 ph10 210 if (namelen == verbs[i].len &&
3935     strncmp((char *)name, verbs[i].name, namelen) == 0)
3936     {
3937     *code = verbs[i].op;
3938     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
3939     break;
3940 ph10 211 }
3941     }
3942     if (i < verbcount) continue;
3943 ph10 210 *errorcodeptr = ERR60;
3944 ph10 211 goto FAILED;
3945     }
3946    
3947 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
3948     appearance of any of them means that this is not a capturing group. */
3949 nigel 77
3950 ph10 210 else if (*ptr == '?')
3951 nigel 77 {
3952 nigel 93 int i, set, unset, namelen;
3953 nigel 77 int *optset;
3954 nigel 93 const uschar *name;
3955     uschar *slot;
3956 nigel 77
3957     switch (*(++ptr))
3958     {
3959     case '#': /* Comment; skip to ket */
3960     ptr++;
3961 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3962     if (*ptr == 0)
3963     {
3964     *errorcodeptr = ERR18;
3965     goto FAILED;
3966     }
3967 nigel 77 continue;
3968    
3969 nigel 93
3970     /* ------------------------------------------------------------ */
3971 ph10 175 case '|': /* Reset capture count for each branch */
3972     reset_bracount = TRUE;
3973 ph10 180 /* Fall through */
3974 ph10 175
3975     /* ------------------------------------------------------------ */
3976 nigel 93 case ':': /* Non-capturing bracket */
3977 nigel 77 bravalue = OP_BRA;
3978     ptr++;
3979     break;
3980    
3981 nigel 93
3982     /* ------------------------------------------------------------ */
3983 nigel 77 case '(':
3984     bravalue = OP_COND; /* Conditional group */
3985    
3986 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3987     group), a name (referring to a named group), or 'R', referring to
3988     recursion. R<digits> and R&name are also permitted for recursion tests.
3989 nigel 77
3990 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3991     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3992    
3993     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3994     be the recursive thing or the name 'R' (and similarly for 'R' followed
3995     by digits), and (b) a number could be a name that consists of digits.
3996     In both cases, we look for a name first; if not found, we try the other
3997     cases. */
3998    
3999     /* For conditions that are assertions, check the syntax, and then exit
4000     the switch. This will take control down to where bracketed groups,
4001     including assertions, are processed. */
4002    
4003     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4004     break;
4005    
4006     /* Most other conditions use OP_CREF (a couple change to OP_RREF
4007     below), and all need to skip 3 bytes at the start of the group. */
4008    
4009     code[1+LINK_SIZE] = OP_CREF;
4010     skipbytes = 3;
4011 ph10 172 refsign = -1;
4012 nigel 93
4013     /* Check for a test for recursion in a named group. */
4014    
4015     if (ptr[1] == 'R' && ptr[2] == '&')
4016 nigel 77 {
4017 nigel 93 terminator = -1;
4018     ptr += 2;
4019     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4020     }
4021 nigel 91
4022 nigel 93 /* Check for a test for a named group's having been set, using the Perl
4023     syntax (?(<name>) or (?('name') */
4024 nigel 91
4025 nigel 93 else if (ptr[1] == '<')
4026     {
4027     terminator = '>';
4028     ptr++;
4029     }
4030     else if (ptr[1] == '\'')
4031     {
4032     terminator = '\'';
4033     ptr++;
4034     }
4035 ph10 172 else
4036 ph10 167 {
4037     terminator = 0;
4038 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4039     }
4040 nigel 77
4041 nigel 93 /* We now expect to read a name; anything else is an error */
4042 nigel 77
4043 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4044     {
4045     ptr += 1; /* To get the right offset */
4046     *errorcodeptr = ERR28;
4047     goto FAILED;
4048     }
4049    
4050     /* Read the name, but also get it as a number if it's all digits */
4051    
4052     recno = 0;
4053     name = ++ptr;
4054     while ((cd->ctypes[*ptr] & ctype_word) != 0)
4055     {
4056     if (recno >= 0)
4057     recno = ((digitab[*ptr] & ctype_digit) != 0)?
4058     recno * 10 + *ptr - '0' : -1;
4059 nigel 91 ptr++;
4060 nigel 93 }
4061     namelen = ptr - name;
4062 nigel 91
4063 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4064     {
4065     ptr--; /* Error offset */
4066     *errorcodeptr = ERR26;
4067     goto FAILED;
4068     }
4069 nigel 91
4070 nigel 93 /* Do no further checking in the pre-compile phase. */
4071 nigel 91
4072 nigel 93 if (lengthptr != NULL) break;
4073 nigel 91
4074 nigel 93 /* In the real compile we do the work of looking for the actual
4075 ph10 167 reference. If the string started with "+" or "-" we require the rest to
4076     be digits, in which case recno will be set. */
4077 ph10 172
4078 ph10 167 if (refsign > 0)
4079     {
4080     if (recno <= 0)
4081     {
4082     *errorcodeptr = ERR58;
4083     goto FAILED;
4084 ph10 172 }
4085 ph10 167 if (refsign == '-')
4086     {
4087 ph10 172 recno = cd->bracount - recno + 1;
4088 ph10 167 if (recno <= 0)
4089     {
4090     *errorcodeptr = ERR15;
4091     goto FAILED;