/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 199 - (hide annotations) (download)
Tue Jul 31 14:39:09 2007 UTC (5 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 183796 byte(s)
Daniel's patch for config.h and Windows DLL declarations (not fully working).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <limits.h>

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */

#include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as SETBIT(map,
c + 1) address the intended byte and bit. Note that "b" is evaluated twice, so
it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68    
69 nigel 77 /*************************************************
70     * Code parameters and static tables *
71     *************************************************/
72    
73 nigel 93 /* This value specifies the size of stack workspace that is used during the
74     first pre-compile phase that determines how much memory is required. The regex
75     is partly compiled into this space, but the compiled parts are discarded as
76     soon as they can be, so that hopefully there will never be an overrun. The code
77     does, however, check for an overrun. The largest amount I've seen used is 218,
78     so this number is very generous.
79 nigel 77
80 nigel 93 The same workspace is used during the second, actual compile phase for
81     remembering forward references to groups so that they can be filled in at the
82     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
83     is 4 there is plenty of room. */
84 nigel 77
85 nigel 93 #define COMPILE_WORK_SIZE (4096)
86 nigel 77
87 nigel 93
88 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
89     are simple data values; negative values are for special things like \d and so
90     on. Zero means further processing is needed (for things like \x), or the escape
91     is invalid. */
92    
93 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
94 nigel 77 static const short int escapes[] = {
95     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
96     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
97     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
98 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
99     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
100 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
101     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
102 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
103     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
104 nigel 77 0, 0, -ESC_z /* x - z */
105     };
106    
107 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
108 nigel 77 static const short int escapes[] = {
109     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
110     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
111     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
112     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
113     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
114     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
115     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
116     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
117 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
118 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
119 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
120 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
121 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
122     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
123     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
124     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
125 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
126 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
127 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
128 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
129 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
130     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
131     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
132     };
133     #endif
134    
135    
136     /* Tables of names of POSIX character classes and their lengths. The list is
137 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
138 nigel 77 as this is assumed for handling case independence. */
139    
140     static const char *const posix_names[] = {
141     "alpha", "lower", "upper",
142     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
143     "print", "punct", "space", "word", "xdigit" };
144    
145     static const uschar posix_name_lengths[] = {
146     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
147    
148 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
149     base map, with an optional addition or removal of another map. Then, for some
150     classes, there is some additional tweaking: for [:blank:] the vertical space
151     characters are removed, and for [:alpha:] and [:alnum:] the underscore
152     character is removed. The triples in the table consist of the base map offset,
153     second map offset or -1 if no second map, and a non-negative value for map
154     addition or a negative value for map subtraction (if there are two maps). The
155     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
156     remove vertical space characters, 2 => remove underscore. */
157 nigel 77
158     static const int posix_class_maps[] = {
159 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
160     cbit_lower, -1, 0, /* lower */
161     cbit_upper, -1, 0, /* upper */
162     cbit_word, -1, 2, /* alnum - word without underscore */
163     cbit_print, cbit_cntrl, 0, /* ascii */
164     cbit_space, -1, 1, /* blank - a GNU extension */
165     cbit_cntrl, -1, 0, /* cntrl */
166     cbit_digit, -1, 0, /* digit */
167     cbit_graph, -1, 0, /* graph */
168     cbit_print, -1, 0, /* print */
169     cbit_punct, -1, 0, /* punct */
170     cbit_space, -1, 0, /* space */
171     cbit_word, -1, 0, /* word - a Perl extension */
172     cbit_xdigit,-1, 0 /* xdigit */
173 nigel 77 };
174    
175    
176 nigel 93 #define STRING(a) # a
177     #define XSTRING(s) STRING(s)
178    
179 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
180 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
181     they are documented. Always add a new error instead. Messages marked DEAD below
182     are no longer used. */
183 nigel 77
184     static const char *error_texts[] = {
185     "no error",
186     "\\ at end of pattern",
187     "\\c at end of pattern",
188     "unrecognized character follows \\",
189     "numbers out of order in {} quantifier",
190     /* 5 */
191     "number too big in {} quantifier",
192     "missing terminating ] for character class",
193     "invalid escape sequence in character class",
194     "range out of order in character class",
195     "nothing to repeat",
196     /* 10 */
197 nigel 93 "operand of unlimited repeat could match the empty string", /** DEAD **/
198 nigel 77 "internal error: unexpected repeat",
199     "unrecognized character after (?",
200     "POSIX named classes are supported only within a class",
201     "missing )",
202     /* 15 */
203     "reference to non-existent subpattern",
204     "erroffset passed as NULL",
205     "unknown option bit(s) set",
206     "missing ) after comment",
207 nigel 93 "parentheses nested too deeply", /** DEAD **/
208 nigel 77 /* 20 */
209     "regular expression too large",
210     "failed to get memory",
211     "unmatched parentheses",
212     "internal error: code overflow",
213     "unrecognized character after (?<",
214     /* 25 */
215     "lookbehind assertion is not fixed length",
216 nigel 91 "malformed number or name after (?(",
217 nigel 77 "conditional group contains more than two branches",
218     "assertion expected after (?(",
219 ph10 166 "(?R or (?[+-]digits must be followed by )",
220 nigel 77 /* 30 */
221     "unknown POSIX class name",
222     "POSIX collating elements are not supported",
223     "this version of PCRE is not compiled with PCRE_UTF8 support",
224 nigel 93 "spare error", /** DEAD **/
225 nigel 77 "character value in \\x{...} sequence is too large",
226     /* 35 */
227     "invalid condition (?(0)",
228     "\\C not allowed in lookbehind assertion",
229     "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
230     "number after (?C is > 255",
231     "closing ) for (?C expected",
232     /* 40 */
233     "recursive call could loop indefinitely",
234     "unrecognized character after (?P",
235 nigel 93 "syntax error in subpattern name (missing terminator)",
236 nigel 91 "two named subpatterns have the same name",
237 nigel 77 "invalid UTF-8 string",
238     /* 45 */
239     "support for \\P, \\p, and \\X has not been compiled",
240     "malformed \\P or \\p sequence",
241 nigel 91 "unknown property name after \\P or \\p",
242 nigel 93 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
243     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
244 nigel 91 /* 50 */
245     "repeated subpattern is too long",
246 nigel 93 "octal value is greater than \\377 (not in UTF-8 mode)",
247     "internal error: overran compiling workspace",
248     "internal error: previously-checked referenced subpattern not found",
249     "DEFINE group contains more than one branch",
250     /* 55 */
251     "repeating a DEFINE group is not allowed",
252     "inconsistent NEWLINE options",
253 ph10 171 "\\g is not followed by a braced name or an optionally braced non-zero number",
254 ph10 172 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
255 nigel 77 };
256    
257    
258     /* Table to identify digits and hex digits. This is used when compiling
259     patterns. Note that the tables in chartables are dependent on the locale, and
260     may mark arbitrary characters as digits - but the PCRE compiling code expects
261     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
262     a private table here. It costs 256 bytes, but it is a lot faster than doing
263     character value tests (at least in some simple cases I timed), and in some
264     applications one wants PCRE to compile efficiently as well as match
265     efficiently.
266    
267     For convenience, we use the same bit definitions as in chartables:
268    
269     0x04 decimal digit
270     0x08 hexadecimal digit
271    
272     Then we can use ctype_digit and ctype_xdigit in the code. */
273    
274 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
275 nigel 77 static const unsigned char digitab[] =
276     {
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
280     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
281     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
283     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
284     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
285     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
289     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
301     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
306     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
307     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
308     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
309    
310 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
311 nigel 77 static const unsigned char digitab[] =
312     {
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
315     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
316     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
320     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
321     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
322     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
324 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
325 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
329     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
334     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
337     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
343     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
344     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
345    
346     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
347     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
348     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
349     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
351     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
355     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
356     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
358 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
359 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
360     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
361     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
362     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
363     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
364     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
365     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
366     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
367     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
368     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
369     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
370     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
371     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
372     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
373     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
374     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
375     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
376     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
377     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
378     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
379     #endif
380    
381    
382     /* Definition to allow mutual recursion */
383    
384     static BOOL
385 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
386 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
387 nigel 77
388    
389    
390     /*************************************************
391     * Handle escapes *
392     *************************************************/
393    
394     /* This function is called when a \ has been encountered. It either returns a
395     positive value for a simple escape such as \n, or a negative value which
396 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
397     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
398     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
399     ptr is pointing at the \. On exit, it is on the final character of the escape
400     sequence.
401 nigel 77
402     Arguments:
403     ptrptr points to the pattern position pointer
404     errorcodeptr points to the errorcode variable
405     bracount number of previous extracting brackets
406     options the options bits
407     isclass TRUE if inside a character class
408    
409     Returns: zero or positive => a data character
410     negative => a special escape sequence
411     on error, errorptr is set
412     */
413    
414     static int
415     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
416     int options, BOOL isclass)
417     {
418 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
419     const uschar *ptr = *ptrptr + 1;
420 nigel 77 int c, i;
421    
422 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
423     ptr--; /* Set pointer back to the last byte */
424    
425 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
426    
427     if (c == 0) *errorcodeptr = ERR1;
428    
429     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
430     a table. A non-zero result is something that can be returned immediately.
431     Otherwise further processing may be required. */
432    
433 ph10 97 #ifndef EBCDIC /* ASCII coding */
434 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
435     else if ((i = escapes[c - '0']) != 0) c = i;
436    
437 ph10 97 #else /* EBCDIC coding */
438 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
439     else if ((i = escapes[c - 0x48]) != 0) c = i;
440     #endif
441    
442     /* Escapes that need further processing, or are illegal. */
443    
444     else
445     {
446     const uschar *oldptr;
447 nigel 93 BOOL braced, negated;
448    
449 nigel 77 switch (c)
450     {
451     /* A number of Perl escapes are not handled by PCRE. We give an explicit
452     error. */
453    
454     case 'l':
455     case 'L':
456     case 'N':
457     case 'u':
458     case 'U':
459     *errorcodeptr = ERR37;
460     break;
461    
462 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
463     is an absolute backreference. If negative, it is a relative backreference.
464 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
465     reference to a named group. This is part of Perl's movement towards a
466     unified syntax for back references. As this is synonymous with \k{name}, we
467 ph10 171 fudge it up by pretending it really was \k. */
468 nigel 93
469     case 'g':
470     if (ptr[1] == '{')
471     {
472 ph10 171 const uschar *p;
473     for (p = ptr+2; *p != 0 && *p != '}'; p++)
474     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
475 ph10 172 if (*p != 0 && *p != '}')
476 ph10 171 {
477     c = -ESC_k;
478     break;
479 ph10 172 }
480 nigel 93 braced = TRUE;
481     ptr++;
482     }
483     else braced = FALSE;
484    
485     if (ptr[1] == '-')
486     {
487     negated = TRUE;
488     ptr++;
489     }
490     else negated = FALSE;
491    
492     c = 0;
493     while ((digitab[ptr[1]] & ctype_digit) != 0)
494     c = c * 10 + *(++ptr) - '0';
495    
496     if (c == 0 || (braced && *(++ptr) != '}'))
497     {
498     *errorcodeptr = ERR57;
499     return 0;
500     }
501    
502     if (negated)
503     {
504     if (c > bracount)
505     {
506     *errorcodeptr = ERR15;
507     return 0;
508     }
509     c = bracount - (c - 1);
510     }
511    
512     c = -(ESC_REF + c);
513     break;
514    
515 nigel 77 /* The handling of escape sequences consisting of a string of digits
516     starting with one that is not zero is not straightforward. By experiment,
517     the way Perl works seems to be as follows:
518    
519     Outside a character class, the digits are read as a decimal number. If the
520     number is less than 10, or if there are that many previous extracting
521     left brackets, then it is a back reference. Otherwise, up to three octal
522     digits are read to form an escaped byte. Thus \123 is likely to be octal
523     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
524     value is greater than 377, the least significant 8 bits are taken. Inside a
525     character class, \ followed by a digit is always an octal number. */
526    
527     case '1': case '2': case '3': case '4': case '5':
528     case '6': case '7': case '8': case '9':
529    
530     if (!isclass)
531     {
532     oldptr = ptr;
533     c -= '0';
534     while ((digitab[ptr[1]] & ctype_digit) != 0)
535     c = c * 10 + *(++ptr) - '0';
536     if (c < 10 || c <= bracount)
537     {
538     c = -(ESC_REF + c);
539     break;
540     }
541     ptr = oldptr; /* Put the pointer back and fall through */
542     }
543    
544     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
545     generates a binary zero byte and treats the digit as a following literal.
546     Thus we have to pull back the pointer by one. */
547    
548     if ((c = *ptr) >= '8')
549     {
550     ptr--;
551     c = 0;
552     break;
553     }
554    
555     /* \0 always starts an octal number, but we may drop through to here with a
556 nigel 91 larger first octal digit. The original code used just to take the least
557     significant 8 bits of octal numbers (I think this is what early Perls used
558     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
559     than 3 octal digits. */
560 nigel 77
561     case '0':
562     c -= '0';
563     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
564     c = c * 8 + *(++ptr) - '0';
565 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
566 nigel 77 break;
567    
568 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
569     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
570     treated as a data character. */
571 nigel 77
572     case 'x':
573 nigel 87 if (ptr[1] == '{')
574 nigel 77 {
575     const uschar *pt = ptr + 2;
576 nigel 87 int count = 0;
577    
578 nigel 77 c = 0;
579     while ((digitab[*pt] & ctype_xdigit) != 0)
580     {
581 nigel 87 register int cc = *pt++;
582     if (c == 0 && cc == '0') continue; /* Leading zeroes */
583 nigel 77 count++;
584 nigel 87
585 ph10 97 #ifndef EBCDIC /* ASCII coding */
586 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
587 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
588 ph10 97 #else /* EBCDIC coding */
589 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
590 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
591 nigel 77 #endif
592     }
593 nigel 87
594 nigel 77 if (*pt == '}')
595     {
596 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
597 nigel 77 ptr = pt;
598     break;
599     }
600 nigel 87
601 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
602     recognize this construct; fall through to the normal \x handling. */
603     }
604    
605 nigel 87 /* Read just a single-byte hex-defined char */
606 nigel 77
607     c = 0;
608     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
609     {
610     int cc; /* Some compilers don't like ++ */
611     cc = *(++ptr); /* in initializers */
612 ph10 97 #ifndef EBCDIC /* ASCII coding */
613 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
614     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
615 ph10 97 #else /* EBCDIC coding */
616 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
617     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
618     #endif
619     }
620     break;
621    
622 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
623     This coding is ASCII-specific, but then the whole concept of \cx is
624     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
625 nigel 77
626     case 'c':
627     c = *(++ptr);
628     if (c == 0)
629     {
630     *errorcodeptr = ERR2;
631     return 0;
632     }
633    
634 ph10 97 #ifndef EBCDIC /* ASCII coding */
635 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
636     c ^= 0x40;
637 ph10 97 #else /* EBCDIC coding */
638 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
639     c ^= 0xC0;
640     #endif
641     break;
642    
643     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
644     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
645     for Perl compatibility, it is a literal. This code looks a bit odd, but
646     there used to be some cases other than the default, and there may be again
647     in future, so I haven't "optimized" it. */
648    
649     default:
650     if ((options & PCRE_EXTRA) != 0) switch(c)
651     {
652     default:
653     *errorcodeptr = ERR3;
654     break;
655     }
656     break;
657     }
658     }
659    
660     *ptrptr = ptr;
661     return c;
662     }
663    
664    
665    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence (or at the character where scanning stopped, on error).

The property name is either a single character, or a name of at most 31
characters in {}, optionally preceded by ^ for negation. It is looked up in
_pcre_utt by binary chop.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE;
                   it is always written, even when an error is returned
  dptr           points to an int that is set to the detailed property value;
                   it is written only when a property is found
  errorcodeptr   points to the error code variable; set to ERR46 for a
                   malformed sequence and ERR47 for an unknown name

Returns:         the type field of the matching _pcre_utt entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[32];

/* Give the negation flag a defined value before any error exit can be taken. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. The loop bound leaves room for the terminating zero; a name that
does not reach '}' within the buffer is rejected. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }
  for (i = 0; i < (int)sizeof(name) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}') goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = bot + (top - bot) / 2;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0)
    {
    *dptr = _pcre_utt[i].value;
    return _pcre_utt[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

/* Well-formed but unknown name; *ptrptr has already been updated above. */

*errorcodeptr = ERR47;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
755    
756    
757    
758    
759     /*************************************************
760     * Check for counted repeat *
761     *************************************************/
762    
763     /* This function is called when a '{' is encountered in a place where it might
764     start a quantifier. It looks ahead to see if it really is a quantifier or not.
765     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
766     where the ddds are digits.
767    
768     Arguments:
769     p pointer to the first char after '{'
770    
771     Returns: TRUE or FALSE
772     */
773    
774     static BOOL
775     is_counted_repeat(const uschar *p)
776     {
777     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
778     while ((digitab[*p] & ctype_digit) != 0) p++;
779     if (*p == '}') return TRUE;
780    
781     if (*p++ != ',') return FALSE;
782     if (*p == '}') return TRUE;
783    
784     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
785     while ((digitab[*p] & ctype_digit) != 0) p++;
786    
787     return (*p == '}');
788     }
789    
790    
791    
792     /*************************************************
793     * Read repeat counts *
794     *************************************************/
795    
796     /* Read an item of the form {n,m} and return the values. This is called only
797     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
798     so the syntax is guaranteed to be correct, but we need to check the values.
799    
800     Arguments:
801     p pointer to first char after '{'
802     minp pointer to int for min
803     maxp pointer to int for max
804     returned as -1 if no max
805     errorcodeptr points to error code variable
806    
807     Returns: pointer to '}' on success;
808     current ptr on error, with errorcodeptr set non-zero
809     */
810    
811     static const uschar *
812     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
813     {
814     int min = 0;
815     int max = -1;
816    
817 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
818     an integer overflow. */
819    
820 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
821 nigel 81 if (min < 0 || min > 65535)
822     {
823     *errorcodeptr = ERR5;
824     return p;
825     }
826 nigel 77
827 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
828     Also, max must not be less than min. */
829    
830 nigel 77 if (*p == '}') max = min; else
831     {
832     if (*(++p) != '}')
833     {
834     max = 0;
835     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
836 nigel 81 if (max < 0 || max > 65535)
837     {
838     *errorcodeptr = ERR5;
839     return p;
840     }
841 nigel 77 if (max < min)
842     {
843     *errorcodeptr = ERR4;
844     return p;
845     }
846     }
847     }
848    
849 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
850     '}'. */
851 nigel 77
852 nigel 81 *minp = min;
853     *maxp = max;
854 nigel 77 return p;
855     }
856    
857    
858    
859     /*************************************************
860 nigel 93 * Find forward referenced subpattern *
861 nigel 91 *************************************************/
862    
863 nigel 93 /* This function scans along a pattern's text looking for capturing
864     subpatterns, and counting them. If it finds a named pattern that matches the
865     name it is given, it returns its number. Alternatively, if the name is NULL, it
866     returns when it reaches a given numbered subpattern. This is used for forward
867     references to subpatterns. We know that if (?P< is encountered, the name will
868     be terminated by '>' because that is checked in the first pass.
869 nigel 91
870     Arguments:
871 nigel 93 ptr current position in the pattern
872     count current count of capturing parens so far encountered
873     name name to seek, or NULL if seeking a numbered subpattern
874     lorn name length, or subpattern number if name is NULL
875     xmode TRUE if we are in /x mode
876 nigel 91
877     Returns: the number of the named subpattern, or -1 if not found
878     */
879    
880     static int
881 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
882     BOOL xmode)
883 nigel 91 {
884     const uschar *thisname;
885 nigel 93
886 nigel 91 for (; *ptr != 0; ptr++)
887     {
888 nigel 93 int term;
889    
890     /* Skip over backslashed characters and also entire \Q...\E */
891    
892     if (*ptr == '\\')
893     {
894     if (*(++ptr) == 0) return -1;
895     if (*ptr == 'Q') for (;;)
896     {
897     while (*(++ptr) != 0 && *ptr != '\\');
898     if (*ptr == 0) return -1;
899     if (*(++ptr) == 'E') break;
900     }
901     continue;
902     }
903    
904     /* Skip over character classes */
905    
906     if (*ptr == '[')
907     {
908     while (*(++ptr) != ']')
909     {
910     if (*ptr == '\\')
911     {
912     if (*(++ptr) == 0) return -1;
913     if (*ptr == 'Q') for (;;)
914     {
915     while (*(++ptr) != 0 && *ptr != '\\');
916     if (*ptr == 0) return -1;
917     if (*(++ptr) == 'E') break;
918     }
919     continue;
920     }
921     }
922     continue;
923     }
924    
925     /* Skip comments in /x mode */
926    
927     if (xmode && *ptr == '#')
928     {
929     while (*(++ptr) != 0 && *ptr != '\n');
930     if (*ptr == 0) return -1;
931     continue;
932     }
933    
934     /* An opening parens must now be a real metacharacter */
935    
936 nigel 91 if (*ptr != '(') continue;
937 nigel 93 if (ptr[1] != '?')
938     {
939     count++;
940     if (name == NULL && count == lorn) return count;
941     continue;
942     }
943    
944     ptr += 2;
945     if (*ptr == 'P') ptr++; /* Allow optional P */
946    
947     /* We have to disambiguate (?<! and (?<= from (?<name> */
948    
949     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
950     *ptr != '\'')
951     continue;
952    
953 nigel 91 count++;
954 nigel 93
955     if (name == NULL && count == lorn) return count;
956     term = *ptr++;
957     if (term == '<') term = '>';
958 nigel 91 thisname = ptr;
959 nigel 93 while (*ptr != term) ptr++;
960     if (name != NULL && lorn == ptr - thisname &&
961     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
962 nigel 91 return count;
963     }
964 nigel 93
965 nigel 91 return -1;
966     }
967    
968    
969    
970     /*************************************************
971 nigel 77 * Find first significant op code *
972     *************************************************/
973    
974     /* This is called by several functions that scan a compiled expression looking
975     for a fixed first character, or an anchoring op code etc. It skips over things
976     that do not influence this. For some calls, a change of option is important.
977     For some calls, it makes sense to skip negative forward and all backward
978     assertions, and also the \b assertion; for others it does not.
979    
980     Arguments:
981     code pointer to the start of the group
982     options pointer to external options
983     optbit the option bit whose changing is significant, or
984     zero if none are
985     skipassert TRUE if certain assertions are to be skipped
986    
987     Returns: pointer to the first significant opcode
988     */
989    
990     static const uschar*
991     first_significant_code(const uschar *code, int *options, int optbit,
992     BOOL skipassert)
993     {
994     for (;;)
995     {
996     switch ((int)*code)
997     {
998     case OP_OPT:
999     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1000     *options = (int)code[1];
1001     code += 2;
1002     break;
1003    
1004     case OP_ASSERT_NOT:
1005     case OP_ASSERTBACK:
1006     case OP_ASSERTBACK_NOT:
1007     if (!skipassert) return code;
1008     do code += GET(code, 1); while (*code == OP_ALT);
1009     code += _pcre_OP_lengths[*code];
1010     break;
1011    
1012     case OP_WORD_BOUNDARY:
1013     case OP_NOT_WORD_BOUNDARY:
1014     if (!skipassert) return code;
1015     /* Fall through */
1016    
1017     case OP_CALLOUT:
1018     case OP_CREF:
1019 nigel 93 case OP_RREF:
1020     case OP_DEF:
1021 nigel 77 code += _pcre_OP_lengths[*code];
1022     break;
1023    
1024     default:
1025     return code;
1026     }
1027     }
1028     /* Control never reaches here */
1029     }
1030    
1031    
1032    
1033    
1034     /*************************************************
1035     * Find the fixed length of a pattern *
1036     *************************************************/
1037    
1038     /* Scan a pattern and compute the fixed length of subject that will match it,
1039     if the length is fixed. This is needed for dealing with backward assertions.
1040     In UTF8 mode, the result is in characters rather than bytes.
1041    
1042     Arguments:
1043     code points to the start of the pattern (the bracket)
1044     options the compiling options
1045    
1046     Returns: the fixed length, or -1 if there is no fixed length,
1047     or -2 if \C was encountered
1048     */
1049    
1050     static int
1051     find_fixedlength(uschar *code, int options)
1052     {
1053     int length = -1;
1054    
1055     register int branchlength = 0;
1056     register uschar *cc = code + 1 + LINK_SIZE;
1057    
1058     /* Scan along the opcodes for this branch. If we get to the end of the
1059     branch, check the length against that of the other branches. */
1060    
1061     for (;;)
1062     {
1063     int d;
1064     register int op = *cc;
1065    
1066     switch (op)
1067     {
1068 nigel 93 case OP_CBRA:
1069 nigel 77 case OP_BRA:
1070     case OP_ONCE:
1071     case OP_COND:
1072 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1073 nigel 77 if (d < 0) return d;
1074     branchlength += d;
1075     do cc += GET(cc, 1); while (*cc == OP_ALT);
1076     cc += 1 + LINK_SIZE;
1077     break;
1078    
1079     /* Reached end of a branch; if it's a ket it is the end of a nested
1080     call. If it's ALT it is an alternation in a nested call. If it is
1081     END it's the end of the outer call. All can be handled by the same code. */
1082    
1083     case OP_ALT:
1084     case OP_KET:
1085     case OP_KETRMAX:
1086     case OP_KETRMIN:
1087     case OP_END:
1088     if (length < 0) length = branchlength;
1089     else if (length != branchlength) return -1;
1090     if (*cc != OP_ALT) return length;
1091     cc += 1 + LINK_SIZE;
1092     branchlength = 0;
1093     break;
1094    
1095     /* Skip over assertive subpatterns */
1096    
1097     case OP_ASSERT:
1098     case OP_ASSERT_NOT:
1099     case OP_ASSERTBACK:
1100     case OP_ASSERTBACK_NOT:
1101     do cc += GET(cc, 1); while (*cc == OP_ALT);
1102     /* Fall through */
1103    
1104     /* Skip over things that don't match chars */
1105    
1106     case OP_REVERSE:
1107     case OP_CREF:
1108 nigel 93 case OP_RREF:
1109     case OP_DEF:
1110 nigel 77 case OP_OPT:
1111     case OP_CALLOUT:
1112     case OP_SOD:
1113     case OP_SOM:
1114     case OP_EOD:
1115     case OP_EODN:
1116     case OP_CIRC:
1117     case OP_DOLL:
1118     case OP_NOT_WORD_BOUNDARY:
1119     case OP_WORD_BOUNDARY:
1120     cc += _pcre_OP_lengths[*cc];
1121     break;
1122    
1123     /* Handle literal characters */
1124    
1125     case OP_CHAR:
1126     case OP_CHARNC:
1127 nigel 91 case OP_NOT:
1128 nigel 77 branchlength++;
1129     cc += 2;
1130     #ifdef SUPPORT_UTF8
1131     if ((options & PCRE_UTF8) != 0)
1132     {
1133     while ((*cc & 0xc0) == 0x80) cc++;
1134     }
1135     #endif
1136     break;
1137    
1138     /* Handle exact repetitions. The count is already in characters, but we
1139     need to skip over a multibyte character in UTF8 mode. */
1140    
1141     case OP_EXACT:
1142     branchlength += GET2(cc,1);
1143     cc += 4;
1144     #ifdef SUPPORT_UTF8
1145     if ((options & PCRE_UTF8) != 0)
1146     {
1147     while((*cc & 0x80) == 0x80) cc++;
1148     }
1149     #endif
1150     break;
1151    
1152     case OP_TYPEEXACT:
1153     branchlength += GET2(cc,1);
1154     cc += 4;
1155     break;
1156    
1157     /* Handle single-char matchers */
1158    
1159     case OP_PROP:
1160     case OP_NOTPROP:
1161 nigel 87 cc += 2;
1162 nigel 77 /* Fall through */
1163    
1164     case OP_NOT_DIGIT:
1165     case OP_DIGIT:
1166     case OP_NOT_WHITESPACE:
1167     case OP_WHITESPACE:
1168     case OP_NOT_WORDCHAR:
1169     case OP_WORDCHAR:
1170     case OP_ANY:
1171     branchlength++;
1172     cc++;
1173     break;
1174    
1175     /* The single-byte matcher isn't allowed */
1176    
1177     case OP_ANYBYTE:
1178     return -2;
1179    
1180     /* Check a class for variable quantification */
1181    
1182     #ifdef SUPPORT_UTF8
1183     case OP_XCLASS:
1184     cc += GET(cc, 1) - 33;
1185     /* Fall through */
1186     #endif
1187    
1188     case OP_CLASS:
1189     case OP_NCLASS:
1190     cc += 33;
1191    
1192     switch (*cc)
1193     {
1194     case OP_CRSTAR:
1195     case OP_CRMINSTAR:
1196     case OP_CRQUERY:
1197     case OP_CRMINQUERY:
1198     return -1;
1199    
1200     case OP_CRRANGE:
1201     case OP_CRMINRANGE:
1202     if (GET2(cc,1) != GET2(cc,3)) return -1;
1203     branchlength += GET2(cc,1);
1204     cc += 5;
1205     break;
1206    
1207     default:
1208     branchlength++;
1209     }
1210     break;
1211    
1212     /* Anything else is variable length */
1213    
1214     default:
1215     return -1;
1216     }
1217     }
1218     /* Control never gets here */
1219     }
1220    
1221    
1222    
1223    
1224     /*************************************************
1225     * Scan compiled regex for numbered bracket *
1226     *************************************************/
1227    
1228     /* This little function scans through a compiled pattern until it finds a
1229     capturing bracket with the given number.
1230    
1231     Arguments:
1232     code points to start of expression
1233     utf8 TRUE in UTF-8 mode
1234     number the required bracket number
1235    
1236     Returns: pointer to the opcode for the bracket, or NULL if not found
1237     */
1238    
1239     static const uschar *
1240     find_bracket(const uschar *code, BOOL utf8, int number)
1241     {
1242     for (;;)
1243     {
1244     register int c = *code;
1245     if (c == OP_END) return NULL;
1246 nigel 91
1247     /* XCLASS is used for classes that cannot be represented just by a bit
1248     map. This includes negated single high-valued characters. The length in
1249     the table is zero; the actual length is stored in the compiled code. */
1250    
1251     if (c == OP_XCLASS) code += GET(code, 1);
1252    
1253 nigel 93 /* Handle capturing bracket */
1254 nigel 91
1255 nigel 93 else if (c == OP_CBRA)
1256 nigel 77 {
1257 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1258 nigel 77 if (n == number) return (uschar *)code;
1259 nigel 93 code += _pcre_OP_lengths[c];
1260 nigel 77 }
1261 nigel 91
1262 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1263     a multi-byte character. The length in the table is a minimum, so we have to
1264     arrange to skip the extra bytes. */
1265 nigel 91
1266 nigel 77 else
1267     {
1268     code += _pcre_OP_lengths[c];
1269 ph10 107 #ifdef SUPPORT_UTF8
1270 nigel 77 if (utf8) switch(c)
1271     {
1272     case OP_CHAR:
1273     case OP_CHARNC:
1274     case OP_EXACT:
1275     case OP_UPTO:
1276     case OP_MINUPTO:
1277 nigel 93 case OP_POSUPTO:
1278 nigel 77 case OP_STAR:
1279     case OP_MINSTAR:
1280 nigel 93 case OP_POSSTAR:
1281 nigel 77 case OP_PLUS:
1282     case OP_MINPLUS:
1283 nigel 93 case OP_POSPLUS:
1284 nigel 77 case OP_QUERY:
1285     case OP_MINQUERY:
1286 nigel 93 case OP_POSQUERY:
1287     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1288 nigel 77 break;
1289     }
1290 ph10 111 #endif
1291 nigel 77 }
1292     }
1293     }
1294    
1295    
1296    
1297     /*************************************************
1298     * Scan compiled regex for recursion reference *
1299     *************************************************/
1300    
1301     /* This little function scans through a compiled pattern until it finds an
1302     instance of OP_RECURSE.
1303    
1304     Arguments:
1305     code points to start of expression
1306     utf8 TRUE in UTF-8 mode
1307    
1308     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1309     */
1310    
1311     static const uschar *
1312     find_recurse(const uschar *code, BOOL utf8)
1313     {
1314     for (;;)
1315     {
1316     register int c = *code;
1317     if (c == OP_END) return NULL;
1318 nigel 91 if (c == OP_RECURSE) return code;
1319    
1320     /* XCLASS is used for classes that cannot be represented just by a bit
1321     map. This includes negated single high-valued characters. The length in
1322     the table is zero; the actual length is stored in the compiled code. */
1323    
1324     if (c == OP_XCLASS) code += GET(code, 1);
1325    
1326     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1327     that are followed by a character may be followed by a multi-byte character.
1328 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1329     bytes. */
1330 nigel 91
1331 nigel 77 else
1332     {
1333     code += _pcre_OP_lengths[c];
1334 ph10 107 #ifdef SUPPORT_UTF8
1335 nigel 77 if (utf8) switch(c)
1336     {
1337     case OP_CHAR:
1338     case OP_CHARNC:
1339     case OP_EXACT:
1340     case OP_UPTO:
1341     case OP_MINUPTO:
1342 nigel 93 case OP_POSUPTO:
1343 nigel 77 case OP_STAR:
1344     case OP_MINSTAR:
1345 nigel 93 case OP_POSSTAR:
1346 nigel 77 case OP_PLUS:
1347     case OP_MINPLUS:
1348 nigel 93 case OP_POSPLUS:
1349 nigel 77 case OP_QUERY:
1350     case OP_MINQUERY:
1351 nigel 93 case OP_POSQUERY:
1352     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1353 nigel 77 break;
1354     }
1355 ph10 111 #endif
1356 nigel 77 }
1357     }
1358     }
1359    
1360    
1361    
1362     /*************************************************
1363     * Scan compiled branch for non-emptiness *
1364     *************************************************/
1365    
1366     /* This function scans through a branch of a compiled pattern to see whether it
1367 nigel 93 can match the empty string or not. It is called from could_be_empty()
1368     below and from compile_branch() when checking for an unlimited repeat of a
1369     group that can match nothing. Note that first_significant_code() skips over
1370     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1371     struck an inner bracket whose current branch will already have been scanned.
1372 nigel 77
1373     Arguments:
1374     code points to start of search
1375     endcode points to where to stop
1376     utf8 TRUE if in UTF8 mode
1377    
1378     Returns: TRUE if what is matched could be empty
1379     */
1380    
1381     static BOOL
1382     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1383     {
1384     register int c;
1385 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1386 nigel 77 code < endcode;
1387     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1388     {
1389     const uschar *ccode;
1390    
1391     c = *code;
1392 ph10 172
1393 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1394 nigel 77
1395 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1396     {
1397 ph10 172 code += _pcre_OP_lengths[c];
1398 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1399     c = *code;
1400     continue;
1401     }
1402    
1403     /* For other groups, scan the branches. */
1404 ph10 172
1405 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1406 nigel 77 {
1407     BOOL empty_branch;
1408     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1409    
1410     /* Scan a closed bracket */
1411    
1412     empty_branch = FALSE;
1413     do
1414     {
1415     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1416     empty_branch = TRUE;
1417     code += GET(code, 1);
1418     }
1419     while (*code == OP_ALT);
1420     if (!empty_branch) return FALSE; /* All branches are non-empty */
1421 ph10 172 c = *code;
1422 nigel 93 continue;
1423 nigel 77 }
1424    
1425 nigel 93 /* Handle the other opcodes */
1426    
1427     switch (c)
1428 nigel 77 {
1429     /* Check for quantifiers after a class */
1430    
1431     #ifdef SUPPORT_UTF8
1432     case OP_XCLASS:
1433     ccode = code + GET(code, 1);
1434     goto CHECK_CLASS_REPEAT;
1435     #endif
1436    
1437     case OP_CLASS:
1438     case OP_NCLASS:
1439     ccode = code + 33;
1440    
1441     #ifdef SUPPORT_UTF8
1442     CHECK_CLASS_REPEAT:
1443     #endif
1444    
1445     switch (*ccode)
1446     {
1447     case OP_CRSTAR: /* These could be empty; continue */
1448     case OP_CRMINSTAR:
1449     case OP_CRQUERY:
1450     case OP_CRMINQUERY:
1451     break;
1452    
1453     default: /* Non-repeat => class must match */
1454     case OP_CRPLUS: /* These repeats aren't empty */
1455     case OP_CRMINPLUS:
1456     return FALSE;
1457    
1458     case OP_CRRANGE:
1459     case OP_CRMINRANGE:
1460     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1461     break;
1462     }
1463     break;
1464    
1465     /* Opcodes that must match a character */
1466    
1467     case OP_PROP:
1468     case OP_NOTPROP:
1469     case OP_EXTUNI:
1470     case OP_NOT_DIGIT:
1471     case OP_DIGIT:
1472     case OP_NOT_WHITESPACE:
1473     case OP_WHITESPACE:
1474     case OP_NOT_WORDCHAR:
1475     case OP_WORDCHAR:
1476     case OP_ANY:
1477     case OP_ANYBYTE:
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_NOT:
1481     case OP_PLUS:
1482     case OP_MINPLUS:
1483 nigel 93 case OP_POSPLUS:
1484 nigel 77 case OP_EXACT:
1485     case OP_NOTPLUS:
1486     case OP_NOTMINPLUS:
1487 nigel 93 case OP_NOTPOSPLUS:
1488 nigel 77 case OP_NOTEXACT:
1489     case OP_TYPEPLUS:
1490     case OP_TYPEMINPLUS:
1491 nigel 93 case OP_TYPEPOSPLUS:
1492 nigel 77 case OP_TYPEEXACT:
1493     return FALSE;
1494    
1495     /* End of branch */
1496    
1497     case OP_KET:
1498     case OP_KETRMAX:
1499     case OP_KETRMIN:
1500     case OP_ALT:
1501     return TRUE;
1502    
1503 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1504     MINUPTO, and POSUPTO may be followed by a multibyte character */
1505 nigel 77
1506     #ifdef SUPPORT_UTF8
1507     case OP_STAR:
1508     case OP_MINSTAR:
1509 nigel 93 case OP_POSSTAR:
1510 nigel 77 case OP_QUERY:
1511     case OP_MINQUERY:
1512 nigel 93 case OP_POSQUERY:
1513 nigel 77 case OP_UPTO:
1514     case OP_MINUPTO:
1515 nigel 93 case OP_POSUPTO:
1516 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1517     break;
1518     #endif
1519     }
1520     }
1521    
1522     return TRUE;
1523     }
1524    
1525    
1526    
1527     /*************************************************
1528     * Scan compiled regex for non-emptiness *
1529     *************************************************/
1530    
1531     /* This function is called to check for left recursive calls. We want to check
1532     the current branch of the current pattern to see if it could match the empty
1533     string. If it could, we must look outwards for branches at other levels,
1534     stopping when we pass beyond the bracket which is the subject of the recursion.
1535    
1536     Arguments:
1537     code points to start of the recursion
1538     endcode points to where to stop (current RECURSE item)
1539     bcptr points to the chain of current (unclosed) branch starts
1540     utf8 TRUE if in UTF-8 mode
1541    
1542     Returns: TRUE if what is matched could be empty
1543     */
1544    
1545     static BOOL
1546     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1547     BOOL utf8)
1548     {
1549     while (bcptr != NULL && bcptr->current >= code)
1550     {
1551     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1552     bcptr = bcptr->outer;
1553     }
1554     return TRUE;
1555     }
1556    
1557    
1558    
1559     /*************************************************
1560     * Check for POSIX class syntax *
1561     *************************************************/
1562    
1563     /* This function is called when the sequence "[:" or "[." or "[=" is
1564     encountered in a character class. It checks whether this is followed by an
1565     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1566     ".]" or "=]".
1567    
1568     Argument:
1569     ptr pointer to the initial [
1570     endptr where to return the end pointer
1571     cd pointer to compile data
1572    
1573     Returns: TRUE or FALSE
1574     */
1575    
1576     static BOOL
1577     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1578     {
1579     int terminator; /* Don't combine these lines; the Solaris cc */
1580     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1581     if (*(++ptr) == '^') ptr++;
1582     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1583     if (*ptr == terminator && ptr[1] == ']')
1584     {
1585     *endptr = ptr;
1586     return TRUE;
1587     }
1588     return FALSE;
1589     }
1590    
1591    
1592    
1593    
1594     /*************************************************
1595     * Check POSIX class name *
1596     *************************************************/
1597    
1598     /* This function is called to check the name given in a POSIX-style class entry
1599     such as [:alnum:].
1600    
1601     Arguments:
1602     ptr points to the first letter
1603     len the length of the name
1604    
1605     Returns: a value representing the name, or -1 if unknown
1606     */
1607    
1608     static int
1609     check_posix_name(const uschar *ptr, int len)
1610     {
1611     register int yield = 0;
1612     while (posix_name_lengths[yield] != 0)
1613     {
1614     if (len == posix_name_lengths[yield] &&
1615     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1616     yield++;
1617     }
1618     return -1;
1619     }
1620    
1621    
1622     /*************************************************
1623     * Adjust OP_RECURSE items in repeated group *
1624     *************************************************/
1625    
1626     /* OP_RECURSE items contain an offset from the start of the regex to the group
1627     that is referenced. This means that groups can be replicated for fixed
1628     repetition simply by copying (because the recursion is allowed to refer to
1629     earlier groups that are outside the current group). However, when a group is
1630     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1631     it, after it has been compiled. This means that any OP_RECURSE items within it
1632     that refer to the group itself or any contained groups have to have their
1633 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1634     the partially compiled regex must be temporarily terminated with OP_END.
1635 nigel 77
1636 nigel 93 This function has been extended with the possibility of forward references for
1637     recursions and subroutine calls. It must also check the list of such references
1638     for the group we are dealing with. If it finds that one of the recursions in
1639     the current group is on this list, it adjusts the offset in the list, not the
1640     value in the reference (which is a group number).
1641    
1642 nigel 77 Arguments:
1643     group points to the start of the group
1644     adjust the amount by which the group is to be moved
1645     utf8 TRUE in UTF-8 mode
1646     cd contains pointers to tables etc.
1647 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1648 nigel 77
1649     Returns: nothing
1650     */
1651    
1652     static void
1653 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1654     uschar *save_hwm)
1655 nigel 77 {
1656     uschar *ptr = group;
1657     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1658     {
1659 nigel 93 int offset;
1660     uschar *hc;
1661    
1662     /* See if this recursion is on the forward reference list. If so, adjust the
1663     reference. */
1664    
1665     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1666     {
1667     offset = GET(hc, 0);
1668     if (cd->start_code + offset == ptr + 1)
1669     {
1670     PUT(hc, 0, offset + adjust);
1671     break;
1672     }
1673     }
1674    
1675     /* Otherwise, adjust the recursion offset if it's after the start of this
1676     group. */
1677    
1678     if (hc >= cd->hwm)
1679     {
1680     offset = GET(ptr, 1);
1681     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1682     }
1683    
1684 nigel 77 ptr += 1 + LINK_SIZE;
1685     }
1686     }
1687    
1688    
1689    
1690     /*************************************************
1691     * Insert an automatic callout point *
1692     *************************************************/
1693    
1694     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1695     callout points before each pattern item.
1696    
1697     Arguments:
1698     code current code pointer
1699     ptr current pattern pointer
1700     cd pointers to tables etc
1701    
1702     Returns: new code pointer
1703     */
1704    
1705     static uschar *
1706     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1707     {
1708     *code++ = OP_CALLOUT;
1709     *code++ = 255;
1710     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1711     PUT(code, LINK_SIZE, 0); /* Default length */
1712     return code + 2*LINK_SIZE;
1713     }
1714    
1715    
1716    
1717     /*************************************************
1718     * Complete a callout item *
1719     *************************************************/
1720    
1721     /* A callout item contains the length of the next item in the pattern, which
1722     we can't fill in till after we have reached the relevant point. This is used
1723     for both automatic and manual callouts.
1724    
1725     Arguments:
1726     previous_callout points to previous callout item
1727     ptr current pattern pointer
1728     cd pointers to tables etc
1729    
1730     Returns: nothing
1731     */
1732    
1733     static void
1734     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1735     {
1736     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1737     PUT(previous_callout, 2 + LINK_SIZE, length);
1738     }
1739    
1740    
1741    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given a range of character values *cptr..d, find the next run of characters
within it whose "other case" values are consecutive. The scan first skips any
characters for which _pcre_ucp_othercase() yields NOTACHAR, then extends the
run for as long as each following character's other case is exactly one more
than the previous one. On success *cptr is advanced past the run, so repeated
calls walk through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value (inclusive)
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Look for the first character in the range that has an other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

/* Extend the run while the other-case values stay consecutive. */

*ocptr = first_other;
expected = first_other + 1;

for (ch++; ch <= d && _pcre_ucp_othercase(ch) == expected; ch++)
  expected++;

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1787    
1788    
1789 nigel 93
1790 nigel 77 /*************************************************
1791 nigel 93 * Check if auto-possessifying is possible *
1792     *************************************************/
1793    
1794     /* This function is called for unlimited repeats of certain items, to see
1795     whether the next thing could possibly match the repeated item. If not, it makes
1796     sense to automatically possessify the repeated item.
1797    
1798     Arguments:
1799     op_code the repeated op code
1800     this data for this item, depends on the opcode
1801     utf8 TRUE in UTF-8 mode
1802     utf8_char used for utf8 character bytes, NULL if not relevant
1803     ptr next character in pattern
1804     options options bits
1805     cd contains pointers to tables etc.
1806    
1807     Returns: TRUE if possessifying is wanted
1808     */
1809    
1810     static BOOL
1811     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1812     const uschar *ptr, int options, compile_data *cd)
1813     {
1814     int next;
1815    
1816     /* Skip whitespace and comments in extended mode */
1817    
1818     if ((options & PCRE_EXTENDED) != 0)
1819     {
1820     for (;;)
1821     {
1822     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1823     if (*ptr == '#')
1824     {
1825     while (*(++ptr) != 0)
1826     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1827     }
1828     else break;
1829     }
1830     }
1831    
1832     /* If the next item is one that we can handle, get its value. A non-negative
1833     value is a character, a negative value is an escape value. */
1834    
1835     if (*ptr == '\\')
1836     {
1837     int temperrorcode = 0;
1838     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1839     if (temperrorcode != 0) return FALSE;
1840     ptr++; /* Point after the escape sequence */
1841     }
1842    
1843     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1844     {
1845     #ifdef SUPPORT_UTF8
1846     if (utf8) { GETCHARINC(next, ptr); } else
1847     #endif
1848     next = *ptr++;
1849     }
1850    
1851     else return FALSE;
1852    
1853     /* Skip whitespace and comments in extended mode */
1854    
1855     if ((options & PCRE_EXTENDED) != 0)
1856     {
1857     for (;;)
1858     {
1859     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1860     if (*ptr == '#')
1861     {
1862     while (*(++ptr) != 0)
1863     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1864     }
1865     else break;
1866     }
1867     }
1868    
1869     /* If the next thing is itself optional, we have to give up. */
1870    
1871     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1872     return FALSE;
1873    
1874     /* Now compare the next item with the previous opcode. If the previous is a
1875     positive single character match, "item" either contains the character or, if
1876     "item" is greater than 127 in utf8 mode, the character's bytes are in
1877     utf8_char. */
1878    
1879    
1880     /* Handle cases when the next item is a character. */
1881    
1882     if (next >= 0) switch(op_code)
1883     {
1884     case OP_CHAR:
1885     #ifdef SUPPORT_UTF8
1886     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1887     #endif
1888     return item != next;
1889    
1890     /* For CHARNC (caseless character) we must check the other case. If we have
1891     Unicode property support, we can use it to test the other case of
1892     high-valued characters. */
1893    
1894     case OP_CHARNC:
1895     #ifdef SUPPORT_UTF8
1896     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1897     #endif
1898     if (item == next) return FALSE;
1899     #ifdef SUPPORT_UTF8
1900     if (utf8)
1901     {
1902     unsigned int othercase;
1903     if (next < 128) othercase = cd->fcc[next]; else
1904     #ifdef SUPPORT_UCP
1905     othercase = _pcre_ucp_othercase((unsigned int)next);
1906     #else
1907     othercase = NOTACHAR;
1908     #endif
1909     return (unsigned int)item != othercase;
1910     }
1911     else
1912     #endif /* SUPPORT_UTF8 */
1913     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1914    
1915     /* For OP_NOT, "item" must be a single-byte character. */
1916    
1917     case OP_NOT:
1918     if (next < 0) return FALSE; /* Not a character */
1919     if (item == next) return TRUE;
1920     if ((options & PCRE_CASELESS) == 0) return FALSE;
1921     #ifdef SUPPORT_UTF8
1922     if (utf8)
1923     {
1924     unsigned int othercase;
1925     if (next < 128) othercase = cd->fcc[next]; else
1926     #ifdef SUPPORT_UCP
1927     othercase = _pcre_ucp_othercase(next);
1928     #else
1929     othercase = NOTACHAR;
1930     #endif
1931     return (unsigned int)item == othercase;
1932     }
1933     else
1934     #endif /* SUPPORT_UTF8 */
1935     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1936    
1937     case OP_DIGIT:
1938     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1939    
1940     case OP_NOT_DIGIT:
1941     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1942    
1943     case OP_WHITESPACE:
1944     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1945    
1946     case OP_NOT_WHITESPACE:
1947     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1948    
1949     case OP_WORDCHAR:
1950     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1951    
1952     case OP_NOT_WORDCHAR:
1953     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1954    
1955 ph10 180 case OP_HSPACE:
1956     case OP_NOT_HSPACE:
1957     switch(next)
1958     {
1959     case 0x09:
1960     case 0x20:
1961     case 0xa0:
1962     case 0x1680:
1963     case 0x180e:
1964     case 0x2000:
1965     case 0x2001:
1966     case 0x2002:
1967     case 0x2003:
1968     case 0x2004:
1969     case 0x2005:
1970     case 0x2006:
1971     case 0x2007:
1972     case 0x2008:
1973     case 0x2009:
1974     case 0x200A:
1975     case 0x202f:
1976     case 0x205f:
1977     case 0x3000:
1978     return op_code != OP_HSPACE;
1979     default:
1980     return op_code == OP_HSPACE;
1981     }
1982    
1983     case OP_VSPACE:
1984     case OP_NOT_VSPACE:
1985     switch(next)
1986     {
1987     case 0x0a:
1988     case 0x0b:
1989     case 0x0c:
1990     case 0x0d:
1991     case 0x85:
1992     case 0x2028:
1993     case 0x2029:
1994     return op_code != OP_VSPACE;
1995     default:
1996     return op_code == OP_VSPACE;
1997     }
1998    
1999 nigel 93 default:
2000     return FALSE;
2001     }
2002    
2003    
2004     /* Handle the case when the next item is \d, \s, etc. */
2005    
2006     switch(op_code)
2007     {
2008     case OP_CHAR:
2009     case OP_CHARNC:
2010     #ifdef SUPPORT_UTF8
2011     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2012     #endif
2013     switch(-next)
2014     {
2015     case ESC_d:
2016     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2017    
2018     case ESC_D:
2019     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2020    
2021     case ESC_s:
2022     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2023    
2024     case ESC_S:
2025     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2026    
2027     case ESC_w:
2028     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2029    
2030     case ESC_W:
2031     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2032 ph10 182
2033 ph10 180 case ESC_h:
2034     case ESC_H:
2035     switch(item)
2036     {
2037     case 0x09:
2038     case 0x20:
2039     case 0xa0:
2040     case 0x1680:
2041     case 0x180e:
2042     case 0x2000:
2043     case 0x2001:
2044     case 0x2002:
2045     case 0x2003:
2046     case 0x2004:
2047     case 0x2005:
2048     case 0x2006:
2049     case 0x2007:
2050     case 0x2008:
2051     case 0x2009:
2052     case 0x200A:
2053     case 0x202f:
2054     case 0x205f:
2055     case 0x3000:
2056     return -next != ESC_h;
2057     default:
2058     return -next == ESC_h;
2059 ph10 182 }
2060    
2061 ph10 180 case ESC_v:
2062     case ESC_V:
2063     switch(item)
2064     {
2065     case 0x0a:
2066     case 0x0b:
2067     case 0x0c:
2068     case 0x0d:
2069     case 0x85:
2070     case 0x2028:
2071     case 0x2029:
2072     return -next != ESC_v;
2073     default:
2074     return -next == ESC_v;
2075 ph10 182 }
2076 nigel 93
2077     default:
2078     return FALSE;
2079     }
2080    
2081     case OP_DIGIT:
2082 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2083     next == -ESC_h || next == -ESC_v;
2084 nigel 93
2085     case OP_NOT_DIGIT:
2086     return next == -ESC_d;
2087    
2088     case OP_WHITESPACE:
2089     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2090    
2091     case OP_NOT_WHITESPACE:
2092 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2093 nigel 93
2094 ph10 180 case OP_HSPACE:
2095     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2096    
2097     case OP_NOT_HSPACE:
2098     return next == -ESC_h;
2099 ph10 182
2100 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2101 ph10 182 case OP_VSPACE:
2102 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2103    
2104     case OP_NOT_VSPACE:
2105 ph10 182 return next == -ESC_v;
2106 ph10 180
2107 nigel 93 case OP_WORDCHAR:
2108 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2109 nigel 93
2110     case OP_NOT_WORDCHAR:
2111     return next == -ESC_w || next == -ESC_d;
2112 ph10 182
2113 nigel 93 default:
2114     return FALSE;
2115     }
2116    
2117     /* Control does not reach here */
2118     }
2119    
2120    
2121    
2122     /*************************************************
2123 nigel 77 * Compile one branch *
2124     *************************************************/
2125    
2126 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2127 nigel 77 changed during the branch, the pointer is used to change the external options
2128 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2129     to find out the amount of memory needed, as well as during the real compile
2130     phase. The value of lengthptr distinguishes the two phases.
2131 nigel 77
2132     Arguments:
2133     optionsptr pointer to the option bits
2134     codeptr points to the pointer to the current code point
2135     ptrptr points to the current pattern pointer
2136     errorcodeptr points to error code variable
2137     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2138     reqbyteptr set to the last literal character required, else < 0
2139     bcptr points to current branch chain
2140     cd contains pointers to tables etc.
2141 nigel 93 lengthptr NULL during the real compile phase
2142     points to length accumulator during pre-compile phase
2143 nigel 77
2144     Returns: TRUE on success
2145     FALSE, with *errorcodeptr set non-zero on error
2146     */
2147    
2148     static BOOL
2149 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2150     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2151     compile_data *cd, int *lengthptr)
2152 nigel 77 {
2153     int repeat_type, op_type;
2154     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2155     int bravalue = 0;
2156     int greedy_default, greedy_non_default;
2157     int firstbyte, reqbyte;
2158     int zeroreqbyte, zerofirstbyte;
2159     int req_caseopt, reqvary, tempreqvary;
2160     int options = *optionsptr;
2161     int after_manual_callout = 0;
2162 nigel 93 int length_prevgroup = 0;
2163 nigel 77 register int c;
2164     register uschar *code = *codeptr;
2165 nigel 93 uschar *last_code = code;
2166     uschar *orig_code = code;
2167 nigel 77 uschar *tempcode;
2168     BOOL inescq = FALSE;
2169     BOOL groupsetfirstbyte = FALSE;
2170     const uschar *ptr = *ptrptr;
2171     const uschar *tempptr;
2172     uschar *previous = NULL;
2173     uschar *previous_callout = NULL;
2174 nigel 93 uschar *save_hwm = NULL;
2175 nigel 77 uschar classbits[32];
2176    
2177     #ifdef SUPPORT_UTF8
2178     BOOL class_utf8;
2179     BOOL utf8 = (options & PCRE_UTF8) != 0;
2180     uschar *class_utf8data;
2181     uschar utf8_char[6];
2182     #else
2183     BOOL utf8 = FALSE;
2184 nigel 93 uschar *utf8_char = NULL;
2185 nigel 77 #endif
2186    
2187 nigel 93 #ifdef DEBUG
2188     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2189     #endif
2190    
2191 nigel 77 /* Set up the default and non-default settings for greediness */
2192    
2193     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2194     greedy_non_default = greedy_default ^ 1;
2195    
2196     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2197     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2198     matches a non-fixed char first char; reqbyte just remains unset if we never
2199     find one.
2200    
2201     When we hit a repeat whose minimum is zero, we may have to adjust these values
2202     to take the zero repeat into account. This is implemented by setting them to
2203     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2204     item types that can be repeated set these backoff variables appropriately. */
2205    
2206     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2207    
2208     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2209     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2210     value > 255. It is added into the firstbyte or reqbyte variables to record the
2211     case status of the value. This is used only for ASCII characters. */
2212    
2213     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2214    
2215     /* Switch on next character until the end of the branch */
2216    
2217     for (;; ptr++)
2218     {
2219     BOOL negate_class;
2220     BOOL possessive_quantifier;
2221     BOOL is_quantifier;
2222 nigel 93 BOOL is_recurse;
2223 ph10 180 BOOL reset_bracount;
2224 nigel 77 int class_charcount;
2225     int class_lastchar;
2226     int newoptions;
2227     int recno;
2228 ph10 172 int refsign;
2229 nigel 77 int skipbytes;
2230     int subreqbyte;
2231     int subfirstbyte;
2232 nigel 93 int terminator;
2233 nigel 77 int mclength;
2234     uschar mcbuffer[8];
2235    
2236 nigel 93 /* Get next byte in the pattern */
2237 nigel 77
2238     c = *ptr;
2239    
2240 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2241     previous cycle of this loop. */
2242    
2243     if (lengthptr != NULL)
2244     {
2245     #ifdef DEBUG
2246     if (code > cd->hwm) cd->hwm = code; /* High water info */
2247     #endif
2248     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2249     {
2250     *errorcodeptr = ERR52;
2251     goto FAILED;
2252     }
2253    
2254     /* There is at least one situation where code goes backwards: this is the
2255     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2256     the class is simply eliminated. However, it is created first, so we have to
2257     allow memory for it. Therefore, don't ever reduce the length at this point.
2258     */
2259    
2260     if (code < last_code) code = last_code;
2261     *lengthptr += code - last_code;
2262     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2263    
2264     /* If "previous" is set and it is not at the start of the work space, move
2265     it back to there, in order to avoid filling up the work space. Otherwise,
2266     if "previous" is NULL, reset the current code pointer to the start. */
2267    
2268     if (previous != NULL)
2269     {
2270     if (previous > orig_code)
2271     {
2272     memmove(orig_code, previous, code - previous);
2273     code -= previous - orig_code;
2274     previous = orig_code;
2275     }
2276     }
2277     else code = orig_code;
2278    
2279     /* Remember where this code item starts so we can pick up the length
2280     next time round. */
2281    
2282     last_code = code;
2283     }
2284    
2285     /* In the real compile phase, just check the workspace used by the forward
2286     reference list. */
2287    
2288     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2289     {
2290     *errorcodeptr = ERR52;
2291     goto FAILED;
2292     }
2293    
2294 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2295    
2296     if (inescq && c != 0)
2297     {
2298     if (c == '\\' && ptr[1] == 'E')
2299     {
2300     inescq = FALSE;
2301     ptr++;
2302     continue;
2303     }
2304     else
2305     {
2306     if (previous_callout != NULL)
2307     {
2308 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2309     complete_callout(previous_callout, ptr, cd);
2310 nigel 77 previous_callout = NULL;
2311     }
2312     if ((options & PCRE_AUTO_CALLOUT) != 0)
2313     {
2314     previous_callout = code;
2315     code = auto_callout(code, ptr, cd);
2316     }
2317     goto NORMAL_CHAR;
2318     }
2319     }
2320    
2321     /* Fill in length of a previous callout, except when the next thing is
2322     a quantifier. */
2323    
2324     is_quantifier = c == '*' || c == '+' || c == '?' ||
2325     (c == '{' && is_counted_repeat(ptr+1));
2326    
2327     if (!is_quantifier && previous_callout != NULL &&
2328     after_manual_callout-- <= 0)
2329     {
2330 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2331     complete_callout(previous_callout, ptr, cd);
2332 nigel 77 previous_callout = NULL;
2333     }
2334    
2335     /* In extended mode, skip white space and comments */
2336    
2337     if ((options & PCRE_EXTENDED) != 0)
2338     {
2339     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2340     if (c == '#')
2341     {
2342 nigel 93 while (*(++ptr) != 0)
2343 nigel 91 {
2344 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2345 nigel 91 }
2346 nigel 93 if (*ptr != 0) continue;
2347    
2348 nigel 91 /* Else fall through to handle end of string */
2349     c = 0;
2350 nigel 77 }
2351     }
2352    
2353     /* No auto callout for quantifiers. */
2354    
2355     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2356     {
2357     previous_callout = code;
2358     code = auto_callout(code, ptr, cd);
2359     }
2360    
2361     switch(c)
2362     {
2363 nigel 93 /* ===================================================================*/
2364     case 0: /* The branch terminates at string end */
2365     case '|': /* or | or ) */
2366 nigel 77 case ')':
2367     *firstbyteptr = firstbyte;
2368     *reqbyteptr = reqbyte;
2369     *codeptr = code;
2370     *ptrptr = ptr;
2371 nigel 93 if (lengthptr != NULL)
2372     {
2373     *lengthptr += code - last_code; /* To include callout length */
2374     DPRINTF((">> end branch\n"));
2375     }
2376 nigel 77 return TRUE;
2377    
2378 nigel 93
2379     /* ===================================================================*/
2380 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2381     the setting of any following char as a first character. */
2382    
2383     case '^':
2384     if ((options & PCRE_MULTILINE) != 0)
2385     {
2386     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2387     }
2388     previous = NULL;
2389     *code++ = OP_CIRC;
2390     break;
2391    
2392     case '$':
2393     previous = NULL;
2394     *code++ = OP_DOLL;
2395     break;
2396    
2397     /* There can never be a first char if '.' is first, whatever happens about
2398     repeats. The value of reqbyte doesn't change either. */
2399    
2400     case '.':
2401     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2402     zerofirstbyte = firstbyte;
2403     zeroreqbyte = reqbyte;
2404     previous = code;
2405     *code++ = OP_ANY;
2406     break;
2407    
2408 nigel 93
2409     /* ===================================================================*/
2410 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2411     32-byte bitmap of the permitted characters, except in the special case
2412     where there is only one such character. For negated classes, we build the
2413     map as usual, then invert it at the end. However, we use a different opcode
2414     so that data characters > 255 can be handled correctly.
2415 nigel 77
2416     If the class contains characters outside the 0-255 range, a different
2417     opcode is compiled. It may optionally have a bit map for characters < 256,
2418     but those above are explicitly listed afterwards. A flag byte tells
2419     whether the bitmap is present, and whether this is a negated class or not.
2420     */
2421    
2422     case '[':
2423     previous = code;
2424    
2425     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2426     they are encountered at the top level, so we'll do that too. */
2427    
2428     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2429     check_posix_syntax(ptr, &tempptr, cd))
2430     {
2431     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2432     goto FAILED;
2433     }
2434    
2435     /* If the first character is '^', set the negation flag and skip it. */
2436    
2437     if ((c = *(++ptr)) == '^')
2438     {
2439     negate_class = TRUE;
2440     c = *(++ptr);
2441     }
2442     else
2443     {
2444     negate_class = FALSE;
2445     }
2446    
2447     /* Keep a count of chars with values < 256 so that we can optimize the case
2448 nigel 93 of just a single character (as long as it's < 256). However, for higher
2449     valued UTF-8 characters, we don't yet do any optimization. */
2450 nigel 77
2451     class_charcount = 0;
2452     class_lastchar = -1;
2453    
2454 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2455     temporary bit of memory, in case the class contains only 1 character (less
2456     than 256), because in that case the compiled code doesn't use the bit map.
2457     */
2458    
2459     memset(classbits, 0, 32 * sizeof(uschar));
2460    
2461 nigel 77 #ifdef SUPPORT_UTF8
2462     class_utf8 = FALSE; /* No chars >= 256 */
2463 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2464 nigel 77 #endif
2465    
2466     /* Process characters until ] is reached. By writing this as a "do" it
2467 nigel 93 means that an initial ] is taken as a data character. At the start of the
2468     loop, c contains the first byte of the character. */
2469 nigel 77
2470 nigel 93 if (c != 0) do
2471 nigel 77 {
2472 nigel 93 const uschar *oldptr;
2473    
2474 nigel 77 #ifdef SUPPORT_UTF8
2475     if (utf8 && c > 127)
2476     { /* Braces are required because the */
2477     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2478     }
2479     #endif
2480    
2481     /* Inside \Q...\E everything is literal except \E */
2482    
2483     if (inescq)
2484     {
2485 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2486 nigel 77 {
2487 nigel 93 inescq = FALSE; /* Reset literal state */
2488     ptr++; /* Skip the 'E' */
2489     continue; /* Carry on with next */
2490 nigel 77 }
2491 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2492 nigel 77 }
2493    
2494     /* Handle POSIX class names. Perl allows a negation extension of the
2495     form [:^name:]. A square bracket that doesn't match the syntax is
2496     treated as a literal. We also recognize the POSIX constructions
2497     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2498     5.6 and 5.8 do. */
2499    
2500     if (c == '[' &&
2501     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2502     check_posix_syntax(ptr, &tempptr, cd))
2503     {
2504     BOOL local_negate = FALSE;
2505 nigel 87 int posix_class, taboffset, tabopt;
2506 nigel 77 register const uschar *cbits = cd->cbits;
2507 nigel 87 uschar pbits[32];
2508 nigel 77
2509     if (ptr[1] != ':')
2510     {
2511     *errorcodeptr = ERR31;
2512     goto FAILED;
2513     }
2514    
2515     ptr += 2;
2516     if (*ptr == '^')
2517     {
2518     local_negate = TRUE;
2519     ptr++;
2520     }
2521    
2522     posix_class = check_posix_name(ptr, tempptr - ptr);
2523     if (posix_class < 0)
2524     {
2525     *errorcodeptr = ERR30;
2526     goto FAILED;
2527     }
2528    
2529     /* If matching is caseless, upper and lower are converted to
2530     alpha. This relies on the fact that the class table starts with
2531     alpha, lower, upper as the first 3 entries. */
2532    
2533     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2534     posix_class = 0;
2535    
2536 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2537     because we may be adding and subtracting from it, and we don't want to
2538     subtract bits that may be in the main map already. At the end we or the
2539     result into the bit map that is being built. */
2540 nigel 77
2541     posix_class *= 3;
2542 nigel 87
2543     /* Copy in the first table (always present) */
2544    
2545     memcpy(pbits, cbits + posix_class_maps[posix_class],
2546     32 * sizeof(uschar));
2547    
2548     /* If there is a second table, add or remove it as required. */
2549    
2550     taboffset = posix_class_maps[posix_class + 1];
2551     tabopt = posix_class_maps[posix_class + 2];
2552    
2553     if (taboffset >= 0)
2554 nigel 77 {
2555 nigel 87 if (tabopt >= 0)
2556     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2557 nigel 77 else
2558 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2559 nigel 77 }
2560    
2561 nigel 87 /* Now see if we need to remove any special characters. An option
2562     value of 1 removes vertical space and 2 removes underscore. */
2563    
2564     if (tabopt < 0) tabopt = -tabopt;
2565     if (tabopt == 1) pbits[1] &= ~0x3c;
2566     else if (tabopt == 2) pbits[11] &= 0x7f;
2567    
2568     /* Add the POSIX table or its complement into the main table that is
2569     being built and we are done. */
2570    
2571     if (local_negate)
2572     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2573     else
2574     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2575    
2576 nigel 77 ptr = tempptr + 1;
2577     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2578     continue; /* End of POSIX syntax handling */
2579     }
2580    
2581     /* Backslash may introduce a single character, or it may introduce one
2582 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2583     case. Inside a class (and only there) it is treated as backspace.
2584     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2585     to or into the one we are building. We assume they have more than one
2586 nigel 77 character in them, so set class_charcount bigger than one. */
2587    
2588     if (c == '\\')
2589     {
2590 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2591     if (*errorcodeptr != 0) goto FAILED;
2592 nigel 77
2593     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2594     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2595 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2596 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2597     {
2598     if (ptr[1] == '\\' && ptr[2] == 'E')
2599     {
2600     ptr += 2; /* avoid empty string */
2601     }
2602     else inescq = TRUE;
2603     continue;
2604     }
2605    
2606     if (c < 0)
2607     {
2608     register const uschar *cbits = cd->cbits;
2609     class_charcount += 2; /* Greater than 1 is what matters */
2610 nigel 93
2611     /* Save time by not doing this in the pre-compile phase. */
2612    
2613     if (lengthptr == NULL) switch (-c)
2614 nigel 77 {
2615     case ESC_d:
2616     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2617     continue;
2618    
2619     case ESC_D:
2620     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2621     continue;
2622    
2623     case ESC_w:
2624     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2625     continue;
2626    
2627     case ESC_W:
2628     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2629     continue;
2630    
2631     case ESC_s:
2632     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2633     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2634     continue;
2635    
2636     case ESC_S:
2637     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2638     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2639     continue;
2640    
2641 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2642     continue;
2643 ph10 180
2644 nigel 93 default: /* Not recognized; fall through */
2645     break; /* Need "default" setting to stop compiler warning. */
2646     }
2647    
2648     /* In the pre-compile phase, just do the recognition. */
2649    
2650     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2651     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2652 ph10 180
2653 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2654     they use extra memory. */
2655 ph10 180
2656 ph10 178 if (-c == ESC_h)
2657     {
2658     SETBIT(classbits, 0x09); /* VT */
2659     SETBIT(classbits, 0x20); /* SPACE */
2660 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2661 ph10 178 #ifdef SUPPORT_UTF8
2662     if (utf8)
2663 ph10 180 {
2664 ph10 178 class_utf8 = TRUE;
2665     *class_utf8data++ = XCL_SINGLE;
2666 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2667 ph10 178 *class_utf8data++ = XCL_SINGLE;
2668 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2669     *class_utf8data++ = XCL_RANGE;
2670     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2671     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2672 ph10 178 *class_utf8data++ = XCL_SINGLE;
2673 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2674 ph10 178 *class_utf8data++ = XCL_SINGLE;
2675 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2676 ph10 178 *class_utf8data++ = XCL_SINGLE;
2677 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2678     }
2679     #endif
2680     continue;
2681     }
2682 nigel 93
2683 ph10 178 if (-c == ESC_H)
2684     {
2685     for (c = 0; c < 32; c++)
2686     {
2687     int x = 0xff;
2688     switch (c)
2689 ph10 180 {
2690 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2691     case 0x20/8: x ^= 1 << (0x20%8); break;
2692     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2693     default: break;
2694     }
2695     classbits[c] |= x;
2696 ph10 180 }
2697    
2698 ph10 178 #ifdef SUPPORT_UTF8
2699     if (utf8)
2700 ph10 180 {
2701 ph10 178 class_utf8 = TRUE;
2702 ph10 180 *class_utf8data++ = XCL_RANGE;
2703     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2704     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2705     *class_utf8data++ = XCL_RANGE;
2706     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2707     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2708     *class_utf8data++ = XCL_RANGE;
2709     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2710     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2711     *class_utf8data++ = XCL_RANGE;
2712     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2713     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2714     *class_utf8data++ = XCL_RANGE;
2715     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2716     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2717     *class_utf8data++ = XCL_RANGE;
2718     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2719     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2720     *class_utf8data++ = XCL_RANGE;
2721     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2722     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2723     }
2724     #endif
2725     continue;
2726     }
2727 ph10 178
2728     if (-c == ESC_v)
2729     {
2730     SETBIT(classbits, 0x0a); /* LF */
2731     SETBIT(classbits, 0x0b); /* VT */
2732 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2733     SETBIT(classbits, 0x0d); /* CR */
2734     SETBIT(classbits, 0x85); /* NEL */
2735 ph10 178 #ifdef SUPPORT_UTF8
2736     if (utf8)
2737 ph10 180 {
2738 ph10 178 class_utf8 = TRUE;
2739 ph10 180 *class_utf8data++ = XCL_RANGE;
2740     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2741     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2742     }
2743     #endif
2744     continue;
2745     }
2746 ph10 178
2747     if (-c == ESC_V)
2748     {
2749     for (c = 0; c < 32; c++)
2750     {
2751     int x = 0xff;
2752     switch (c)
2753 ph10 180 {
2754 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2755     x ^= 1 << (0x0b%8);
2756     x ^= 1 << (0x0c%8);
2757 ph10 180 x ^= 1 << (0x0d%8);
2758 ph10 178 break;
2759     case 0x85/8: x ^= 1 << (0x85%8); break;
2760     default: break;
2761     }
2762     classbits[c] |= x;
2763 ph10 180 }
2764    
2765 ph10 178 #ifdef SUPPORT_UTF8
2766     if (utf8)
2767 ph10 180 {
2768 ph10 178 class_utf8 = TRUE;
2769 ph10 180 *class_utf8data++ = XCL_RANGE;
2770     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2771     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2772     *class_utf8data++ = XCL_RANGE;
2773     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2774     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2775     }
2776     #endif
2777     continue;
2778     }
2779 ph10 178
2780 nigel 93 /* We need to deal with \P and \p in both phases. */
2781    
2782 nigel 77 #ifdef SUPPORT_UCP
2783 nigel 93 if (-c == ESC_p || -c == ESC_P)
2784     {
2785     BOOL negated;
2786     int pdata;
2787     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2788     if (ptype < 0) goto FAILED;
2789     class_utf8 = TRUE;
2790     *class_utf8data++ = ((-c == ESC_p) != negated)?
2791     XCL_PROP : XCL_NOTPROP;
2792     *class_utf8data++ = ptype;
2793     *class_utf8data++ = pdata;
2794     class_charcount -= 2; /* Not a < 256 character */
2795 nigel 77 continue;
2796 nigel 93 }
2797 nigel 77 #endif
2798 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2799     strict mode. By default, for compatibility with Perl, they are
2800     treated as literals. */
2801 nigel 77
2802 nigel 93 if ((options & PCRE_EXTRA) != 0)
2803     {
2804     *errorcodeptr = ERR7;
2805     goto FAILED;
2806     }
2807 nigel 77
2808 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2809     c = *ptr; /* Get the final character and fall through */
2810 nigel 77 }
2811    
2812     /* Fall through if we have a single character (c >= 0). This may be
2813 nigel 93 greater than 256 in UTF-8 mode. */
2814 nigel 77
2815     } /* End of backslash handling */
2816    
2817     /* A single character may be followed by '-' to form a range. However,
2818     Perl does not permit ']' to be the end of the range. A '-' character
2819 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2820     entirely. The code for handling \Q and \E is messy. */
2821 nigel 77
2822 nigel 93 CHECK_RANGE:
2823     while (ptr[1] == '\\' && ptr[2] == 'E')
2824 nigel 77 {
2825 nigel 93 inescq = FALSE;
2826     ptr += 2;
2827     }
2828    
2829     oldptr = ptr;
2830    
2831     if (!inescq && ptr[1] == '-')
2832     {
2833 nigel 77 int d;
2834     ptr += 2;
2835 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2836 nigel 77
2837 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2838     mode. */
2839    
2840     while (*ptr == '\\' && ptr[1] == 'Q')
2841     {
2842     ptr += 2;
2843     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2844     inescq = TRUE;
2845     break;
2846     }
2847    
2848     if (*ptr == 0 || (!inescq && *ptr == ']'))
2849     {
2850     ptr = oldptr;
2851     goto LONE_SINGLE_CHARACTER;
2852     }
2853    
2854 nigel 77 #ifdef SUPPORT_UTF8
2855     if (utf8)
2856     { /* Braces are required because the */
2857     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2858     }
2859     else
2860     #endif
2861     d = *ptr; /* Not UTF-8 mode */
2862    
2863     /* The second part of a range can be a single-character escape, but
2864     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2865     in such circumstances. */
2866    
2867 nigel 93 if (!inescq && d == '\\')
2868 nigel 77 {
2869 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2870     if (*errorcodeptr != 0) goto FAILED;
2871 nigel 77
2872 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2873     special means the '-' was literal */
2874 nigel 77
2875     if (d < 0)
2876     {
2877     if (d == -ESC_b) d = '\b';
2878 nigel 93 else if (d == -ESC_X) d = 'X';
2879     else if (d == -ESC_R) d = 'R'; else
2880 nigel 77 {
2881 nigel 93 ptr = oldptr;
2882 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2883     }
2884     }
2885     }
2886    
2887 nigel 93 /* Check that the two values are in the correct order. Optimize
2888     one-character ranges */
2889 nigel 77
2890 nigel 93 if (d < c)
2891     {
2892     *errorcodeptr = ERR8;
2893     goto FAILED;
2894     }
2895    
2896 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2897    
2898     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2899     matching, we have to use an XCLASS with extra data items. Caseless
2900     matching for characters > 127 is available only if UCP support is
2901     available. */
2902    
2903     #ifdef SUPPORT_UTF8
2904     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2905     {
2906     class_utf8 = TRUE;
2907    
2908     /* With UCP support, we can find the other case equivalents of
2909     the relevant characters. There may be several ranges. Optimize how
2910     they fit with the basic range. */
2911    
2912     #ifdef SUPPORT_UCP
2913     if ((options & PCRE_CASELESS) != 0)
2914     {
2915 nigel 93 unsigned int occ, ocd;
2916     unsigned int cc = c;
2917     unsigned int origd = d;
2918 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2919     {
2920 ph10 180 if (occ >= (unsigned int)c &&
2921     ocd <= (unsigned int)d)
2922 ph10 176 continue; /* Skip embedded ranges */
2923 nigel 77
2924 ph10 180 if (occ < (unsigned int)c &&
2925 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2926 nigel 77 { /* if there is overlap, */
2927     c = occ; /* noting that if occ < c */
2928     continue; /* we can't have ocd > d */
2929     } /* because a subrange is */
2930 ph10 180 if (ocd > (unsigned int)d &&
2931 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2932 nigel 77 { /* the basic range. */
2933     d = ocd;
2934     continue;
2935     }
2936    
2937     if (occ == ocd)
2938     {
2939     *class_utf8data++ = XCL_SINGLE;
2940     }
2941     else
2942     {
2943     *class_utf8data++ = XCL_RANGE;
2944     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2945     }
2946     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2947     }
2948     }
2949     #endif /* SUPPORT_UCP */
2950    
2951     /* Now record the original range, possibly modified for UCP caseless
2952     overlapping ranges. */
2953    
2954     *class_utf8data++ = XCL_RANGE;
2955     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2956     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2957    
2958     /* With UCP support, we are done. Without UCP support, there is no
2959     caseless matching for UTF-8 characters > 127; we can use the bit map
2960     for the smaller ones. */
2961    
2962     #ifdef SUPPORT_UCP
2963     continue; /* With next character in the class */
2964     #else
2965     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2966    
2967     /* Adjust upper limit and fall through to set up the map */
2968    
2969     d = 127;
2970    
2971     #endif /* SUPPORT_UCP */
2972     }
2973     #endif /* SUPPORT_UTF8 */
2974    
2975     /* We use the bit map for all cases when not in UTF-8 mode; else
2976     ranges that lie entirely within 0-127 when there is UCP support; else
2977     for partial ranges without UCP support. */
2978    
2979 nigel 93 class_charcount += d - c + 1;
2980     class_lastchar = d;
2981    
2982     /* We can save a bit of time by skipping this in the pre-compile. */
2983    
2984     if (lengthptr == NULL) for (; c <= d; c++)
2985 nigel 77 {
2986     classbits[c/8] |= (1 << (c&7));
2987     if ((options & PCRE_CASELESS) != 0)
2988     {
2989     int uc = cd->fcc[c]; /* flip case */
2990     classbits[uc/8] |= (1 << (uc&7));
2991     }
2992     }
2993    
2994     continue; /* Go get the next char in the class */
2995     }
2996    
2997     /* Handle a lone single character - we can get here for a normal
2998     non-escape char, or after \ that introduces a single character or for an
2999     apparent range that isn't. */
3000    
3001     LONE_SINGLE_CHARACTER:
3002    
3003     /* Handle a character that cannot go in the bit map */
3004    
3005     #ifdef SUPPORT_UTF8
3006     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3007     {
3008     class_utf8 = TRUE;
3009     *class_utf8data++ = XCL_SINGLE;
3010     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3011    
3012     #ifdef SUPPORT_UCP
3013     if ((options & PCRE_CASELESS) != 0)
3014     {
3015 nigel 93 unsigned int othercase;
3016     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3017 nigel 77 {
3018     *class_utf8data++ = XCL_SINGLE;
3019     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3020     }
3021     }
3022     #endif /* SUPPORT_UCP */
3023    
3024     }
3025     else
3026     #endif /* SUPPORT_UTF8 */
3027    
3028     /* Handle a single-byte character */
3029     {
3030     classbits[c/8] |= (1 << (c&7));
3031     if ((options & PCRE_CASELESS) != 0)
3032     {
3033     c = cd->fcc[c]; /* flip case */
3034     classbits[c/8] |= (1 << (c&7));
3035     }
3036     class_charcount++;
3037     class_lastchar = c;
3038     }
3039     }
3040    
3041 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3042 nigel 77
3043 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3044 nigel 77
3045 nigel 93 if (c == 0) /* Missing terminating ']' */
3046     {
3047     *errorcodeptr = ERR6;
3048     goto FAILED;
3049     }
3050    
3051 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3052     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3053     can optimize the negative case only if there were no characters >= 128
3054     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3055     single-bytes only. This is an historical hangover. Maybe one day we can
3056     tidy these opcodes to handle multi-byte characters.
3057    
3058     The optimization throws away the bit map. We turn the item into a
3059     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3060     that OP_NOT does not support multibyte characters. In the positive case, it
3061     can cause firstbyte to be set. Otherwise, there can be no first char if
3062     this item is first, whatever repeat count may follow. In the case of
3063     reqbyte, save the previous value for reinstating. */
3064    
3065     #ifdef SUPPORT_UTF8
3066     if (class_charcount == 1 &&
3067     (!utf8 ||
3068     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3069    
3070     #else
3071     if (class_charcount == 1)
3072     #endif
3073     {
3074     zeroreqbyte = reqbyte;
3075    
3076     /* The OP_NOT opcode works on one-byte characters only. */
3077    
3078     if (negate_class)
3079     {
3080     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3081     zerofirstbyte = firstbyte;
3082     *code++ = OP_NOT;
3083     *code++ = class_lastchar;
3084     break;
3085     }
3086    
3087     /* For a single, positive character, get the value into mcbuffer, and
3088     then we can handle this with the normal one-character code. */
3089    
3090     #ifdef SUPPORT_UTF8
3091     if (utf8 && class_lastchar > 127)
3092     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3093     else
3094     #endif
3095     {
3096     mcbuffer[0] = class_lastchar;
3097     mclength = 1;
3098     }
3099     goto ONE_CHAR;
3100     } /* End of 1-char optimization */
3101    
3102     /* The general case - not the one-char optimization. If this is the first
3103     thing in the branch, there can be no first char setting, whatever the
3104     repeat count. Any reqbyte setting must remain unchanged after any kind of
3105     repeat. */
3106    
3107     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3108     zerofirstbyte = firstbyte;
3109     zeroreqbyte = reqbyte;
3110    
3111     /* If there are characters with values > 255, we have to compile an
3112     extended class, with its own opcode. If there are no characters < 256,
3113 nigel 93 we can omit the bitmap in the actual compiled code. */
3114 nigel 77
3115     #ifdef SUPPORT_UTF8
3116     if (class_utf8)
3117     {
3118     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3119     *code++ = OP_XCLASS;
3120     code += LINK_SIZE;
3121     *code = negate_class? XCL_NOT : 0;
3122    
3123 nigel 93 /* If the map is required, move up the extra data to make room for it;
3124     otherwise just move the code pointer to the end of the extra data. */
3125 nigel 77
3126     if (class_charcount > 0)
3127     {
3128     *code++ |= XCL_MAP;
3129 nigel 93 memmove(code + 32, code, class_utf8data - code);
3130 nigel 77 memcpy(code, classbits, 32);
3131 nigel 93 code = class_utf8data + 32;
3132 nigel 77 }
3133 nigel 93 else code = class_utf8data;
3134 nigel 77
3135     /* Now fill in the complete length of the item */
3136    
3137     PUT(previous, 1, code - previous);
3138     break; /* End of class handling */
3139     }
3140     #endif
3141    
3142     /* If there are no characters > 255, negate the 32-byte map if necessary,
3143     and copy it into the code vector. If this is the first thing in the branch,
3144     there can be no first char setting, whatever the repeat count. Any reqbyte
3145     setting must remain unchanged after any kind of repeat. */
3146    
3147     if (negate_class)
3148     {
3149     *code++ = OP_NCLASS;
3150 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3151     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3152 nigel 77 }
3153     else
3154     {
3155     *code++ = OP_CLASS;
3156     memcpy(code, classbits, 32);
3157     }
3158     code += 32;
3159     break;
3160    
3161 nigel 93
3162     /* ===================================================================*/
3163 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3164     has been tested above. */
3165    
3166     case '{':
3167     if (!is_quantifier) goto NORMAL_CHAR;
3168     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3169     if (*errorcodeptr != 0) goto FAILED;
3170     goto REPEAT;
3171    
3172     case '*':
3173     repeat_min = 0;
3174     repeat_max = -1;
3175     goto REPEAT;
3176    
3177     case '+':
3178     repeat_min = 1;
3179     repeat_max = -1;
3180     goto REPEAT;
3181    
3182     case '?':
3183     repeat_min = 0;
3184     repeat_max = 1;
3185    
3186     REPEAT:
3187     if (previous == NULL)
3188     {
3189     *errorcodeptr = ERR9;
3190     goto FAILED;
3191     }
3192    
3193     if (repeat_min == 0)
3194     {
3195     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3196     reqbyte = zeroreqbyte; /* Ditto */
3197     }
3198    
3199     /* Remember whether this is a variable length repeat */
3200    
3201     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3202    
3203     op_type = 0; /* Default single-char op codes */
3204     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3205    
3206     /* Save start of previous item, in case we have to move it up to make space
3207     for an inserted OP_ONCE for the additional '+' extension. */
3208    
3209     tempcode = previous;
3210    
3211     /* If the next character is '+', we have a possessive quantifier. This
3212     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3213     If the next character is '?' this is a minimizing repeat, by default,
3214     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3215     repeat type to the non-default. */
3216    
3217     if (ptr[1] == '+')
3218     {
3219     repeat_type = 0; /* Force greedy */
3220     possessive_quantifier = TRUE;
3221     ptr++;
3222     }
3223     else if (ptr[1] == '?')
3224     {
3225     repeat_type = greedy_non_default;
3226     ptr++;
3227     }
3228     else repeat_type = greedy_default;
3229    
3230     /* If previous was a character match, abolish the item and generate a
3231     repeat item instead. If a char item has a minimum of more than one, ensure
3232     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3233     the first thing in a branch because the x will have gone into firstbyte
3234     instead. */
3235    
3236     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3237     {
3238     /* Deal with UTF-8 characters that take up more than one byte. It's
3239     easier to write this out separately than try to macrify it. Use c to
3240     hold the length of the character in bytes, plus 0x80 to flag that it's a
3241     length rather than a small character. */
3242    
3243     #ifdef SUPPORT_UTF8
3244     if (utf8 && (code[-1] & 0x80) != 0)
3245     {
3246     uschar *lastchar = code - 1;
3247     while((*lastchar & 0xc0) == 0x80) lastchar--;
3248     c = code - lastchar; /* Length of UTF-8 character */
3249     memcpy(utf8_char, lastchar, c); /* Save the char */
3250     c |= 0x80; /* Flag c as a length */
3251     }
3252     else
3253     #endif
3254    
3255     /* Handle the case of a single byte - either with no UTF8 support, or
3256     with UTF-8 disabled, or for a UTF-8 character < 128. */
3257    
3258     {
3259     c = code[-1];
3260     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3261     }
3262    
3263 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3264     the line is something that cannot possibly match this character. If so,
3265     automatically possessifying this item gains some performance in the case
3266     where the match fails. */
3267    
3268     if (!possessive_quantifier &&
3269     repeat_max < 0 &&
3270     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3271     options, cd))
3272     {
3273     repeat_type = 0; /* Force greedy */
3274     possessive_quantifier = TRUE;
3275     }
3276    
3277 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3278     }
3279    
3280     /* If previous was a single negated character ([^a] or similar), we use
3281     one of the special opcodes, replacing it. The code is shared with single-
3282     character repeats by setting op_type to add a suitable offset into
3283 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3284     currently used only for single-byte chars. */
3285 nigel 77
3286     else if (*previous == OP_NOT)
3287     {
3288     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3289     c = previous[1];
3290 nigel 93 if (!possessive_quantifier &&
3291     repeat_max < 0 &&
3292     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3293     {
3294     repeat_type = 0; /* Force greedy */
3295     possessive_quantifier = TRUE;
3296     }
3297 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3298     }
3299    
3300     /* If previous was a character type match (\d or similar), abolish it and
3301     create a suitable repeat item. The code is shared with single-character
3302     repeats by setting op_type to add a suitable offset into repeat_type. Note
3303     that the Unicode property types will be present only when SUPPORT_UCP is
3304     defined, but we don't wrap the little bits of code here because it just
3305     makes it horribly messy. */
3306    
3307     else if (*previous < OP_EODN)
3308     {
3309     uschar *oldcode;
3310 nigel 87 int prop_type, prop_value;
3311 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3312     c = *previous;
3313    
3314 nigel 93 if (!possessive_quantifier &&
3315     repeat_max < 0 &&
3316     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3317     {
3318     repeat_type = 0; /* Force greedy */
3319     possessive_quantifier = TRUE;
3320     }
3321    
3322 nigel 77 OUTPUT_SINGLE_REPEAT:
3323 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3324     {
3325     prop_type = previous[1];
3326     prop_value = previous[2];
3327     }
3328     else prop_type = prop_value = -1;
3329 nigel 77
3330     oldcode = code;
3331     code = previous; /* Usually overwrite previous item */
3332    
3333     /* If the maximum is zero then the minimum must also be zero; Perl allows
3334     this case, so we do too - by simply omitting the item altogether. */
3335    
3336     if (repeat_max == 0) goto END_REPEAT;
3337    
3338     /* All real repeats make it impossible to handle partial matching (maybe
3339     one day we will be able to remove this restriction). */
3340    
3341     if (repeat_max != 1) cd->nopartial = TRUE;
3342    
3343     /* Combine the op_type with the repeat_type */
3344    
3345     repeat_type += op_type;
3346    
3347     /* A minimum of zero is handled either as the special case * or ?, or as
3348     an UPTO, with the maximum given. */
3349    
3350     if (repeat_min == 0)
3351     {
3352     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3353     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3354     else
3355     {
3356     *code++ = OP_UPTO + repeat_type;
3357     PUT2INC(code, 0, repeat_max);
3358     }
3359     }
3360    
3361     /* A repeat minimum of 1 is optimized into some special cases. If the
3362 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3363 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3364     one less than the maximum. */
3365    
3366     else if (repeat_min == 1)
3367     {
3368     if (repeat_max == -1)
3369     *code++ = OP_PLUS + repeat_type;
3370     else
3371     {
3372     code = oldcode; /* leave previous item in place */
3373     if (repeat_max == 1) goto END_REPEAT;
3374     *code++ = OP_UPTO + repeat_type;
3375     PUT2INC(code, 0, repeat_max - 1);
3376     }
3377     }
3378    
3379     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3380     handled as an EXACT followed by an UPTO. */
3381    
3382     else
3383     {
3384     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3385     PUT2INC(code, 0, repeat_min);
3386    
3387     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3388     we have to insert the character for the previous code. For a repeated
3389 nigel 87 Unicode property match, there are two extra bytes that define the
3390 nigel 77 required property. In UTF-8 mode, long characters have their length in
3391     c, with the 0x80 bit as a flag. */
3392    
3393     if (repeat_max < 0)
3394     {
3395     #ifdef SUPPORT_UTF8
3396     if (utf8 && c >= 128)
3397     {
3398     memcpy(code, utf8_char, c & 7);
3399     code += c & 7;
3400     }
3401     else
3402     #endif
3403     {
3404     *code++ = c;
3405 nigel 87 if (prop_type >= 0)
3406     {
3407     *code++ = prop_type;
3408     *code++ = prop_value;
3409     }
3410 nigel 77 }
3411     *code++ = OP_STAR + repeat_type;
3412     }
3413    
3414     /* Else insert an UPTO if the max is greater than the min, again
3415 nigel 93 preceded by the character, for the previously inserted code. If the
3416     UPTO is just for 1 instance, we can use QUERY instead. */
3417 nigel 77
3418     else if (repeat_max != repeat_min)
3419     {
3420     #ifdef SUPPORT_UTF8
3421     if (utf8 && c >= 128)
3422     {
3423     memcpy(code, utf8_char, c & 7);
3424     code += c & 7;
3425     }
3426     else
3427     #endif
3428     *code++ = c;
3429 nigel 87 if (prop_type >= 0)
3430     {
3431     *code++ = prop_type;
3432     *code++ = prop_value;
3433     }
3434 nigel 77 repeat_max -= repeat_min;
3435 nigel 93
3436     if (repeat_max == 1)
3437     {
3438     *code++ = OP_QUERY + repeat_type;
3439     }
3440     else
3441     {
3442     *code++ = OP_UPTO + repeat_type;
3443     PUT2INC(code, 0, repeat_max);
3444     }
3445 nigel 77 }
3446     }
3447    
3448     /* The character or character type itself comes last in all cases. */
3449    
3450     #ifdef SUPPORT_UTF8
3451     if (utf8 && c >= 128)
3452     {
3453     memcpy(code, utf8_char, c & 7);
3454     code += c & 7;
3455     }
3456     else
3457     #endif
3458     *code++ = c;
3459    
3460 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3461     define the required property. */
3462 nigel 77
3463     #ifdef SUPPORT_UCP
3464 nigel 87 if (prop_type >= 0)
3465     {
3466     *code++ = prop_type;
3467     *code++ = prop_value;
3468     }
3469 nigel 77 #endif
3470     }
3471    
3472     /* If previous was a character class or a back reference, we put the repeat
3473     stuff after it, but just skip the item if the repeat was {0,0}. */
3474    
3475     else if (*previous == OP_CLASS ||
3476     *previous == OP_NCLASS ||
3477     #ifdef SUPPORT_UTF8
3478     *previous == OP_XCLASS ||
3479     #endif
3480     *previous == OP_REF)
3481     {
3482     if (repeat_max == 0)
3483     {
3484     code = previous;
3485     goto END_REPEAT;
3486     }
3487    
3488     /* All real repeats make it impossible to handle partial matching (maybe
3489     one day we will be able to remove this restriction). */
3490    
3491     if (repeat_max != 1) cd->nopartial = TRUE;
3492    
3493     if (repeat_min == 0 && repeat_max == -1)
3494     *code++ = OP_CRSTAR + repeat_type;
3495     else if (repeat_min == 1 && repeat_max == -1)
3496     *code++ = OP_CRPLUS + repeat_type;
3497     else if (repeat_min == 0 && repeat_max == 1)
3498     *code++ = OP_CRQUERY + repeat_type;
3499     else
3500     {
3501     *code++ = OP_CRRANGE + repeat_type;
3502     PUT2INC(code, 0, repeat_min);
3503     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3504     PUT2INC(code, 0, repeat_max);
3505     }
3506     }
3507    
3508     /* If previous was a bracket group, we may have to replicate it in certain
3509     cases. */
3510    
3511 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3512     *previous == OP_ONCE || *previous == OP_COND)
3513 nigel 77 {
3514     register int i;
3515     int ketoffset = 0;
3516     int len = code - previous;
3517     uschar *bralink = NULL;
3518    
3519 nigel 93 /* Repeating a DEFINE group is pointless */
3520    
3521     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3522     {
3523     *errorcodeptr = ERR55;
3524     goto FAILED;
3525     }
3526    
3527     /* This is a paranoid check to stop integer overflow later on */
3528    
3529     if (len > MAX_DUPLENGTH)
3530     {
3531     *errorcodeptr = ERR50;
3532     goto FAILED;
3533     }
3534    
3535 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3536     by scanning through from the start, and compute the offset back to it
3537     from the current code pointer. There may be an OP_OPT setting following
3538     the final KET, so we can't find the end just by going back from the code
3539     pointer. */
3540    
3541     if (repeat_max == -1)
3542     {
3543     register uschar *ket = previous;
3544     do ket += GET(ket, 1); while (*ket != OP_KET);
3545     ketoffset = code - ket;
3546     }
3547    
3548     /* The case of a zero minimum is special because of the need to stick
3549     OP_BRAZERO in front of it, and because the group appears once in the
3550     data, whereas in other cases it appears the minimum number of times. For
3551     this reason, it is simplest to treat this case separately, as otherwise
3552     the code gets far too messy. There are several special subcases when the
3553     minimum is zero. */
3554    
3555     if (repeat_min == 0)
3556     {
3557     /* If the maximum is also zero, we just omit the group from the output
3558     altogether. */
3559    
3560     if (repeat_max == 0)
3561     {
3562     code = previous;
3563     goto END_REPEAT;
3564     }
3565    
3566     /* If the maximum is 1 or unlimited, we just have to stick in the
3567     BRAZERO and do no more at this point. However, we do need to adjust
3568     any OP_RECURSE calls inside the group that refer to the group itself or
3569 nigel 93 any internal or forward referenced group, because the offset is from
3570     the start of the whole regex. Temporarily terminate the pattern while
3571     doing this. */
3572 nigel 77
3573     if (repeat_max <= 1)
3574     {
3575     *code = OP_END;
3576 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3577 nigel 77 memmove(previous+1, previous, len);
3578     code++;
3579     *previous++ = OP_BRAZERO + repeat_type;
3580     }
3581    
3582     /* If the maximum is greater than 1 and limited, we have to replicate
3583     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3584     The first one has to be handled carefully because it's the original
3585     copy, which has to be moved up. The remainder can be handled by code
3586     that is common with the non-zero minimum case below. We have to
3587     adjust the value of repeat_max, since one less copy is required. Once
3588     again, we may have to adjust any OP_RECURSE calls inside the group. */
3589    
3590     else
3591     {
3592     int offset;
3593     *code = OP_END;
3594 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3595 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3596     code += 2 + LINK_SIZE;
3597     *previous++ = OP_BRAZERO + repeat_type;
3598     *previous++ = OP_BRA;
3599    
3600     /* We chain together the bracket offset fields that have to be
3601     filled in later when the ends of the brackets are reached. */
3602    
3603     offset = (bralink == NULL)? 0 : previous - bralink;
3604     bralink = previous;
3605     PUTINC(previous, 0, offset);
3606     }
3607    
3608     repeat_max--;
3609     }
3610    
3611     /* If the minimum is greater than zero, replicate the group as many
3612     times as necessary, and adjust the maximum to the number of subsequent
3613     copies that we need. If we set a first char from the group, and didn't
3614 nigel 93 set a required char, copy the latter from the former. If there are any
3615     forward reference subroutine calls in the group, there will be entries on
3616     the workspace list; replicate these with an appropriate increment. */
3617 nigel 77
3618     else
3619     {
3620     if (repeat_min > 1)
3621     {
3622 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3623     just adjust the length as if we had. */
3624    
3625     if (lengthptr != NULL)
3626     *lengthptr += (repeat_min - 1)*length_prevgroup;
3627    
3628     /* This is compiling for real */
3629    
3630     else
3631 nigel 77 {
3632 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3633     for (i = 1; i < repeat_min; i++)
3634     {
3635     uschar *hc;
3636     uschar *this_hwm = cd->hwm;
3637     memcpy(code, previous, len);
3638     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3639     {
3640     PUT(cd->hwm, 0, GET(hc, 0) + len);
3641     cd->hwm += LINK_SIZE;
3642     }
3643     save_hwm = this_hwm;
3644     code += len;
3645     }
3646 nigel 77 }
3647     }
3648 nigel 93
3649 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3650     }
3651    
3652     /* This code is common to both the zero and non-zero minimum cases. If
3653     the maximum is limited, it replicates the group in a nested fashion,
3654     remembering the bracket starts on a stack. In the case of a zero minimum,
3655     the first one was set up above. In all cases the repeat_max now specifies
3656 nigel 93 the number of additional copies needed. Again, we must remember to
3657     replicate entries on the forward reference list. */
3658 nigel 77
3659     if (repeat_max >= 0)
3660     {
3661 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3662     just adjust the length as if we had. For each repetition we must add 1
3663     to the length for BRAZERO and for all but the last repetition we must
3664     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3665    
3666     if (lengthptr != NULL && repeat_max > 0)
3667     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3668     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3669    
3670     /* This is compiling for real */
3671    
3672     else for (i = repeat_max - 1; i >= 0; i--)
3673 nigel 77 {
3674 nigel 93 uschar *hc;
3675     uschar *this_hwm = cd->hwm;
3676    
3677 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3678    
3679     /* All but the final copy start a new nesting, maintaining the
3680     chain of brackets outstanding. */
3681    
3682     if (i != 0)
3683     {
3684     int offset;
3685     *code++ = OP_BRA;
3686     offset = (bralink == NULL)? 0 : code - bralink;
3687     bralink = code;
3688     PUTINC(code, 0, offset);
3689     }
3690    
3691     memcpy(code, previous, len);
3692 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3693     {
3694     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3695     cd->hwm += LINK_SIZE;
3696     }
3697     save_hwm = this_hwm;
3698 nigel 77 code += len;
3699     }
3700    
3701     /* Now chain through the pending brackets, and fill in their length
3702     fields (which are holding the chain links pro tem). */
3703    
3704     while (bralink != NULL)
3705     {
3706     int oldlinkoffset;
3707     int offset = code - bralink + 1;
3708     uschar *bra = code - offset;
3709     oldlinkoffset = GET(bra, 1);
3710     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3711     *code++ = OP_KET;
3712     PUTINC(code, 0, offset);
3713     PUT(bra, 1, offset);
3714     }
3715     }
3716    
3717     /* If the maximum is unlimited, set a repeater in the final copy. We
3718     can't just offset backwards from the current code point, because we
3719     don't know if there's been an options resetting after the ket. The
3720 nigel 93 correct offset was computed above.
3721 nigel 77
3722 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3723     this group is a non-atomic one that could match an empty string. If so,
3724     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3725     that runtime checking can be done. [This check is also applied to
3726     atomic groups at runtime, but in a different way.] */
3727    
3728     else
3729     {
3730     uschar *ketcode = code - ketoffset;
3731     uschar *bracode = ketcode - GET(ketcode, 1);
3732     *ketcode = OP_KETRMAX + repeat_type;
3733     if (lengthptr == NULL && *bracode != OP_ONCE)
3734     {
3735     uschar *scode = bracode;
3736     do
3737     {
3738     if (could_be_empty_branch(scode, ketcode, utf8))
3739     {
3740     *bracode += OP_SBRA - OP_BRA;
3741     break;
3742     }
3743     scode += GET(scode, 1);
3744     }
3745     while (*scode == OP_ALT);
3746     }
3747     }
3748 nigel 77 }
3749    
3750     /* Else there's some kind of shambles */
3751    
3752     else
3753     {
3754     *errorcodeptr = ERR11;
3755     goto FAILED;
3756     }
3757    
3758 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3759     tests above succeeded, possessive_quantifier is TRUE. For some of the
3760     simpler opcodes, there is a special alternative opcode for this. For
3761     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3762     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3763     but the special opcodes can optimize it a bit. The repeated item starts at
3764     tempcode, not at previous, which might be the first part of a string whose
3765     (former) last char we repeated.
3766 nigel 77
3767 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3768     an 'upto' may follow. We skip over an 'exact' item, and then test the
3769     length of what remains before proceeding. */
3770    
3771 nigel 77 if (possessive_quantifier)
3772     {
3773 nigel 93 int len;
3774     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3775     *tempcode == OP_NOTEXACT)
3776     tempcode += _pcre_OP_lengths[*tempcode];
3777     len = code - tempcode;
3778     if (len > 0) switch (*tempcode)
3779     {
3780     case OP_STAR: *tempcode = OP_POSSTAR; break;
3781     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3782     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3783     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3784    
3785     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3786     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3787     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3788     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3789    
3790     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3791     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3792     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3793     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3794    
3795     default:
3796     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3797     code += 1 + LINK_SIZE;
3798     len += 1 + LINK_SIZE;
3799     tempcode[0] = OP_ONCE;
3800     *code++ = OP_KET;
3801     PUTINC(code, 0, len);
3802     PUT(tempcode, 1, len);
3803     break;
3804     }
3805 nigel 77 }
3806    
3807     /* In all cases we no longer have a previous item. We also set the
3808     "follows varying string" flag for subsequently encountered reqbytes if
3809     it isn't already set and we have just passed a varying length item. */
3810    
3811     END_REPEAT:
3812     previous = NULL;
3813     cd->req_varyopt |= reqvary;
3814     break;
3815    
3816    
3817 nigel 93 /* ===================================================================*/
3818     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3819     lookbehind or option setting or condition or all the other extended
3820     parenthesis forms. First deal with the specials; all are introduced by ?,
3821     and the appearance of any of them means that this is not a capturing
3822     group. */
3823 nigel 77
3824     case '(':
3825     newoptions = options;
3826     skipbytes = 0;
3827 nigel 93 bravalue = OP_CBRA;
3828     save_hwm = cd->hwm;
3829 ph10 180 reset_bracount = FALSE;
3830 nigel 77
3831     if (*(++ptr) == '?')
3832     {
3833 nigel 93 int i, set, unset, namelen;
3834 nigel 77 int *optset;
3835 nigel 93 const uschar *name;
3836     uschar *slot;
3837 nigel 77
3838     switch (*(++ptr))
3839     {
3840     case '#': /* Comment; skip to ket */
3841     ptr++;
3842 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3843     if (*ptr == 0)
3844     {
3845     *errorcodeptr = ERR18;
3846     goto FAILED;
3847     }
3848 nigel 77 continue;
3849    
3850 nigel 93
3851     /* ------------------------------------------------------------ */
3852 ph10 175 case '|': /* Reset capture count for each branch */
3853     reset_bracount = TRUE;
3854 ph10 180 /* Fall through */
3855 ph10 175
3856     /* ------------------------------------------------------------ */
3857 nigel 93 case ':': /* Non-capturing bracket */
3858 nigel 77 bravalue = OP_BRA;
3859     ptr++;
3860     break;
3861    
3862 nigel 93
3863     /* ------------------------------------------------------------ */
3864 nigel 77 case '(':
3865     bravalue = OP_COND; /* Conditional group */
3866    
3867 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3868     group), a name (referring to a named group), or 'R', referring to
3869     recursion. R<digits> and R&name are also permitted for recursion tests.
3870 nigel 77
3871 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3872     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3873    
3874     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3875     be the recursive thing or the name 'R' (and similarly for 'R' followed
3876     by digits), and (b) a number could be a name that consists of digits.
3877     In both cases, we look for a name first; if not found, we try the other
3878     cases. */
3879    
3880     /* For conditions that are assertions, check the syntax, and then exit
3881     the switch. This will take control down to where bracketed groups,
3882     including assertions, are processed. */
3883    
3884     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3885     break;
3886    
3887     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3888     below), and all need to skip 3 bytes at the start of the group. */
3889    
3890     code[1+LINK_SIZE] = OP_CREF;
3891     skipbytes = 3;
3892 ph10 172 refsign = -1;
3893 nigel 93
3894     /* Check for a test for recursion in a named group. */
3895    
3896     if (ptr[1] == 'R' && ptr[2] == '&')
3897 nigel 77 {
3898 nigel 93 terminator = -1;
3899     ptr += 2;
3900     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3901     }
3902 nigel 91
3903 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3904     syntax (?(<name>) or (?('name') */
3905 nigel 91
3906 nigel 93 else if (ptr[1] == '<')
3907     {
3908     terminator = '>';
3909     ptr++;
3910     }
3911     else if (ptr[1] == '\'')
3912     {
3913     terminator = '\'';
3914     ptr++;
3915     }
3916 ph10 172 else
3917 ph10 167 {
3918     terminator = 0;
3919 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3920     }
3921 nigel 77
3922 nigel 93 /* We now expect to read a name; anything else is an error */
3923 nigel 77
3924 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3925     {
3926     ptr += 1; /* To get the right offset */
3927     *errorcodeptr = ERR28;
3928     goto FAILED;
3929     }
3930    
3931     /* Read the name, but also get it as a number if it's all digits */
3932    
3933     recno = 0;
3934     name = ++ptr;
3935     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3936     {
3937     if (recno >= 0)
3938     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3939     recno * 10 + *ptr - '0' : -1;
3940 nigel 91 ptr++;
3941 nigel 93 }
3942     namelen = ptr - name;
3943 nigel 91
3944 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3945     {
3946     ptr--; /* Error offset */
3947     *errorcodeptr = ERR26;
3948     goto FAILED;
3949     }
3950 nigel 91
3951 nigel 93 /* Do no further checking in the pre-compile phase. */
3952 nigel 91
3953 nigel 93 if (lengthptr != NULL) break;
3954 nigel 91
3955 nigel 93 /* In the real compile we do the work of looking for the actual
3956 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3957     be digits, in which case recno will be set. */
3958 ph10 172
3959 ph10 167 if (refsign > 0)
3960     {
3961     if (recno <= 0)
3962     {
3963     *errorcodeptr = ERR58;
3964     goto FAILED;
3965 ph10 172 }
3966 ph10 167 if (refsign == '-')
3967     {
3968 ph10 172 recno = cd->bracount - recno + 1;
3969 ph10 167 if (recno <= 0)
3970     {
3971     *errorcodeptr = ERR15;
3972     goto FAILED;
3973 ph10 172 }
3974 ph10 167 }
3975 ph10 172 else recno += cd->bracount;
3976 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3977     break;
3978 ph10 172 }
3979 nigel 91
3980 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3981     name. */
3982 ph10 172
3983 nigel 93 slot = cd->name_table;
3984     for (i = 0; i < cd->names_found; i++)
3985     {
3986     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3987     slot += cd->name_entry_size;
3988     }
3989 nigel 91
3990 nigel 93 /* Found a previous named subpattern */
3991 nigel 91
3992 nigel 93 if (i < cd->names_found)
3993     {
3994     recno = GET2(slot, 0);
3995     PUT2(code, 2+LINK_SIZE, recno);
3996     }
3997 nigel 91
3998 nigel 93 /* Search the pattern for a forward reference */
3999 nigel 91
4000 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4001     (options & PCRE_EXTENDED) != 0)) > 0)
4002     {
4003     PUT2(code, 2+LINK_SIZE, i);
4004     }
4005 nigel 91
4006 nigel 93 /* If terminator == 0 it means that the name followed directly after
4007     the opening parenthesis [e.g. (?(abc)...] and in this case there are
4008     some further alternatives to try. For the cases where terminator != 0
4009     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4010     now checked all the possibilities, so give an error. */
4011 nigel 91
4012 nigel 93 else if (terminator != 0)
4013     {
4014     *errorcodeptr = ERR15;
4015     goto FAILED;
4016     }
4017    
4018     /* Check for (?(R) for recursion. Allow digits after R to specify a
4019     specific group number. */
4020    
4021     else if (*name == 'R')
4022     {
4023     recno = 0;
4024     for (i = 1; i < namelen; i++)
4025 nigel 91 {
4026 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
4027     {
4028     *errorcodeptr = ERR15;
4029     goto FAILED;
4030     }
4031     recno = recno * 10 + name[i] - '0';
4032 nigel 77 }
4033 nigel 93 if (recno == 0) recno = RREF_ANY;
4034     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4035     PUT2(code, 2+LINK_SIZE, recno);
4036 nigel 77 }
4037 nigel 91
4038 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
4039     false. */
4040 nigel 91
4041 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4042     {
4043     code[1+LINK_SIZE] = OP_DEF;
4044     skipbytes = 1;
4045     }
4046    
4047     /* Check for the "name" actually being a subpattern number. */
4048    
4049     else if (recno > 0)
4050     {
4051     PUT2(code, 2+LINK_SIZE, recno);
4052     }
4053    
4054     /* Either an unidentified subpattern, or a reference to (?(0) */
4055    
4056     else
4057     {
4058     *errorcodeptr = (recno == 0)? ERR35: ERR15;
4059     goto FAILED;
4060     }
4061 nigel 77 break;
4062    
4063 nigel 93
4064     /* ------------------------------------------------------------ */
4065 nigel 77 case '=': /* Positive lookahead */
4066     bravalue = OP_ASSERT;
4067     ptr++;
4068     break;
4069    
4070 nigel 93
4071     /* ------------------------------------------------------------ */
4072 nigel 77 case '!': /* Negative lookahead */
4073     bravalue = OP_ASSERT_NOT;
4074     ptr++;
4075     break;
4076    
4077 nigel 93
4078     /* ------------------------------------------------------------ */
4079     case '<': /* Lookbehind or named define */
4080     switch (ptr[1])
4081 nigel 77 {
4082     case '=': /* Positive lookbehind */
4083     bravalue = OP_ASSERTBACK;
4084 nigel 93 ptr += 2;
4085 nigel 77 break;
4086    
4087     case '!': /* Negative lookbehind */
4088     bravalue = OP_ASSERTBACK_NOT;
4089 nigel 93 ptr += 2;
4090 nigel 77 break;
4091 nigel 93
4092     default: /* Could be name define, else bad */
4093     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4094     ptr++; /* Correct offset for error */
4095     *errorcodeptr = ERR24;
4096     goto FAILED;
4097 nigel 77 }
4098     break;
4099    
4100 nigel 93
4101     /* ------------------------------------------------------------ */
4102 nigel 77 case '>': /* One-time brackets */
4103     bravalue = OP_ONCE;
4104     ptr++;
4105     break;
4106    
4107 nigel 93
4108     /* ------------------------------------------------------------ */
4109 nigel 77 case 'C': /* Callout - may be followed by digits; */
4110     previous_callout = code; /* Save for later completion */
4111     after_manual_callout = 1; /* Skip one item before completing */
4112 nigel 93 *code++ = OP_CALLOUT;
4113     {
4114 nigel 77 int n = 0;
4115     while ((digitab[*(++ptr)] & ctype_digit) != 0)
4116     n = n * 10 + *ptr - '0';
4117 nigel