/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 195 - (hide annotations) (download)
Mon Jul 30 13:23:28 2007 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 183655 byte(s)
Add words about EBCDIC to doc and ./configure --help (somebody thought it might
be a useful option and tried it on an ASCII system). Fixed one missing table
entry for EBCDIC. 

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
/* The three names below are defined before pcre_internal.h is included.
NOTE(review): presumably macros in that header expand NLBLOCK, PSSTART and
PSEND to reach the newline data and the pattern bounds - confirm in the
header. */
45 nigel 93 #define NLBLOCK cd /* Block containing newline information */
46     #define PSSTART start_pattern /* Field containing processed string start */
47     #define PSEND end_pattern /* Field containing processed string end */
48    
49    
50 nigel 77 #include "pcre_internal.h"
51    
52    
53 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
54     used by pcretest. DEBUG is not defined when building a production library. */
55    
56     #ifdef DEBUG
57     #include "pcre_printint.src"
58     #endif
59    
60    
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8). Both arguments and the whole
expansion are parenthesized so that an expression argument such as (c + 1)
selects the intended bit. Note that b is evaluated twice, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
64    
65    
66 nigel 77 /*************************************************
67     * Code parameters and static tables *
68     *************************************************/
69    
70 nigel 93 /* This value specifies the size of stack workspace that is used during the
71     first pre-compile phase that determines how much memory is required. The regex
72     is partly compiled into this space, but the compiled parts are discarded as
73     soon as they can be, so that hopefully there will never be an overrun. The code
74     does, however, check for an overrun. The largest amount I've seen used is 218,
75     so this number is very generous.
76 nigel 77
77 nigel 93 The same workspace is used during the second, actual compile phase for
78     remembering forward references to groups so that they can be filled in at the
79     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
80     is 4 there is plenty of room. */
81 nigel 77
82 nigel 93 #define COMPILE_WORK_SIZE (4096) /* Workspace size shared by both compile phases */
83 nigel 77
84 nigel 93
85 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
86     are simple data values; negative values are for special things like \d and so
87     on. Zero means further processing is needed (for things like \x), or the escape
88     is invalid. */
89    
90 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII variant. check_escape() indexes this table with (c - '0'), so the
first entry is for '0' and the last is for 'z'. A zero entry makes
check_escape() fall into its switch for further processing. */
91 nigel 77 static const short int escapes[] = {
92     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
93     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
94     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
95 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
96     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
97 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
98     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
99 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
100     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
101 nigel 77 0, 0, -ESC_z /* x - z */
102     };
103    
104 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
105 nigel 77 static const short int escapes[] = {
106     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
107     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
108     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
109     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
110     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
111     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
112     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
113     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
114 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
115 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
116 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
117 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
118 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
119     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
120     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
121     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
122 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
123 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
124 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
125 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
126 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
127     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
128     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
129     };
130     #endif
131    
132    
133     /* Tables of names of POSIX character classes and their lengths. The list is
134 nigel 87 terminated by a zero length entry. The first three must be alpha, lower, upper,
135 nigel 77 as this is assumed for handling case independence. */
136    
/* Class names; posix_class_maps lists its triples in this same order. */
137     static const char *const posix_names[] = {
138     "alpha", "lower", "upper",
139     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
140     "print", "punct", "space", "word", "xdigit" };
141    
/* posix_name_lengths[i] is the length of posix_names[i]; the final 0 is the
list terminator, so this table has one more entry than posix_names. */
142     static const uschar posix_name_lengths[] = {
143     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
144    
145 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
146     base map, with an optional addition or removal of another map. Then, for some
147     classes, there is some additional tweaking: for [:blank:] the vertical space
148     characters are removed, and for [:alpha:] and [:alnum:] the underscore
149     character is removed. The triples in the table consist of the base map offset,
150     second map offset or -1 if no second map, and a non-negative value for map
151     addition or a negative value for map subtraction (if there are two maps). The
152     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
153     remove vertical space characters, 2 => remove underscore. */
154 nigel 77
/* Three ints per class (base map, second map or -1, add/subtract + tweak),
one triple for each entry of posix_names and in the same order. */
155     static const int posix_class_maps[] = {
156 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
157     cbit_lower, -1, 0, /* lower */
158     cbit_upper, -1, 0, /* upper */
159     cbit_word, -1, 2, /* alnum - word without underscore */
160     cbit_print, cbit_cntrl, 0, /* ascii */
161     cbit_space, -1, 1, /* blank - a GNU extension */
162     cbit_cntrl, -1, 0, /* cntrl */
163     cbit_digit, -1, 0, /* digit */
164     cbit_graph, -1, 0, /* graph */
165     cbit_print, -1, 0, /* print */
166     cbit_punct, -1, 0, /* punct */
167     cbit_space, -1, 0, /* space */
168     cbit_word, -1, 0, /* word - a Perl extension */
169     cbit_xdigit,-1, 0 /* xdigit */
170 nigel 77 };
171    
172    
/* Two-level stringification: XSTRING(x) expands x first and then turns the
result into a string literal, so that numeric limits can be embedded in the
messages below. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages, indexed by error number. These
are "char *" because they are passed to the outside world. Do not ever re-use
any error number, because they are documented. Always add a new error instead.
Messages marked DEAD below are no longer used.

Both the strings and the array of pointers are const (as for posix_names):
the table is read-only data and nothing may redirect an entry at run time. */

static const char * const error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",  /** DEAD **/
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",  /** DEAD **/
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number or name after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?[+-]digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",  /** DEAD **/
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error in subpattern name (missing terminator)",
  "two named subpatterns have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p",
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
  /* 50 */
  "repeated subpattern is too long",
  "octal value is greater than \\377 (not in UTF-8 mode)",
  "internal error: overran compiling workspace",
  "internal error: previously-checked referenced subpattern not found",
  "DEFINE group contains more than one branch",
  /* 55 */
  "repeating a DEFINE group is not allowed",
  "inconsistent NEWLINE options",
  "\\g is not followed by a braced name or an optionally braced non-zero number",
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number"
};
253    
254    
255     /* Table to identify digits and hex digits. This is used when compiling
256     patterns. Note that the tables in chartables are dependent on the locale, and
257     may mark arbitrary characters as digits - but the PCRE compiling code expects
258     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
259     a private table here. It costs 256 bytes, but it is a lot faster than doing
260     character value tests (at least in some simple cases I timed), and in some
261     applications one wants PCRE to compile efficiently as well as match
262     efficiently.
263    
264     For convenience, we use the same bit definitions as in chartables:
265    
266     0x04 decimal digit
267     0x08 hexadecimal digit
268    
269     Then we can use ctype_digit and ctype_xdigit in the code. */
270    
271 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII variant, one entry per character value 0-255. 0x0c marks a decimal
digit (which is also a hex digit); 0x08 marks a hex-only digit (A-F, a-f). */
272 nigel 77 static const unsigned char digitab[] =
273     {
274     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
275     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
276     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
280     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
281     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
282     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
283     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
285     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
286     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
288     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
289     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
290     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
291     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
292     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
293     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
294     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
295     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
296     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
297     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
298     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
299     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
300     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
301     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
302     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
303     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
304     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
305     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
306    
307 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC variant, one entry per character value 0-255, using the same bits as
the ASCII table: 0x0c for decimal digits, 0x08 for hex-only digits. */
308 nigel 77 static const unsigned char digitab[] =
309     {
310     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
311     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
312     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
313     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
314     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
315     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
316     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
317     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
318     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
319     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
320     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
321 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
322 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
323     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
324     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
326     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
334     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
340     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
341     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
342    
/* EBCDIC-only table, one entry per character value 0-255. check_escape()
treats a character as alphameric when (ebcdic_chartab[c] & 0x0E) is non-zero.
NOTE(review): the individual bit meanings presumably match the ctype_xxx
values used for the main character tables - confirm in pcre_internal.h. */
343     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
344     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
345     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
346     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
348     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
352     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
353     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
355 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
356 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
357     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
360     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
361     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
362     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
363     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
364     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
365     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
366     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
367     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
368     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
369     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
370     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
371     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
372     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
373     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
374     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
375     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
376     #endif
377    
378    
379     /* Definition to allow mutual recursion */
380    
/* Forward declaration only. No parameter names are given here, so the meaning
of each argument has to be read from the definition of compile_regex()
(presumably further down this file). */
381     static BOOL
382 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
383 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
384 nigel 77
385    
386    
387     /*************************************************
388     * Handle escapes *
389     *************************************************/
390    
391     /* This function is called when a \ has been encountered. It either returns a
392     positive value for a simple escape such as \n, or a negative value which
393 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
394     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
395     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
396     ptr is pointing at the \. On exit, it is on the final character of the escape
397     sequence.
398 nigel 77
399     Arguments:
400     ptrptr points to the pattern position pointer
401     errorcodeptr points to the errorcode variable
402     bracount number of previous extracting brackets
403     options the options bits
404     isclass TRUE if inside a character class
405    
406     Returns: zero or positive => a data character
407     negative => a special escape sequence
408     on error, *errorcodeptr is set
409     */
410    
411     static int
412     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
413     int options, BOOL isclass)
414     {
415 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
416     const uschar *ptr = *ptrptr + 1;
/* Note that i has no initializer: it is assigned only by the escapes[] lookup
below. The switch is reached only when that lookup yielded 0, and the
"i++ < 2" loops in the '0' and 'x' cases depend on i being 0 at that point. */
417 nigel 77 int c, i;
418    
419 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
420     ptr--; /* Set pointer back to the last byte */
421    
422 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
423    
424     if (c == 0) *errorcodeptr = ERR1;
425    
426     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
427     a table. A non-zero result is something that can be returned immediately.
428     Otherwise further processing may be required. */
429    
430 ph10 97 #ifndef EBCDIC /* ASCII coding */
431 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
432     else if ((i = escapes[c - '0']) != 0) c = i;
433    
434 ph10 97 #else /* EBCDIC coding */
435 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
436     else if ((i = escapes[c - 0x48]) != 0) c = i;
437     #endif
438    
439     /* Escapes that need further processing, or are illegal. */
440    
441     else
442     {
443     const uschar *oldptr;
444 nigel 93 BOOL braced, negated;
445    
446 nigel 77 switch (c)
447     {
448     /* A number of Perl escapes are not handled by PCRE. We give an explicit
449     error. */
450    
451     case 'l':
452     case 'L':
453     case 'N':
454     case 'u':
455     case 'U':
456     *errorcodeptr = ERR37;
457     break;
458    
459 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
460     is an absolute backreference. If negative, it is a relative backreference.
461 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
462     reference to a named group. This is part of Perl's movement towards a
463     unified syntax for back references. As this is synonymous with \k{name}, we
464 ph10 171 fudge it up by pretending it really was \k. */
465 nigel 93
466     case 'g':
467     if (ptr[1] == '{')
468     {
469 ph10 171 const uschar *p;
/* Scan the braced text: anything other than digits and '-' before the
closing brace means it is treated as a name. */
470     for (p = ptr+2; *p != 0 && *p != '}'; p++)
471     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
472 ph10 172 if (*p != 0 && *p != '}')
473 ph10 171 {
474     c = -ESC_k;
475     break;
476 ph10 172 }
477 nigel 93 braced = TRUE;
478     ptr++;
479     }
480     else braced = FALSE;
481    
482     if (ptr[1] == '-')
483     {
484     negated = TRUE;
485     ptr++;
486     }
487     else negated = FALSE;
488    
/* NOTE(review): the decimal accumulation below has no overflow check; a very
long digit string can overflow the signed int c. */
489     c = 0;
490     while ((digitab[ptr[1]] & ctype_digit) != 0)
491     c = c * 10 + *(++ptr) - '0';
492    
493     if (c == 0 || (braced && *(++ptr) != '}'))
494     {
495     *errorcodeptr = ERR57;
496     return 0;
497     }
498    
/* A relative reference -n is converted to an absolute group number counted
back from bracount. */
499     if (negated)
500     {
501     if (c > bracount)
502     {
503     *errorcodeptr = ERR15;
504     return 0;
505     }
506     c = bracount - (c - 1);
507     }
508    
509     c = -(ESC_REF + c);
510     break;
511    
512 nigel 77 /* The handling of escape sequences consisting of a string of digits
513     starting with one that is not zero is not straightforward. By experiment,
514     the way Perl works seems to be as follows:
515    
516     Outside a character class, the digits are read as a decimal number. If the
517     number is less than 10, or if there are that many previous extracting
518     left brackets, then it is a back reference. Otherwise, up to three octal
519     digits are read to form an escaped byte. Thus \123 is likely to be octal
520     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
521     value is greater than 377, the least significant 8 bits are taken. Inside a
522     character class, \ followed by a digit is always an octal number. */
523    
524     case '1': case '2': case '3': case '4': case '5':
525     case '6': case '7': case '8': case '9':
526    
527     if (!isclass)
528     {
529     oldptr = ptr;
530     c -= '0';
/* NOTE(review): as for \g, this decimal accumulation has no overflow check. */
531     while ((digitab[ptr[1]] & ctype_digit) != 0)
532     c = c * 10 + *(++ptr) - '0';
533     if (c < 10 || c <= bracount)
534     {
535     c = -(ESC_REF + c);
536     break;
537     }
538     ptr = oldptr; /* Put the pointer back and fall through */
539     }
540    
541     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
542     generates a binary zero byte and treats the digit as a following literal.
543     Thus we have to pull back the pointer by one. */
544    
545     if ((c = *ptr) >= '8')
546     {
547     ptr--;
548     c = 0;
549     break;
550     }
551    
552     /* \0 always starts an octal number, but we may drop through to here with a
553 nigel 91 larger first octal digit. The original code used just to take the least
554     significant 8 bits of octal numbers (I think this is what early Perls used
555     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
556     than 3 octal digits. */
557 nigel 77
558     case '0':
559     c -= '0';
560     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
561     c = c * 8 + *(++ptr) - '0';
562 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
563 nigel 77 break;
564    
565 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
566     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
567     treated as a data character. */
568 nigel 77
569     case 'x':
570 nigel 87 if (ptr[1] == '{')
571 nigel 77 {
572     const uschar *pt = ptr + 2;
573 nigel 87 int count = 0;
574    
/* NOTE(review): the number of hex digits is not bounded inside this loop
(count is only checked once the closing brace has been seen), so (c << 4) can
overflow a signed int for long inputs; the later "c < 0" test appears to rely
on the value wrapping round. */
575 nigel 77 c = 0;
576     while ((digitab[*pt] & ctype_xdigit) != 0)
577     {
578 nigel 87 register int cc = *pt++;
579     if (c == 0 && cc == '0') continue; /* Leading zeroes */
580 nigel 77 count++;
581 nigel 87
582 ph10 97 #ifndef EBCDIC /* ASCII coding */
583 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
584 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
585 ph10 97 #else /* EBCDIC coding */
586 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
587 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
588 nigel 77 #endif
589     }
590 nigel 87
591 nigel 77 if (*pt == '}')
592     {
593 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
594 nigel 77 ptr = pt;
595     break;
596     }
597 nigel 87
598 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
599     recognize this construct; fall through to the normal \x handling. */
600     }
601    
602 nigel 87 /* Read just a single-byte hex-defined char */
603 nigel 77
604     c = 0;
605     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
606     {
607     int cc; /* Some compilers don't like ++ */
608     cc = *(++ptr); /* in initializers */
609 ph10 97 #ifndef EBCDIC /* ASCII coding */
610 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
611     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
612 ph10 97 #else /* EBCDIC coding */
613 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
614     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
615     #endif
616     }
617     break;
618    
619 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
620     This coding is ASCII-specific, but then the whole concept of \cx is
621     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
622 nigel 77
623     case 'c':
624     c = *(++ptr);
625     if (c == 0)
626     {
627     *errorcodeptr = ERR2;
628     return 0;
629     }
630    
631 ph10 97 #ifndef EBCDIC /* ASCII coding */
632 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
633     c ^= 0x40;
634 ph10 97 #else /* EBCDIC coding */
635 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
636     c ^= 0xC0;
637     #endif
638     break;
639    
640     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
641     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
642     for Perl compatibility, it is a literal. This code looks a bit odd, but
643     there used to be some cases other than the default, and there may be again
644     in future, so I haven't "optimized" it. */
645    
646     default:
647     if ((options & PCRE_EXTRA) != 0) switch(c)
648     {
649     default:
650     *errorcodeptr = ERR3;
651     break;
652     }
653     break;
654     }
655     }
656    
/* Normal exit: report the final position back to the caller. The two early
"return 0" error exits above leave *ptrptr unchanged. */
657     *ptrptr = ptr;
658     return c;
659     }
660    
661    
662    
663     #ifdef SUPPORT_UCP
664     /*************************************************
665     * Handle \P and \p *
666     *************************************************/
667    
668     /* This function is called after \P or \p has been encountered, provided that
669     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
670     pointing at the P or p. On exit, it is pointing at the final character of the
671     escape sequence.
672    
673     Arguments:
674     ptrptr points to the pattern position pointer
675     negptr points to a boolean that is set TRUE for negation else FALSE
676 nigel 87 dptr points to an int that is set to the detailed property value
677 nigel 77 errorcodeptr points to the error code variable
678    
679 nigel 87 Returns: type value from the _pcre_utt table, or -1 on error (malformed sequence or unknown property name)
680 nigel 77 */
681    
682     static int
683 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
684 nigel 77 {
685     int c, i, bot, top;
686     const uschar *ptr = *ptrptr;
687 nigel 87 char name[32];
688 nigel 77
689     c = *(++ptr);
690     if (c == 0) goto ERROR_RETURN;
691    
692     *negptr = FALSE;
693    
694 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
695     negation. */
696 nigel 77
697     if (c == '{')
698     {
699     if (ptr[1] == '^')
700     {
701     *negptr = TRUE;
702     ptr++;
703     }
704 nigel 87 for (i = 0; i < sizeof(name) - 1; i++)
705 nigel 77 {
706     c = *(++ptr);
707     if (c == 0) goto ERROR_RETURN;
708     if (c == '}') break;
709     name[i] = c;
710     }
711 nigel 87 if (c !='}') goto ERROR_RETURN;
712 nigel 77 name[i] = 0;
713     }
714    
715     /* Otherwise there is just one following character */
716    
717     else
718     {
719     name[0] = c;
720     name[1] = 0;
721     }
722    
723     *ptrptr = ptr;
724    
725     /* Search for a recognized property name using binary chop */
726    
727     bot = 0;
728     top = _pcre_utt_size;
729    
730     while (bot < top)
731     {
732 nigel 87 i = (bot + top) >> 1;
733 nigel 77 c = strcmp(name, _pcre_utt[i].name);
734 nigel 87 if (c == 0)
735     {
736     *dptr = _pcre_utt[i].value;
737     return _pcre_utt[i].type;
738     }
739 nigel 77 if (c > 0) bot = i + 1; else top = i;
740     }
741    
742     *errorcodeptr = ERR47;
743     *ptrptr = ptr;
744     return -1;
745    
746     ERROR_RETURN:
747     *errorcodeptr = ERR46;
748     *ptrptr = ptr;
749     return -1;
750     }
751     #endif
752    
753    
754    
755    
756     /*************************************************
757     * Check for counted repeat *
758     *************************************************/
759    
760     /* This function is called when a '{' is encountered in a place where it might
761     start a quantifier. It looks ahead to see if it really is a quantifier or not.
762     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
763     where the ddds are digits.
764    
765     Arguments:
766     p pointer to the first char after '{'
767    
768     Returns: TRUE or FALSE
769     */
770    
771     static BOOL
772     is_counted_repeat(const uschar *p)
773     {
774     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
775     while ((digitab[*p] & ctype_digit) != 0) p++;
776     if (*p == '}') return TRUE;
777    
778     if (*p++ != ',') return FALSE;
779     if (*p == '}') return TRUE;
780    
781     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
782     while ((digitab[*p] & ctype_digit) != 0) p++;
783    
784     return (*p == '}');
785     }
786    
787    
788    
789     /*************************************************
790     * Read repeat counts *
791     *************************************************/
792    
793     /* Read an item of the form {n,m} and return the values. This is called only
794     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
795     so the syntax is guaranteed to be correct, but we need to check the values.
796    
797     Arguments:
798     p pointer to first char after '{'
799     minp pointer to int for min
800     maxp pointer to int for max
801     returned as -1 if no max
802     errorcodeptr points to error code variable
803    
804     Returns: pointer to '}' on success;
805     current ptr on error, with errorcodeptr set non-zero
806     */
807    
808     static const uschar *
809     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
810     {
811     int min = 0;
812     int max = -1;
813    
814 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
815     an integer overflow. */
816    
817 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
818 nigel 81 if (min < 0 || min > 65535)
819     {
820     *errorcodeptr = ERR5;
821     return p;
822     }
823 nigel 77
824 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
825     Also, max must not be less than min. */
826    
827 nigel 77 if (*p == '}') max = min; else
828     {
829     if (*(++p) != '}')
830     {
831     max = 0;
832     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
833 nigel 81 if (max < 0 || max > 65535)
834     {
835     *errorcodeptr = ERR5;
836     return p;
837     }
838 nigel 77 if (max < min)
839     {
840     *errorcodeptr = ERR4;
841     return p;
842     }
843     }
844     }
845    
846 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
847     '}'. */
848 nigel 77
849 nigel 81 *minp = min;
850     *maxp = max;
851 nigel 77 return p;
852     }
853    
854    
855    
856     /*************************************************
857 nigel 93 * Find forward referenced subpattern *
858 nigel 91 *************************************************/
859    
860 nigel 93 /* This function scans along a pattern's text looking for capturing
861     subpatterns, and counting them. If it finds a named pattern that matches the
862     name it is given, it returns its number. Alternatively, if the name is NULL, it
863     returns when it reaches a given numbered subpattern. This is used for forward
864     references to subpatterns. We know that if (?P< is encountered, the name will
865     be terminated by '>' because that is checked in the first pass.
866 nigel 91
867     Arguments:
868 nigel 93 ptr current position in the pattern
869     count current count of capturing parens so far encountered
870     name name to seek, or NULL if seeking a numbered subpattern
871     lorn name length, or subpattern number if name is NULL
872     xmode TRUE if we are in /x mode
873 nigel 91
874     Returns: the number of the named subpattern, or -1 if not found
875     */
876    
877     static int
878 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
879     BOOL xmode)
880 nigel 91 {
881     const uschar *thisname;
882 nigel 93
883 nigel 91 for (; *ptr != 0; ptr++)
884     {
885 nigel 93 int term;
886    
887     /* Skip over backslashed characters and also entire \Q...\E */
888    
889     if (*ptr == '\\')
890     {
891     if (*(++ptr) == 0) return -1;
892     if (*ptr == 'Q') for (;;)
893     {
894     while (*(++ptr) != 0 && *ptr != '\\');
895     if (*ptr == 0) return -1;
896     if (*(++ptr) == 'E') break;
897     }
898     continue;
899     }
900    
901     /* Skip over character classes */
902    
903     if (*ptr == '[')
904     {
905     while (*(++ptr) != ']')
906     {
907     if (*ptr == '\\')
908     {
909     if (*(++ptr) == 0) return -1;
910     if (*ptr == 'Q') for (;;)
911     {
912     while (*(++ptr) != 0 && *ptr != '\\');
913     if (*ptr == 0) return -1;
914     if (*(++ptr) == 'E') break;
915     }
916     continue;
917     }
918     }
919     continue;
920     }
921    
922     /* Skip comments in /x mode */
923    
924     if (xmode && *ptr == '#')
925     {
926     while (*(++ptr) != 0 && *ptr != '\n');
927     if (*ptr == 0) return -1;
928     continue;
929     }
930    
931     /* An opening parens must now be a real metacharacter */
932    
933 nigel 91 if (*ptr != '(') continue;
934 nigel 93 if (ptr[1] != '?')
935     {
936     count++;
937     if (name == NULL && count == lorn) return count;
938     continue;
939     }
940    
941     ptr += 2;
942     if (*ptr == 'P') ptr++; /* Allow optional P */
943    
944     /* We have to disambiguate (?<! and (?<= from (?<name> */
945    
946     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
947     *ptr != '\'')
948     continue;
949    
950 nigel 91 count++;
951 nigel 93
952     if (name == NULL && count == lorn) return count;
953     term = *ptr++;
954     if (term == '<') term = '>';
955 nigel 91 thisname = ptr;
956 nigel 93 while (*ptr != term) ptr++;
957     if (name != NULL && lorn == ptr - thisname &&
958     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
959 nigel 91 return count;
960     }
961 nigel 93
962 nigel 91 return -1;
963     }
964    
965    
966    
967     /*************************************************
968 nigel 77 * Find first significant op code *
969     *************************************************/
970    
971     /* This is called by several functions that scan a compiled expression looking
972     for a fixed first character, or an anchoring op code etc. It skips over things
973     that do not influence this. For some calls, a change of option is important.
974     For some calls, it makes sense to skip negative forward and all backward
975     assertions, and also the \b assertion; for others it does not.
976    
977     Arguments:
978     code pointer to the start of the group
979     options pointer to external options
980     optbit the option bit whose changing is significant, or
981     zero if none are
982     skipassert TRUE if certain assertions are to be skipped
983    
984     Returns: pointer to the first significant opcode
985     */
986    
987     static const uschar*
988     first_significant_code(const uschar *code, int *options, int optbit,
989     BOOL skipassert)
990     {
991     for (;;)
992     {
993     switch ((int)*code)
994     {
995     case OP_OPT:
996     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
997     *options = (int)code[1];
998     code += 2;
999     break;
1000    
1001     case OP_ASSERT_NOT:
1002     case OP_ASSERTBACK:
1003     case OP_ASSERTBACK_NOT:
1004     if (!skipassert) return code;
1005     do code += GET(code, 1); while (*code == OP_ALT);
1006     code += _pcre_OP_lengths[*code];
1007     break;
1008    
1009     case OP_WORD_BOUNDARY:
1010     case OP_NOT_WORD_BOUNDARY:
1011     if (!skipassert) return code;
1012     /* Fall through */
1013    
1014     case OP_CALLOUT:
1015     case OP_CREF:
1016 nigel 93 case OP_RREF:
1017     case OP_DEF:
1018 nigel 77 code += _pcre_OP_lengths[*code];
1019     break;
1020    
1021     default:
1022     return code;
1023     }
1024     }
1025     /* Control never reaches here */
1026     }
1027    
1028    
1029    
1030    
1031     /*************************************************
1032     * Find the fixed length of a pattern *
1033     *************************************************/
1034    
1035     /* Scan a pattern and compute the fixed length of subject that will match it,
1036     if the length is fixed. This is needed for dealing with backward assertions.
1037     In UTF8 mode, the result is in characters rather than bytes.
1038    
1039     Arguments:
1040     code points to the start of the pattern (the bracket)
1041     options the compiling options
1042    
1043     Returns: the fixed length, or -1 if there is no fixed length,
1044     or -2 if \C was encountered
1045     */
1046    
1047     static int
1048     find_fixedlength(uschar *code, int options)
1049     {
1050     int length = -1;
1051    
1052     register int branchlength = 0;
1053     register uschar *cc = code + 1 + LINK_SIZE;
1054    
1055     /* Scan along the opcodes for this branch. If we get to the end of the
1056     branch, check the length against that of the other branches. */
1057    
1058     for (;;)
1059     {
1060     int d;
1061     register int op = *cc;
1062    
1063     switch (op)
1064     {
1065 nigel 93 case OP_CBRA:
1066 nigel 77 case OP_BRA:
1067     case OP_ONCE:
1068     case OP_COND:
1069 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1070 nigel 77 if (d < 0) return d;
1071     branchlength += d;
1072     do cc += GET(cc, 1); while (*cc == OP_ALT);
1073     cc += 1 + LINK_SIZE;
1074     break;
1075    
1076     /* Reached end of a branch; if it's a ket it is the end of a nested
1077     call. If it's ALT it is an alternation in a nested call. If it is
1078     END it's the end of the outer call. All can be handled by the same code. */
1079    
1080     case OP_ALT:
1081     case OP_KET:
1082     case OP_KETRMAX:
1083     case OP_KETRMIN:
1084     case OP_END:
1085     if (length < 0) length = branchlength;
1086     else if (length != branchlength) return -1;
1087     if (*cc != OP_ALT) return length;
1088     cc += 1 + LINK_SIZE;
1089     branchlength = 0;
1090     break;
1091    
1092     /* Skip over assertive subpatterns */
1093    
1094     case OP_ASSERT:
1095     case OP_ASSERT_NOT:
1096     case OP_ASSERTBACK:
1097     case OP_ASSERTBACK_NOT:
1098     do cc += GET(cc, 1); while (*cc == OP_ALT);
1099     /* Fall through */
1100    
1101     /* Skip over things that don't match chars */
1102    
1103     case OP_REVERSE:
1104     case OP_CREF:
1105 nigel 93 case OP_RREF:
1106     case OP_DEF:
1107 nigel 77 case OP_OPT:
1108     case OP_CALLOUT:
1109     case OP_SOD:
1110     case OP_SOM:
1111     case OP_EOD:
1112     case OP_EODN:
1113     case OP_CIRC:
1114     case OP_DOLL:
1115     case OP_NOT_WORD_BOUNDARY:
1116     case OP_WORD_BOUNDARY:
1117     cc += _pcre_OP_lengths[*cc];
1118     break;
1119    
1120     /* Handle literal characters */
1121    
1122     case OP_CHAR:
1123     case OP_CHARNC:
1124 nigel 91 case OP_NOT:
1125 nigel 77 branchlength++;
1126     cc += 2;
1127     #ifdef SUPPORT_UTF8
1128     if ((options & PCRE_UTF8) != 0)
1129     {
1130     while ((*cc & 0xc0) == 0x80) cc++;
1131     }
1132     #endif
1133     break;
1134    
1135     /* Handle exact repetitions. The count is already in characters, but we
1136     need to skip over a multibyte character in UTF8 mode. */
1137    
1138     case OP_EXACT:
1139     branchlength += GET2(cc,1);
1140     cc += 4;
1141     #ifdef SUPPORT_UTF8
1142     if ((options & PCRE_UTF8) != 0)
1143     {
1144     while((*cc & 0x80) == 0x80) cc++;
1145     }
1146     #endif
1147     break;
1148    
1149     case OP_TYPEEXACT:
1150     branchlength += GET2(cc,1);
1151     cc += 4;
1152     break;
1153    
1154     /* Handle single-char matchers */
1155    
1156     case OP_PROP:
1157     case OP_NOTPROP:
1158 nigel 87 cc += 2;
1159 nigel 77 /* Fall through */
1160    
1161     case OP_NOT_DIGIT:
1162     case OP_DIGIT:
1163     case OP_NOT_WHITESPACE:
1164     case OP_WHITESPACE:
1165     case OP_NOT_WORDCHAR:
1166     case OP_WORDCHAR:
1167     case OP_ANY:
1168     branchlength++;
1169     cc++;
1170     break;
1171    
1172     /* The single-byte matcher isn't allowed */
1173    
1174     case OP_ANYBYTE:
1175     return -2;
1176    
1177     /* Check a class for variable quantification */
1178    
1179     #ifdef SUPPORT_UTF8
1180     case OP_XCLASS:
1181     cc += GET(cc, 1) - 33;
1182     /* Fall through */
1183     #endif
1184    
1185     case OP_CLASS:
1186     case OP_NCLASS:
1187     cc += 33;
1188    
1189     switch (*cc)
1190     {
1191     case OP_CRSTAR:
1192     case OP_CRMINSTAR:
1193     case OP_CRQUERY:
1194     case OP_CRMINQUERY:
1195     return -1;
1196    
1197     case OP_CRRANGE:
1198     case OP_CRMINRANGE:
1199     if (GET2(cc,1) != GET2(cc,3)) return -1;
1200     branchlength += GET2(cc,1);
1201     cc += 5;
1202     break;
1203    
1204     default:
1205     branchlength++;
1206     }
1207     break;
1208    
1209     /* Anything else is variable length */
1210    
1211     default:
1212     return -1;
1213     }
1214     }
1215     /* Control never gets here */
1216     }
1217    
1218    
1219    
1220    
1221     /*************************************************
1222     * Scan compiled regex for numbered bracket *
1223     *************************************************/
1224    
1225     /* This little function scans through a compiled pattern until it finds a
1226     capturing bracket with the given number.
1227    
1228     Arguments:
1229     code points to start of expression
1230     utf8 TRUE in UTF-8 mode
1231     number the required bracket number
1232    
1233     Returns: pointer to the opcode for the bracket, or NULL if not found
1234     */
1235    
1236     static const uschar *
1237     find_bracket(const uschar *code, BOOL utf8, int number)
1238     {
1239     for (;;)
1240     {
1241     register int c = *code;
1242     if (c == OP_END) return NULL;
1243 nigel 91
1244     /* XCLASS is used for classes that cannot be represented just by a bit
1245     map. This includes negated single high-valued characters. The length in
1246     the table is zero; the actual length is stored in the compiled code. */
1247    
1248     if (c == OP_XCLASS) code += GET(code, 1);
1249    
1250 nigel 93 /* Handle capturing bracket */
1251 nigel 91
1252 nigel 93 else if (c == OP_CBRA)
1253 nigel 77 {
1254 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1255 nigel 77 if (n == number) return (uschar *)code;
1256 nigel 93 code += _pcre_OP_lengths[c];
1257 nigel 77 }
1258 nigel 91
1259 nigel 93 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1260     a multi-byte character. The length in the table is a minimum, so we have to
1261     arrange to skip the extra bytes. */
1262 nigel 91
1263 nigel 77 else
1264     {
1265     code += _pcre_OP_lengths[c];
1266 ph10 107 #ifdef SUPPORT_UTF8
1267 nigel 77 if (utf8) switch(c)
1268     {
1269     case OP_CHAR:
1270     case OP_CHARNC:
1271     case OP_EXACT:
1272     case OP_UPTO:
1273     case OP_MINUPTO:
1274 nigel 93 case OP_POSUPTO:
1275 nigel 77 case OP_STAR:
1276     case OP_MINSTAR:
1277 nigel 93 case OP_POSSTAR:
1278 nigel 77 case OP_PLUS:
1279     case OP_MINPLUS:
1280 nigel 93 case OP_POSPLUS:
1281 nigel 77 case OP_QUERY:
1282     case OP_MINQUERY:
1283 nigel 93 case OP_POSQUERY:
1284     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1285 nigel 77 break;
1286     }
1287 ph10 111 #endif
1288 nigel 77 }
1289     }
1290     }
1291    
1292    
1293    
1294     /*************************************************
1295     * Scan compiled regex for recursion reference *
1296     *************************************************/
1297    
1298     /* This little function scans through a compiled pattern until it finds an
1299     instance of OP_RECURSE.
1300    
1301     Arguments:
1302     code points to start of expression
1303     utf8 TRUE in UTF-8 mode
1304    
1305     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1306     */
1307    
1308     static const uschar *
1309     find_recurse(const uschar *code, BOOL utf8)
1310     {
1311     for (;;)
1312     {
1313     register int c = *code;
1314     if (c == OP_END) return NULL;
1315 nigel 91 if (c == OP_RECURSE) return code;
1316    
1317     /* XCLASS is used for classes that cannot be represented just by a bit
1318     map. This includes negated single high-valued characters. The length in
1319     the table is zero; the actual length is stored in the compiled code. */
1320    
1321     if (c == OP_XCLASS) code += GET(code, 1);
1322    
1323     /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1324     that are followed by a character may be followed by a multi-byte character.
1325 nigel 93 The length in the table is a minimum, so we have to arrange to skip the extra
1326     bytes. */
1327 nigel 91
1328 nigel 77 else
1329     {
1330     code += _pcre_OP_lengths[c];
1331 ph10 107 #ifdef SUPPORT_UTF8
1332 nigel 77 if (utf8) switch(c)
1333     {
1334     case OP_CHAR:
1335     case OP_CHARNC:
1336     case OP_EXACT:
1337     case OP_UPTO:
1338     case OP_MINUPTO:
1339 nigel 93 case OP_POSUPTO:
1340 nigel 77 case OP_STAR:
1341     case OP_MINSTAR:
1342 nigel 93 case OP_POSSTAR:
1343 nigel 77 case OP_PLUS:
1344     case OP_MINPLUS:
1345 nigel 93 case OP_POSPLUS:
1346 nigel 77 case OP_QUERY:
1347     case OP_MINQUERY:
1348 nigel 93 case OP_POSQUERY:
1349     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1350 nigel 77 break;
1351     }
1352 ph10 111 #endif
1353 nigel 77 }
1354     }
1355     }
1356    
1357    
1358    
1359     /*************************************************
1360     * Scan compiled branch for non-emptiness *
1361     *************************************************/
1362    
1363     /* This function scans through a branch of a compiled pattern to see whether it
1364 nigel 93 can match the empty string or not. It is called from could_be_empty()
1365     below and from compile_branch() when checking for an unlimited repeat of a
1366     group that can match nothing. Note that first_significant_code() skips over
1367     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1368     struck an inner bracket whose current branch will already have been scanned.
1369 nigel 77
1370     Arguments:
1371     code points to start of search
1372     endcode points to where to stop
1373     utf8 TRUE if in UTF8 mode
1374    
1375     Returns: TRUE if what is matched could be empty
1376     */
1377    
1378     static BOOL
1379     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1380     {
1381     register int c;
1382 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1383 nigel 77 code < endcode;
1384     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1385     {
1386     const uschar *ccode;
1387    
1388     c = *code;
1389 ph10 172
1390 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1391 nigel 77
1392 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1393     {
1394 ph10 172 code += _pcre_OP_lengths[c];
1395 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1396     c = *code;
1397     continue;
1398     }
1399    
1400     /* For other groups, scan the branches. */
1401 ph10 172
1402 nigel 93 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1403 nigel 77 {
1404     BOOL empty_branch;
1405     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1406    
1407     /* Scan a closed bracket */
1408    
1409     empty_branch = FALSE;
1410     do
1411     {
1412     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1413     empty_branch = TRUE;
1414     code += GET(code, 1);
1415     }
1416     while (*code == OP_ALT);
1417     if (!empty_branch) return FALSE; /* All branches are non-empty */
1418 ph10 172 c = *code;
1419 nigel 93 continue;
1420 nigel 77 }
1421    
1422 nigel 93 /* Handle the other opcodes */
1423    
1424     switch (c)
1425 nigel 77 {
1426     /* Check for quantifiers after a class */
1427    
1428     #ifdef SUPPORT_UTF8
1429     case OP_XCLASS:
1430     ccode = code + GET(code, 1);
1431     goto CHECK_CLASS_REPEAT;
1432     #endif
1433    
1434     case OP_CLASS:
1435     case OP_NCLASS:
1436     ccode = code + 33;
1437    
1438     #ifdef SUPPORT_UTF8
1439     CHECK_CLASS_REPEAT:
1440     #endif
1441    
1442     switch (*ccode)
1443     {
1444     case OP_CRSTAR: /* These could be empty; continue */
1445     case OP_CRMINSTAR:
1446     case OP_CRQUERY:
1447     case OP_CRMINQUERY:
1448     break;
1449    
1450     default: /* Non-repeat => class must match */
1451     case OP_CRPLUS: /* These repeats aren't empty */
1452     case OP_CRMINPLUS:
1453     return FALSE;
1454    
1455     case OP_CRRANGE:
1456     case OP_CRMINRANGE:
1457     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1458     break;
1459     }
1460     break;
1461    
1462     /* Opcodes that must match a character */
1463    
1464     case OP_PROP:
1465     case OP_NOTPROP:
1466     case OP_EXTUNI:
1467     case OP_NOT_DIGIT:
1468     case OP_DIGIT:
1469     case OP_NOT_WHITESPACE:
1470     case OP_WHITESPACE:
1471     case OP_NOT_WORDCHAR:
1472     case OP_WORDCHAR:
1473     case OP_ANY:
1474     case OP_ANYBYTE:
1475     case OP_CHAR:
1476     case OP_CHARNC:
1477     case OP_NOT:
1478     case OP_PLUS:
1479     case OP_MINPLUS:
1480 nigel 93 case OP_POSPLUS:
1481 nigel 77 case OP_EXACT:
1482     case OP_NOTPLUS:
1483     case OP_NOTMINPLUS:
1484 nigel 93 case OP_NOTPOSPLUS:
1485 nigel 77 case OP_NOTEXACT:
1486     case OP_TYPEPLUS:
1487     case OP_TYPEMINPLUS:
1488 nigel 93 case OP_TYPEPOSPLUS:
1489 nigel 77 case OP_TYPEEXACT:
1490     return FALSE;
1491    
1492     /* End of branch */
1493    
1494     case OP_KET:
1495     case OP_KETRMAX:
1496     case OP_KETRMIN:
1497     case OP_ALT:
1498     return TRUE;
1499    
1500 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1501     MINUPTO, and POSUPTO may be followed by a multibyte character */
1502 nigel 77
1503     #ifdef SUPPORT_UTF8
1504     case OP_STAR:
1505     case OP_MINSTAR:
1506 nigel 93 case OP_POSSTAR:
1507 nigel 77 case OP_QUERY:
1508     case OP_MINQUERY:
1509 nigel 93 case OP_POSQUERY:
1510 nigel 77 case OP_UPTO:
1511     case OP_MINUPTO:
1512 nigel 93 case OP_POSUPTO:
1513 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1514     break;
1515     #endif
1516     }
1517     }
1518    
1519     return TRUE;
1520     }
1521    
1522    
1523    
1524     /*************************************************
1525     * Scan compiled regex for non-emptiness *
1526     *************************************************/
1527    
1528     /* This function is called to check for left recursive calls. We want to check
1529     the current branch of the current pattern to see if it could match the empty
1530     string. If it could, we must look outwards for branches at other levels,
1531     stopping when we pass beyond the bracket which is the subject of the recursion.
1532    
1533     Arguments:
1534     code points to start of the recursion
1535     endcode points to where to stop (current RECURSE item)
1536     bcptr points to the chain of current (unclosed) branch starts
1537     utf8 TRUE if in UTF-8 mode
1538    
1539     Returns: TRUE if what is matched could be empty
1540     */
1541    
1542     static BOOL
1543     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1544     BOOL utf8)
1545     {
1546     while (bcptr != NULL && bcptr->current >= code)
1547     {
1548     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1549     bcptr = bcptr->outer;
1550     }
1551     return TRUE;
1552     }
1553    
1554    
1555    
1556     /*************************************************
1557     * Check for POSIX class syntax *
1558     *************************************************/
1559    
1560     /* This function is called when the sequence "[:" or "[." or "[=" is
1561     encountered in a character class. It checks whether this is followed by an
1562     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1563     ".]" or "=]".
1564    
1565     Argument:
1566     ptr pointer to the initial [
1567     endptr where to return the end pointer
1568     cd pointer to compile data
1569    
1570     Returns: TRUE or FALSE
1571     */
1572    
1573     static BOOL
1574     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1575     {
1576     int terminator; /* Don't combine these lines; the Solaris cc */
1577     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1578     if (*(++ptr) == '^') ptr++;
1579     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1580     if (*ptr == terminator && ptr[1] == ']')
1581     {
1582     *endptr = ptr;
1583     return TRUE;
1584     }
1585     return FALSE;
1586     }
1587    
1588    
1589    
1590    
1591     /*************************************************
1592     * Check POSIX class name *
1593     *************************************************/
1594    
1595     /* This function is called to check the name given in a POSIX-style class entry
1596     such as [:alnum:].
1597    
1598     Arguments:
1599     ptr points to the first letter
1600     len the length of the name
1601    
1602     Returns: a value representing the name, or -1 if unknown
1603     */
1604    
1605     static int
1606     check_posix_name(const uschar *ptr, int len)
1607     {
1608     register int yield = 0;
1609     while (posix_name_lengths[yield] != 0)
1610     {
1611     if (len == posix_name_lengths[yield] &&
1612     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1613     yield++;
1614     }
1615     return -1;
1616     }
1617    
1618    
1619     /*************************************************
1620     * Adjust OP_RECURSE items in repeated group *
1621     *************************************************/
1622    
1623     /* OP_RECURSE items contain an offset from the start of the regex to the group
1624     that is referenced. This means that groups can be replicated for fixed
1625     repetition simply by copying (because the recursion is allowed to refer to
1626     earlier groups that are outside the current group). However, when a group is
1627     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1628     it, after it has been compiled. This means that any OP_RECURSE items within it
1629     that refer to the group itself or any contained groups have to have their
1630 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1631     the partially compiled regex must be temporarily terminated with OP_END.
1632 nigel 77
1633 nigel 93 This function has been extended with the possibility of forward references for
1634     recursions and subroutine calls. It must also check the list of such references
1635     for the group we are dealing with. If it finds that one of the recursions in
1636     the current group is on this list, it adjusts the offset in the list, not the
1637     value in the reference (which is a group number).
1638    
1639 nigel 77 Arguments:
1640     group points to the start of the group
1641     adjust the amount by which the group is to be moved
1642     utf8 TRUE in UTF-8 mode
1643     cd contains pointers to tables etc.
1644 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1645 nigel 77
1646     Returns: nothing
1647     */
1648    
1649     static void
1650 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1651     uschar *save_hwm)
1652 nigel 77 {
1653     uschar *ptr = group;
1654     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1655     {
1656 nigel 93 int offset;
1657     uschar *hc;
1658    
1659     /* See if this recursion is on the forward reference list. If so, adjust the
1660     reference. */
1661    
1662     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1663     {
1664     offset = GET(hc, 0);
1665     if (cd->start_code + offset == ptr + 1)
1666     {
1667     PUT(hc, 0, offset + adjust);
1668     break;
1669     }
1670     }
1671    
1672     /* Otherwise, adjust the recursion offset if it's after the start of this
1673     group. */
1674    
1675     if (hc >= cd->hwm)
1676     {
1677     offset = GET(ptr, 1);
1678     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1679     }
1680    
1681 nigel 77 ptr += 1 + LINK_SIZE;
1682     }
1683     }
1684    
1685    
1686    
1687     /*************************************************
1688     * Insert an automatic callout point *
1689     *************************************************/
1690    
1691     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1692     callout points before each pattern item.
1693    
1694     Arguments:
1695     code current code pointer
1696     ptr current pattern pointer
1697     cd pointers to tables etc
1698    
1699     Returns: new code pointer
1700     */
1701    
1702     static uschar *
1703     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1704     {
1705     *code++ = OP_CALLOUT;
1706     *code++ = 255;
1707     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1708     PUT(code, LINK_SIZE, 0); /* Default length */
1709     return code + 2*LINK_SIZE;
1710     }
1711    
1712    
1713    
1714     /*************************************************
1715     * Complete a callout item *
1716     *************************************************/
1717    
1718     /* A callout item contains the length of the next item in the pattern, which
1719     we can't fill in till after we have reached the relevant point. This is used
1720     for both automatic and manual callouts.
1721    
1722     Arguments:
1723     previous_callout points to previous callout item
1724     ptr current pattern pointer
1725     cd pointers to tables etc
1726    
1727     Returns: nothing
1728     */
1729    
1730     static void
1731     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1732     {
1733     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1734     PUT(previous_callout, 2 + LINK_SIZE, length);
1735     }
1736    
1737    
1738    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), this
function searches upwards through the characters for a run whose "other" case
values are consecutive. Each call yields the next such run and updates the
starting point, so that repeated calls work through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int oc = NOTACHAR;
unsigned int expect;

/* Find the first character in the range that has an other case. */

while (ch <= d)
  {
  oc = _pcre_ucp_othercase(ch);
  if (oc != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = oc;
expect = oc + 1;

/* Extend the run for as long as the other case values stay consecutive. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expect)
  {
  ch++;
  expect++;
  }

*odptr = expect - 1;
*cptr = ch;         /* Where the next call should resume */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1784    
1785    
1786 nigel 93
1787 nigel 77 /*************************************************
1788 nigel 93 * Check if auto-possessifying is possible *
1789     *************************************************/
1790    
1791     /* This function is called for unlimited repeats of certain items, to see
1792     whether the next thing could possibly match the repeated item. If not, it makes
1793     sense to automatically possessify the repeated item.
1794    
1795     Arguments:
1796     op_code the repeated op code
1797     this data for this item, depends on the opcode
1798     utf8 TRUE in UTF-8 mode
1799     utf8_char used for utf8 character bytes, NULL if not relevant
1800     ptr next character in pattern
1801     options options bits
1802     cd contains pointers to tables etc.
1803    
1804     Returns: TRUE if possessifying is wanted
1805     */
1806    
1807     static BOOL
1808     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1809     const uschar *ptr, int options, compile_data *cd)
1810     {
1811     int next;
1812    
1813     /* Skip whitespace and comments in extended mode */
1814    
1815     if ((options & PCRE_EXTENDED) != 0)
1816     {
1817     for (;;)
1818     {
1819     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1820     if (*ptr == '#')
1821     {
1822     while (*(++ptr) != 0)
1823     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1824     }
1825     else break;
1826     }
1827     }
1828    
1829     /* If the next item is one that we can handle, get its value. A non-negative
1830     value is a character, a negative value is an escape value. */
1831    
1832     if (*ptr == '\\')
1833     {
1834     int temperrorcode = 0;
1835     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1836     if (temperrorcode != 0) return FALSE;
1837     ptr++; /* Point after the escape sequence */
1838     }
1839    
1840     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1841     {
1842     #ifdef SUPPORT_UTF8
1843     if (utf8) { GETCHARINC(next, ptr); } else
1844     #endif
1845     next = *ptr++;
1846     }
1847    
1848     else return FALSE;
1849    
1850     /* Skip whitespace and comments in extended mode */
1851    
1852     if ((options & PCRE_EXTENDED) != 0)
1853     {
1854     for (;;)
1855     {
1856     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1857     if (*ptr == '#')
1858     {
1859     while (*(++ptr) != 0)
1860     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1861     }
1862     else break;
1863     }
1864     }
1865    
1866     /* If the next thing is itself optional, we have to give up. */
1867    
1868     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1869     return FALSE;
1870    
1871     /* Now compare the next item with the previous opcode. If the previous is a
1872     positive single character match, "item" either contains the character or, if
1873     "item" is greater than 127 in utf8 mode, the character's bytes are in
1874     utf8_char. */
1875    
1876    
1877     /* Handle cases when the next item is a character. */
1878    
1879     if (next >= 0) switch(op_code)
1880     {
1881     case OP_CHAR:
1882     #ifdef SUPPORT_UTF8
1883     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1884     #endif
1885     return item != next;
1886    
1887     /* For CHARNC (caseless character) we must check the other case. If we have
1888     Unicode property support, we can use it to test the other case of
1889     high-valued characters. */
1890    
1891     case OP_CHARNC:
1892     #ifdef SUPPORT_UTF8
1893     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1894     #endif
1895     if (item == next) return FALSE;
1896     #ifdef SUPPORT_UTF8
1897     if (utf8)
1898     {
1899     unsigned int othercase;
1900     if (next < 128) othercase = cd->fcc[next]; else
1901     #ifdef SUPPORT_UCP
1902     othercase = _pcre_ucp_othercase((unsigned int)next);
1903     #else
1904     othercase = NOTACHAR;
1905     #endif
1906     return (unsigned int)item != othercase;
1907     }
1908     else
1909     #endif /* SUPPORT_UTF8 */
1910     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1911    
1912     /* For OP_NOT, "item" must be a single-byte character. */
1913    
1914     case OP_NOT:
1915     if (next < 0) return FALSE; /* Not a character */
1916     if (item == next) return TRUE;
1917     if ((options & PCRE_CASELESS) == 0) return FALSE;
1918     #ifdef SUPPORT_UTF8
1919     if (utf8)
1920     {
1921     unsigned int othercase;
1922     if (next < 128) othercase = cd->fcc[next]; else
1923     #ifdef SUPPORT_UCP
1924     othercase = _pcre_ucp_othercase(next);
1925     #else
1926     othercase = NOTACHAR;
1927     #endif
1928     return (unsigned int)item == othercase;
1929     }
1930     else
1931     #endif /* SUPPORT_UTF8 */
1932     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
1933    
1934     case OP_DIGIT:
1935     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1936    
1937     case OP_NOT_DIGIT:
1938     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1939    
1940     case OP_WHITESPACE:
1941     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1942    
1943     case OP_NOT_WHITESPACE:
1944     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1945    
1946     case OP_WORDCHAR:
1947     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1948    
1949     case OP_NOT_WORDCHAR:
1950     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1951    
1952 ph10 180 case OP_HSPACE:
1953     case OP_NOT_HSPACE:
1954     switch(next)
1955     {
1956     case 0x09:
1957     case 0x20:
1958     case 0xa0:
1959     case 0x1680:
1960     case 0x180e:
1961     case 0x2000:
1962     case 0x2001:
1963     case 0x2002:
1964     case 0x2003:
1965     case 0x2004:
1966     case 0x2005:
1967     case 0x2006:
1968     case 0x2007:
1969     case 0x2008:
1970     case 0x2009:
1971     case 0x200A:
1972     case 0x202f:
1973     case 0x205f:
1974     case 0x3000:
1975     return op_code != OP_HSPACE;
1976     default:
1977     return op_code == OP_HSPACE;
1978     }
1979    
1980     case OP_VSPACE:
1981     case OP_NOT_VSPACE:
1982     switch(next)
1983     {
1984     case 0x0a:
1985     case 0x0b:
1986     case 0x0c:
1987     case 0x0d:
1988     case 0x85:
1989     case 0x2028:
1990     case 0x2029:
1991     return op_code != OP_VSPACE;
1992     default:
1993     return op_code == OP_VSPACE;
1994     }
1995    
1996 nigel 93 default:
1997     return FALSE;
1998     }
1999    
2000    
2001     /* Handle the case when the next item is \d, \s, etc. */
2002    
2003     switch(op_code)
2004     {
2005     case OP_CHAR:
2006     case OP_CHARNC:
2007     #ifdef SUPPORT_UTF8
2008     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2009     #endif
2010     switch(-next)
2011     {
2012     case ESC_d:
2013     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2014    
2015     case ESC_D:
2016     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2017    
2018     case ESC_s:
2019     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2020    
2021     case ESC_S:
2022     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2023    
2024     case ESC_w:
2025     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2026    
2027     case ESC_W:
2028     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2029 ph10 182
2030 ph10 180 case ESC_h:
2031     case ESC_H:
2032     switch(item)
2033     {
2034     case 0x09:
2035     case 0x20:
2036     case 0xa0:
2037     case 0x1680:
2038     case 0x180e:
2039     case 0x2000:
2040     case 0x2001:
2041     case 0x2002:
2042     case 0x2003:
2043     case 0x2004:
2044     case 0x2005:
2045     case 0x2006:
2046     case 0x2007:
2047     case 0x2008:
2048     case 0x2009:
2049     case 0x200A:
2050     case 0x202f:
2051     case 0x205f:
2052     case 0x3000:
2053     return -next != ESC_h;
2054     default:
2055     return -next == ESC_h;
2056 ph10 182 }
2057    
2058 ph10 180 case ESC_v:
2059     case ESC_V:
2060     switch(item)
2061     {
2062     case 0x0a:
2063     case 0x0b:
2064     case 0x0c:
2065     case 0x0d:
2066     case 0x85:
2067     case 0x2028:
2068     case 0x2029:
2069     return -next != ESC_v;
2070     default:
2071     return -next == ESC_v;
2072 ph10 182 }
2073 nigel 93
2074     default:
2075     return FALSE;
2076     }
2077    
2078     case OP_DIGIT:
2079 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2080     next == -ESC_h || next == -ESC_v;
2081 nigel 93
2082     case OP_NOT_DIGIT:
2083     return next == -ESC_d;
2084    
2085     case OP_WHITESPACE:
2086     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2087    
2088     case OP_NOT_WHITESPACE:
2089 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2090 nigel 93
2091 ph10 180 case OP_HSPACE:
2092     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2093    
2094     case OP_NOT_HSPACE:
2095     return next == -ESC_h;
2096 ph10 182
2097 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2098 ph10 182 case OP_VSPACE:
2099 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2100    
2101     case OP_NOT_VSPACE:
2102 ph10 182 return next == -ESC_v;
2103 ph10 180
2104 nigel 93 case OP_WORDCHAR:
2105 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2106 nigel 93
2107     case OP_NOT_WORDCHAR:
2108     return next == -ESC_w || next == -ESC_d;
2109 ph10 182
2110 nigel 93 default:
2111     return FALSE;
2112     }
2113    
2114     /* Control does not reach here */
2115     }
2116    
2117    
2118    
2119     /*************************************************
2120 nigel 77 * Compile one branch *
2121     *************************************************/
2122    
2123 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2124 nigel 77 changed during the branch, the pointer is used to change the external options
2125 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2126     to find out the amount of memory needed, as well as during the real compile
2127     phase. The value of lengthptr distinguishes the two phases.
2128 nigel 77
2129     Arguments:
2130     optionsptr pointer to the option bits
2131     codeptr points to the pointer to the current code point
2132     ptrptr points to the current pattern pointer
2133     errorcodeptr points to error code variable
2134     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2135     reqbyteptr set to the last literal character required, else < 0
2136     bcptr points to current branch chain
2137     cd contains pointers to tables etc.
2138 nigel 93 lengthptr NULL during the real compile phase
2139     points to length accumulator during pre-compile phase
2140 nigel 77
2141     Returns: TRUE on success
2142     FALSE, with *errorcodeptr set non-zero on error
2143     */
2144    
2145     static BOOL
2146 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2147     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2148     compile_data *cd, int *lengthptr)
2149 nigel 77 {
2150     int repeat_type, op_type;
2151     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2152     int bravalue = 0;
2153     int greedy_default, greedy_non_default;
2154     int firstbyte, reqbyte;
2155     int zeroreqbyte, zerofirstbyte;
2156     int req_caseopt, reqvary, tempreqvary;
2157     int options = *optionsptr;
2158     int after_manual_callout = 0;
2159 nigel 93 int length_prevgroup = 0;
2160 nigel 77 register int c;
2161     register uschar *code = *codeptr;
2162 nigel 93 uschar *last_code = code;
2163     uschar *orig_code = code;
2164 nigel 77 uschar *tempcode;
2165     BOOL inescq = FALSE;
2166     BOOL groupsetfirstbyte = FALSE;
2167     const uschar *ptr = *ptrptr;
2168     const uschar *tempptr;
2169     uschar *previous = NULL;
2170     uschar *previous_callout = NULL;
2171 nigel 93 uschar *save_hwm = NULL;
2172 nigel 77 uschar classbits[32];
2173    
2174     #ifdef SUPPORT_UTF8
2175     BOOL class_utf8;
2176     BOOL utf8 = (options & PCRE_UTF8) != 0;
2177     uschar *class_utf8data;
2178     uschar utf8_char[6];
2179     #else
2180     BOOL utf8 = FALSE;
2181 nigel 93 uschar *utf8_char = NULL;
2182 nigel 77 #endif
2183    
2184 nigel 93 #ifdef DEBUG
2185     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2186     #endif
2187    
2188 nigel 77 /* Set up the default and non-default settings for greediness */
2189    
2190     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2191     greedy_non_default = greedy_default ^ 1;
2192    
2193     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2194     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2195     matches a non-fixed char first char; reqbyte just remains unset if we never
2196     find one.
2197    
2198     When we hit a repeat whose minimum is zero, we may have to adjust these values
2199     to take the zero repeat into account. This is implemented by setting them to
2200     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2201     item types that can be repeated set these backoff variables appropriately. */
2202    
2203     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2204    
2205     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2206     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2207     value > 255. It is added into the firstbyte or reqbyte variables to record the
2208     case status of the value. This is used only for ASCII characters. */
2209    
2210     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2211    
2212     /* Switch on next character until the end of the branch */
2213    
2214     for (;; ptr++)
2215     {
2216     BOOL negate_class;
2217     BOOL possessive_quantifier;
2218     BOOL is_quantifier;
2219 nigel 93 BOOL is_recurse;
2220 ph10 180 BOOL reset_bracount;
2221 nigel 77 int class_charcount;
2222     int class_lastchar;
2223     int newoptions;
2224     int recno;
2225 ph10 172 int refsign;
2226 nigel 77 int skipbytes;
2227     int subreqbyte;
2228     int subfirstbyte;
2229 nigel 93 int terminator;
2230 nigel 77 int mclength;
2231     uschar mcbuffer[8];
2232    
2233 nigel 93 /* Get next byte in the pattern */
2234 nigel 77
2235     c = *ptr;
2236    
2237 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2238     previous cycle of this loop. */
2239    
2240     if (lengthptr != NULL)
2241     {
2242     #ifdef DEBUG
2243     if (code > cd->hwm) cd->hwm = code; /* High water info */
2244     #endif
2245     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2246     {
2247     *errorcodeptr = ERR52;
2248     goto FAILED;
2249     }
2250    
2251     /* There is at least one situation where code goes backwards: this is the
2252     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2253     the class is simply eliminated. However, it is created first, so we have to
2254     allow memory for it. Therefore, don't ever reduce the length at this point.
2255     */
2256    
2257     if (code < last_code) code = last_code;
2258     *lengthptr += code - last_code;
2259     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2260    
2261     /* If "previous" is set and it is not at the start of the work space, move
2262     it back to there, in order to avoid filling up the work space. Otherwise,
2263     if "previous" is NULL, reset the current code pointer to the start. */
2264    
2265     if (previous != NULL)
2266     {
2267     if (previous > orig_code)
2268     {
2269     memmove(orig_code, previous, code - previous);
2270     code -= previous - orig_code;
2271     previous = orig_code;
2272     }
2273     }
2274     else code = orig_code;
2275    
2276     /* Remember where this code item starts so we can pick up the length
2277     next time round. */
2278    
2279     last_code = code;
2280     }
2281    
2282     /* In the real compile phase, just check the workspace used by the forward
2283     reference list. */
2284    
2285     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2286     {
2287     *errorcodeptr = ERR52;
2288     goto FAILED;
2289     }
2290    
2291 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2292    
2293     if (inescq && c != 0)
2294     {
2295     if (c == '\\' && ptr[1] == 'E')
2296     {
2297     inescq = FALSE;
2298     ptr++;
2299     continue;
2300     }
2301     else
2302     {
2303     if (previous_callout != NULL)
2304     {
2305 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2306     complete_callout(previous_callout, ptr, cd);
2307 nigel 77 previous_callout = NULL;
2308     }
2309     if ((options & PCRE_AUTO_CALLOUT) != 0)
2310     {
2311     previous_callout = code;
2312     code = auto_callout(code, ptr, cd);
2313     }
2314     goto NORMAL_CHAR;
2315     }
2316     }
2317    
2318     /* Fill in length of a previous callout, except when the next thing is
2319     a quantifier. */
2320    
2321     is_quantifier = c == '*' || c == '+' || c == '?' ||
2322     (c == '{' && is_counted_repeat(ptr+1));
2323    
2324     if (!is_quantifier && previous_callout != NULL &&
2325     after_manual_callout-- <= 0)
2326     {
2327 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2328     complete_callout(previous_callout, ptr, cd);
2329 nigel 77 previous_callout = NULL;
2330     }
2331    
2332     /* In extended mode, skip white space and comments */
2333    
2334     if ((options & PCRE_EXTENDED) != 0)
2335     {
2336     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2337     if (c == '#')
2338     {
2339 nigel 93 while (*(++ptr) != 0)
2340 nigel 91 {
2341 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2342 nigel 91 }
2343 nigel 93 if (*ptr != 0) continue;
2344    
2345 nigel 91 /* Else fall through to handle end of string */
2346     c = 0;
2347 nigel 77 }
2348     }
2349    
2350     /* No auto callout for quantifiers. */
2351    
2352     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2353     {
2354     previous_callout = code;
2355     code = auto_callout(code, ptr, cd);
2356     }
2357    
2358     switch(c)
2359     {
2360 nigel 93 /* ===================================================================*/
2361     case 0: /* The branch terminates at string end */
2362     case '|': /* or | or ) */
2363 nigel 77 case ')':
2364     *firstbyteptr = firstbyte;
2365     *reqbyteptr = reqbyte;
2366     *codeptr = code;
2367     *ptrptr = ptr;
2368 nigel 93 if (lengthptr != NULL)
2369     {
2370     *lengthptr += code - last_code; /* To include callout length */
2371     DPRINTF((">> end branch\n"));
2372     }
2373 nigel 77 return TRUE;
2374    
2375 nigel 93
2376     /* ===================================================================*/
2377 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2378     the setting of any following char as a first character. */
2379    
2380     case '^':
2381     if ((options & PCRE_MULTILINE) != 0)
2382     {
2383     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2384     }
2385     previous = NULL;
2386     *code++ = OP_CIRC;
2387     break;
2388    
2389     case '$':
2390     previous = NULL;
2391     *code++ = OP_DOLL;
2392     break;
2393    
2394     /* There can never be a first char if '.' is first, whatever happens about
2395     repeats. The value of reqbyte doesn't change either. */
2396    
2397     case '.':
2398     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2399     zerofirstbyte = firstbyte;
2400     zeroreqbyte = reqbyte;
2401     previous = code;
2402     *code++ = OP_ANY;
2403     break;
2404    
2405 nigel 93
2406     /* ===================================================================*/
2407 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2408     32-byte bitmap of the permitted characters, except in the special case
2409     where there is only one such character. For negated classes, we build the
2410     map as usual, then invert it at the end. However, we use a different opcode
2411     so that data characters > 255 can be handled correctly.
2412 nigel 77
2413     If the class contains characters outside the 0-255 range, a different
2414     opcode is compiled. It may optionally have a bit map for characters < 256,
2415     but those above are explicitly listed afterwards. A flag byte tells
2416     whether the bitmap is present, and whether this is a negated class or not.
2417     */
2418    
2419     case '[':
2420     previous = code;
2421    
2422     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2423     they are encountered at the top level, so we'll do that too. */
2424    
2425     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2426     check_posix_syntax(ptr, &tempptr, cd))
2427     {
2428     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2429     goto FAILED;
2430     }
2431    
2432     /* If the first character is '^', set the negation flag and skip it. */
2433    
2434     if ((c = *(++ptr)) == '^')
2435     {
2436     negate_class = TRUE;
2437     c = *(++ptr);
2438     }
2439     else
2440     {
2441     negate_class = FALSE;
2442     }
2443    
2444     /* Keep a count of chars with values < 256 so that we can optimize the case
2445 nigel 93 of just a single character (as long as it's < 256). However, for higher
2446     valued UTF-8 characters, we don't yet do any optimization. */
2447 nigel 77
2448     class_charcount = 0;
2449     class_lastchar = -1;
2450    
2451 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2452     temporary bit of memory, in case the class contains only 1 character (less
2453     than 256), because in that case the compiled code doesn't use the bit map.
2454     */
2455    
2456     memset(classbits, 0, 32 * sizeof(uschar));
2457    
2458 nigel 77 #ifdef SUPPORT_UTF8
2459     class_utf8 = FALSE; /* No chars >= 256 */
2460 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2461 nigel 77 #endif
2462    
2463     /* Process characters until ] is reached. By writing this as a "do" it
2464 nigel 93 means that an initial ] is taken as a data character. At the start of the
2465     loop, c contains the first byte of the character. */
2466 nigel 77
2467 nigel 93 if (c != 0) do
2468 nigel 77 {
2469 nigel 93 const uschar *oldptr;
2470    
2471 nigel 77 #ifdef SUPPORT_UTF8
2472     if (utf8 && c > 127)
2473     { /* Braces are required because the */
2474     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2475     }
2476     #endif
2477    
2478     /* Inside \Q...\E everything is literal except \E */
2479    
2480     if (inescq)
2481     {
2482 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2483 nigel 77 {
2484 nigel 93 inescq = FALSE; /* Reset literal state */
2485     ptr++; /* Skip the 'E' */
2486     continue; /* Carry on with next */
2487 nigel 77 }
2488 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2489 nigel 77 }
2490    
2491     /* Handle POSIX class names. Perl allows a negation extension of the
2492     form [:^name:]. A square bracket that doesn't match the syntax is
2493     treated as a literal. We also recognize the POSIX constructions
2494     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2495     5.6 and 5.8 do. */
2496    
2497     if (c == '[' &&
2498     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2499     check_posix_syntax(ptr, &tempptr, cd))
2500     {
2501     BOOL local_negate = FALSE;
2502 nigel 87 int posix_class, taboffset, tabopt;
2503 nigel 77 register const uschar *cbits = cd->cbits;
2504 nigel 87 uschar pbits[32];
2505 nigel 77
2506     if (ptr[1] != ':')
2507     {
2508     *errorcodeptr = ERR31;
2509     goto FAILED;
2510     }
2511    
2512     ptr += 2;
2513     if (*ptr == '^')
2514     {
2515     local_negate = TRUE;
2516     ptr++;
2517     }
2518    
2519     posix_class = check_posix_name(ptr, tempptr - ptr);
2520     if (posix_class < 0)
2521     {
2522     *errorcodeptr = ERR30;
2523     goto FAILED;
2524     }
2525    
2526     /* If matching is caseless, upper and lower are converted to
2527     alpha. This relies on the fact that the class table starts with
2528     alpha, lower, upper as the first 3 entries. */
2529    
2530     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2531     posix_class = 0;
2532    
2533 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2534     because we may be adding and subtracting from it, and we don't want to
2535     subtract bits that may be in the main map already. At the end we or the
2536     result into the bit map that is being built. */
2537 nigel 77
2538     posix_class *= 3;
2539 nigel 87
2540     /* Copy in the first table (always present) */
2541    
2542     memcpy(pbits, cbits + posix_class_maps[posix_class],
2543     32 * sizeof(uschar));
2544    
2545     /* If there is a second table, add or remove it as required. */
2546    
2547     taboffset = posix_class_maps[posix_class + 1];
2548     tabopt = posix_class_maps[posix_class + 2];
2549    
2550     if (taboffset >= 0)
2551 nigel 77 {
2552 nigel 87 if (tabopt >= 0)
2553     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2554 nigel 77 else
2555 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2556 nigel 77 }
2557    
2558 nigel 87 /* Now see if we need to remove any special characters. An option
2559     value of 1 removes vertical space and 2 removes underscore. */
2560    
2561     if (tabopt < 0) tabopt = -tabopt;
2562     if (tabopt == 1) pbits[1] &= ~0x3c;
2563     else if (tabopt == 2) pbits[11] &= 0x7f;
2564    
2565     /* Add the POSIX table or its complement into the main table that is
2566     being built and we are done. */
2567    
2568     if (local_negate)
2569     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2570     else
2571     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2572    
2573 nigel 77 ptr = tempptr + 1;
2574     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2575     continue; /* End of POSIX syntax handling */
2576     }
2577    
2578     /* Backslash may introduce a single character, or it may introduce one
2579 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2580     case. Inside a class (and only there) it is treated as backspace.
2581     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2582     to or into the one we are building. We assume they have more than one
2583 nigel 77 character in them, so set class_charcount bigger than one. */
2584    
2585     if (c == '\\')
2586     {
2587 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2588     if (*errorcodeptr != 0) goto FAILED;
2589 nigel 77
2590     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2591     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2592 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2593 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2594     {
2595     if (ptr[1] == '\\' && ptr[2] == 'E')
2596     {
2597     ptr += 2; /* avoid empty string */
2598     }
2599     else inescq = TRUE;
2600     continue;
2601     }
2602    
2603     if (c < 0)
2604     {
2605     register const uschar *cbits = cd->cbits;
2606     class_charcount += 2; /* Greater than 1 is what matters */
2607 nigel 93
2608     /* Save time by not doing this in the pre-compile phase. */
2609    
2610     if (lengthptr == NULL) switch (-c)
2611 nigel 77 {
2612     case ESC_d:
2613     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2614     continue;
2615    
2616     case ESC_D:
2617     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2618     continue;
2619    
2620     case ESC_w:
2621     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2622     continue;
2623    
2624     case ESC_W:
2625     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2626     continue;
2627    
2628     case ESC_s:
2629     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2630     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2631     continue;
2632    
2633     case ESC_S:
2634     for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2635     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2636     continue;
2637    
2638 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2639     continue;
2640 ph10 180
2641 nigel 93 default: /* Not recognized; fall through */
2642     break; /* Need "default" setting to stop compiler warning. */
2643     }
2644    
2645     /* In the pre-compile phase, just do the recognition. */
2646    
2647     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2648     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2649 ph10 180
2650 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2651     they use extra memory. */
2652 ph10 180
2653 ph10 178 if (-c == ESC_h)
2654     {
2655     SETBIT(classbits, 0x09); /* VT */
2656     SETBIT(classbits, 0x20); /* SPACE */
2657 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2658 ph10 178 #ifdef SUPPORT_UTF8
2659     if (utf8)
2660 ph10 180 {
2661 ph10 178 class_utf8 = TRUE;
2662     *class_utf8data++ = XCL_SINGLE;
2663 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2664 ph10 178 *class_utf8data++ = XCL_SINGLE;
2665 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2666     *class_utf8data++ = XCL_RANGE;
2667     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2668     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2669 ph10 178 *class_utf8data++ = XCL_SINGLE;
2670 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2671 ph10 178 *class_utf8data++ = XCL_SINGLE;
2672 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2673 ph10 178 *class_utf8data++ = XCL_SINGLE;
2674 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2675     }
2676     #endif
2677     continue;
2678     }
2679 nigel 93
2680 ph10 178 if (-c == ESC_H)
2681     {
2682     for (c = 0; c < 32; c++)
2683     {
2684     int x = 0xff;
2685     switch (c)
2686 ph10 180 {
2687 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2688     case 0x20/8: x ^= 1 << (0x20%8); break;
2689     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2690     default: break;
2691     }
2692     classbits[c] |= x;
2693 ph10 180 }
2694    
2695 ph10 178 #ifdef SUPPORT_UTF8
2696     if (utf8)
2697 ph10 180 {
2698 ph10 178 class_utf8 = TRUE;
2699 ph10 180 *class_utf8data++ = XCL_RANGE;
2700     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2701     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2702     *class_utf8data++ = XCL_RANGE;
2703     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2704     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2705     *class_utf8data++ = XCL_RANGE;
2706     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2707     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2708     *class_utf8data++ = XCL_RANGE;
2709     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2710     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2711     *class_utf8data++ = XCL_RANGE;
2712     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2713     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2714     *class_utf8data++ = XCL_RANGE;
2715     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2716     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2717     *class_utf8data++ = XCL_RANGE;
2718     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2719     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2720     }
2721     #endif
2722     continue;
2723     }
2724 ph10 178
2725     if (-c == ESC_v)
2726     {
2727     SETBIT(classbits, 0x0a); /* LF */
2728     SETBIT(classbits, 0x0b); /* VT */
2729 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2730     SETBIT(classbits, 0x0d); /* CR */
2731     SETBIT(classbits, 0x85); /* NEL */
2732 ph10 178 #ifdef SUPPORT_UTF8
2733     if (utf8)
2734 ph10 180 {
2735 ph10 178 class_utf8 = TRUE;
2736 ph10 180 *class_utf8data++ = XCL_RANGE;
2737     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2738     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2739     }
2740     #endif
2741     continue;
2742     }
2743 ph10 178
2744     if (-c == ESC_V)
2745     {
2746     for (c = 0; c < 32; c++)
2747     {
2748     int x = 0xff;
2749     switch (c)
2750 ph10 180 {
2751 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2752     x ^= 1 << (0x0b%8);
2753     x ^= 1 << (0x0c%8);
2754 ph10 180 x ^= 1 << (0x0d%8);
2755 ph10 178 break;
2756     case 0x85/8: x ^= 1 << (0x85%8); break;
2757     default: break;
2758     }
2759     classbits[c] |= x;
2760 ph10 180 }
2761    
2762 ph10 178 #ifdef SUPPORT_UTF8
2763     if (utf8)
2764 ph10 180 {
2765 ph10 178 class_utf8 = TRUE;
2766 ph10 180 *class_utf8data++ = XCL_RANGE;
2767     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2768     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2769     *class_utf8data++ = XCL_RANGE;
2770     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2771     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2772     }
2773     #endif
2774     continue;
2775     }
2776 ph10 178
2777 nigel 93 /* We need to deal with \P and \p in both phases. */
2778    
2779 nigel 77 #ifdef SUPPORT_UCP
2780 nigel 93 if (-c == ESC_p || -c == ESC_P)
2781     {
2782     BOOL negated;
2783     int pdata;
2784     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2785     if (ptype < 0) goto FAILED;
2786     class_utf8 = TRUE;
2787     *class_utf8data++ = ((-c == ESC_p) != negated)?
2788     XCL_PROP : XCL_NOTPROP;
2789     *class_utf8data++ = ptype;
2790     *class_utf8data++ = pdata;
2791     class_charcount -= 2; /* Not a < 256 character */
2792 nigel 77 continue;
2793 nigel 93 }
2794 nigel 77 #endif
2795 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2796     strict mode. By default, for compatibility with Perl, they are
2797     treated as literals. */
2798 nigel 77
2799 nigel 93 if ((options & PCRE_EXTRA) != 0)
2800     {
2801     *errorcodeptr = ERR7;
2802     goto FAILED;
2803     }
2804 nigel 77
2805 nigel 93 class_charcount -= 2; /* Undo the default count from above */
2806     c = *ptr; /* Get the final character and fall through */
2807 nigel 77 }
2808    
2809     /* Fall through if we have a single character (c >= 0). This may be
2810 nigel 93 greater than 256 in UTF-8 mode. */
2811 nigel 77
2812     } /* End of backslash handling */
2813    
2814     /* A single character may be followed by '-' to form a range. However,
2815     Perl does not permit ']' to be the end of the range. A '-' character
2816 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
2817     entirely. The code for handling \Q and \E is messy. */
2818 nigel 77
2819 nigel 93 CHECK_RANGE:
2820     while (ptr[1] == '\\' && ptr[2] == 'E')
2821 nigel 77 {
2822 nigel 93 inescq = FALSE;
2823     ptr += 2;
2824     }
2825    
2826     oldptr = ptr;
2827    
2828     if (!inescq && ptr[1] == '-')
2829     {
2830 nigel 77 int d;
2831     ptr += 2;
2832 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2833 nigel 77
2834 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
2835     mode. */
2836    
2837     while (*ptr == '\\' && ptr[1] == 'Q')
2838     {
2839     ptr += 2;
2840     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2841     inescq = TRUE;
2842     break;
2843     }
2844    
2845     if (*ptr == 0 || (!inescq && *ptr == ']'))
2846     {
2847     ptr = oldptr;
2848     goto LONE_SINGLE_CHARACTER;
2849     }
2850    
2851 nigel 77 #ifdef SUPPORT_UTF8
2852     if (utf8)
2853     { /* Braces are required because the */
2854     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2855     }
2856     else
2857     #endif
2858     d = *ptr; /* Not UTF-8 mode */
2859    
2860     /* The second part of a range can be a single-character escape, but
2861     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2862     in such circumstances. */
2863    
2864 nigel 93 if (!inescq && d == '\\')
2865 nigel 77 {
2866 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2867     if (*errorcodeptr != 0) goto FAILED;
2868 nigel 77
2869 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
2870     special means the '-' was literal */
2871 nigel 77
2872     if (d < 0)
2873     {
2874     if (d == -ESC_b) d = '\b';
2875 nigel 93 else if (d == -ESC_X) d = 'X';
2876     else if (d == -ESC_R) d = 'R'; else
2877 nigel 77 {
2878 nigel 93 ptr = oldptr;
2879 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2880     }
2881     }
2882     }
2883    
2884 nigel 93 /* Check that the two values are in the correct order. Optimize
2885     one-character ranges */
2886 nigel 77
2887 nigel 93 if (d < c)
2888     {
2889     *errorcodeptr = ERR8;
2890     goto FAILED;
2891     }
2892    
2893 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2894    
2895     /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2896     matching, we have to use an XCLASS with extra data items. Caseless
2897     matching for characters > 127 is available only if UCP support is
2898     available. */
2899    
2900     #ifdef SUPPORT_UTF8
2901     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2902     {
2903     class_utf8 = TRUE;
2904    
2905     /* With UCP support, we can find the other case equivalents of
2906     the relevant characters. There may be several ranges. Optimize how
2907     they fit with the basic range. */
2908    
2909     #ifdef SUPPORT_UCP
2910     if ((options & PCRE_CASELESS) != 0)
2911     {
2912 nigel 93 unsigned int occ, ocd;
2913     unsigned int cc = c;
2914     unsigned int origd = d;
2915 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
2916     {
2917 ph10 180 if (occ >= (unsigned int)c &&
2918     ocd <= (unsigned int)d)
2919 ph10 176 continue; /* Skip embedded ranges */
2920 nigel 77
2921 ph10 180 if (occ < (unsigned int)c &&
2922 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
2923 nigel 77 { /* if there is overlap, */
2924     c = occ; /* noting that if occ < c */
2925     continue; /* we can't have ocd > d */
2926     } /* because a subrange is */
2927 ph10 180 if (ocd > (unsigned int)d &&
2928 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
2929 nigel 77 { /* the basic range. */
2930     d = ocd;
2931     continue;
2932     }
2933    
2934     if (occ == ocd)
2935     {
2936     *class_utf8data++ = XCL_SINGLE;
2937     }
2938     else
2939     {
2940     *class_utf8data++ = XCL_RANGE;
2941     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2942     }
2943     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2944     }
2945     }
2946     #endif /* SUPPORT_UCP */
2947    
2948     /* Now record the original range, possibly modified for UCP caseless
2949     overlapping ranges. */
2950    
2951     *class_utf8data++ = XCL_RANGE;
2952     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2953     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2954    
2955     /* With UCP support, we are done. Without UCP support, there is no
2956     caseless matching for UTF-8 characters > 127; we can use the bit map
2957     for the smaller ones. */
2958    
2959     #ifdef SUPPORT_UCP
2960     continue; /* With next character in the class */
2961     #else
2962     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2963    
2964     /* Adjust upper limit and fall through to set up the map */
2965    
2966     d = 127;
2967    
2968     #endif /* SUPPORT_UCP */
2969     }
2970     #endif /* SUPPORT_UTF8 */
2971    
2972     /* We use the bit map for all cases when not in UTF-8 mode; else
2973     ranges that lie entirely within 0-127 when there is UCP support; else
2974     for partial ranges without UCP support. */
2975    
2976 nigel 93 class_charcount += d - c + 1;
2977     class_lastchar = d;
2978    
2979     /* We can save a bit of time by skipping this in the pre-compile. */
2980    
2981     if (lengthptr == NULL) for (; c <= d; c++)
2982 nigel 77 {
2983     classbits[c/8] |= (1 << (c&7));
2984     if ((options & PCRE_CASELESS) != 0)
2985     {
2986     int uc = cd->fcc[c]; /* flip case */
2987     classbits[uc/8] |= (1 << (uc&7));
2988     }
2989     }
2990    
2991     continue; /* Go get the next char in the class */
2992     }
2993    
2994     /* Handle a lone single character - we can get here for a normal
2995     non-escape char, or after \ that introduces a single character or for an
2996     apparent range that isn't. */
2997    
2998     LONE_SINGLE_CHARACTER:
2999    
3000     /* Handle a character that cannot go in the bit map */
3001    
3002     #ifdef SUPPORT_UTF8
3003     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3004     {
3005     class_utf8 = TRUE;
3006     *class_utf8data++ = XCL_SINGLE;
3007     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3008    
3009     #ifdef SUPPORT_UCP
3010     if ((options & PCRE_CASELESS) != 0)
3011     {
3012 nigel 93 unsigned int othercase;
3013     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3014 nigel 77 {
3015     *class_utf8data++ = XCL_SINGLE;
3016     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3017     }
3018     }
3019     #endif /* SUPPORT_UCP */
3020    
3021     }
3022     else
3023     #endif /* SUPPORT_UTF8 */
3024    
3025     /* Handle a single-byte character */
3026     {
3027     classbits[c/8] |= (1 << (c&7));
3028     if ((options & PCRE_CASELESS) != 0)
3029     {
3030     c = cd->fcc[c]; /* flip case */
3031     classbits[c/8] |= (1 << (c&7));
3032     }
3033     class_charcount++;
3034     class_lastchar = c;
3035     }
3036     }
3037    
3038 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3039 nigel 77
3040 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3041 nigel 77
3042 nigel 93 if (c == 0) /* Missing terminating ']' */
3043     {
3044     *errorcodeptr = ERR6;
3045     goto FAILED;
3046     }
3047    
3048 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3049     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
3050     can optimize the negative case only if there were no characters >= 128
3051     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
3052     single-bytes only. This is an historical hangover. Maybe one day we can
3053     tidy these opcodes to handle multi-byte characters.
3054    
3055     The optimization throws away the bit map. We turn the item into a
3056     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3057     that OP_NOT does not support multibyte characters. In the positive case, it
3058     can cause firstbyte to be set. Otherwise, there can be no first char if
3059     this item is first, whatever repeat count may follow. In the case of
3060     reqbyte, save the previous value for reinstating. */
3061    
3062     #ifdef SUPPORT_UTF8
3063     if (class_charcount == 1 &&
3064     (!utf8 ||
3065     (!class_utf8 && (!negate_class || class_lastchar < 128))))
3066    
3067     #else
3068     if (class_charcount == 1)
3069     #endif
3070     {
3071     zeroreqbyte = reqbyte;
3072    
3073     /* The OP_NOT opcode works on one-byte characters only. */
3074    
3075     if (negate_class)
3076     {
3077     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3078     zerofirstbyte = firstbyte;
3079     *code++ = OP_NOT;
3080     *code++ = class_lastchar;
3081     break;
3082     }
3083    
3084     /* For a single, positive character, get the value into mcbuffer, and
3085     then we can handle this with the normal one-character code. */
3086    
3087     #ifdef SUPPORT_UTF8
3088     if (utf8 && class_lastchar > 127)
3089     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3090     else
3091     #endif
3092     {
3093     mcbuffer[0] = class_lastchar;
3094     mclength = 1;
3095     }
3096     goto ONE_CHAR;
3097     } /* End of 1-char optimization */
3098    
3099     /* The general case - not the one-char optimization. If this is the first
3100     thing in the branch, there can be no first char setting, whatever the
3101     repeat count. Any reqbyte setting must remain unchanged after any kind of
3102     repeat. */
3103    
3104     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3105     zerofirstbyte = firstbyte;
3106     zeroreqbyte = reqbyte;
3107    
3108     /* If there are characters with values > 255, we have to compile an
3109     extended class, with its own opcode. If there are no characters < 256,
3110 nigel 93 we can omit the bitmap in the actual compiled code. */
3111 nigel 77
3112     #ifdef SUPPORT_UTF8
3113     if (class_utf8)
3114     {
3115     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3116     *code++ = OP_XCLASS;
3117     code += LINK_SIZE;
3118     *code = negate_class? XCL_NOT : 0;
3119    
3120 nigel 93 /* If the map is required, move up the extra data to make room for it;
3121     otherwise just move the code pointer to the end of the extra data. */
3122 nigel 77
3123     if (class_charcount > 0)
3124     {
3125     *code++ |= XCL_MAP;
3126 nigel 93 memmove(code + 32, code, class_utf8data - code);
3127 nigel 77 memcpy(code, classbits, 32);
3128 nigel 93 code = class_utf8data + 32;
3129 nigel 77 }
3130 nigel 93 else code = class_utf8data;
3131 nigel 77
3132     /* Now fill in the complete length of the item */
3133    
3134     PUT(previous, 1, code - previous);
3135     break; /* End of class handling */
3136     }
3137     #endif
3138    
3139     /* If there are no characters > 255, negate the 32-byte map if necessary,
3140     and copy it into the code vector. If this is the first thing in the branch,
3141     there can be no first char setting, whatever the repeat count. Any reqbyte
3142     setting must remain unchanged after any kind of repeat. */
3143    
3144     if (negate_class)
3145     {
3146     *code++ = OP_NCLASS;
3147 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3148     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3149 nigel 77 }
3150     else
3151     {
3152     *code++ = OP_CLASS;
3153     memcpy(code, classbits, 32);
3154     }
3155     code += 32;
3156     break;
3157    
3158 nigel 93
3159     /* ===================================================================*/
3160 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3161     has been tested above. */
3162    
3163     case '{':
3164     if (!is_quantifier) goto NORMAL_CHAR;
3165     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3166     if (*errorcodeptr != 0) goto FAILED;
3167     goto REPEAT;
3168    
3169     case '*':
3170     repeat_min = 0;
3171     repeat_max = -1;
3172     goto REPEAT;
3173    
3174     case '+':
3175     repeat_min = 1;
3176     repeat_max = -1;
3177     goto REPEAT;
3178    
3179     case '?':
3180     repeat_min = 0;
3181     repeat_max = 1;
3182    
3183     REPEAT:
3184     if (previous == NULL)
3185     {
3186     *errorcodeptr = ERR9;
3187     goto FAILED;
3188     }
3189    
3190     if (repeat_min == 0)
3191     {
3192     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3193     reqbyte = zeroreqbyte; /* Ditto */
3194     }
3195    
3196     /* Remember whether this is a variable length repeat */
3197    
3198     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3199    
3200     op_type = 0; /* Default single-char op codes */
3201     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3202    
3203     /* Save start of previous item, in case we have to move it up to make space
3204     for an inserted OP_ONCE for the additional '+' extension. */
3205    
3206     tempcode = previous;
3207    
3208     /* If the next character is '+', we have a possessive quantifier. This
3209     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3210     If the next character is '?' this is a minimizing repeat, by default,
3211     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3212     repeat type to the non-default. */
3213    
3214     if (ptr[1] == '+')
3215     {
3216     repeat_type = 0; /* Force greedy */
3217     possessive_quantifier = TRUE;
3218     ptr++;
3219     }
3220     else if (ptr[1] == '?')
3221     {
3222     repeat_type = greedy_non_default;
3223     ptr++;
3224     }
3225     else repeat_type = greedy_default;
3226    
3227     /* If previous was a character match, abolish the item and generate a
3228     repeat item instead. If a char item has a minimum of more than one, ensure
3229     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3230     the first thing in a branch because the x will have gone into firstbyte
3231     instead. */
3232    
3233     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3234     {
3235     /* Deal with UTF-8 characters that take up more than one byte. It's
3236     easier to write this out separately than try to macrify it. Use c to
3237     hold the length of the character in bytes, plus 0x80 to flag that it's a
3238     length rather than a small character. */
3239    
3240     #ifdef SUPPORT_UTF8
3241     if (utf8 && (code[-1] & 0x80) != 0)
3242     {
3243     uschar *lastchar = code - 1;
3244     while((*lastchar & 0xc0) == 0x80) lastchar--;
3245     c = code - lastchar; /* Length of UTF-8 character */
3246     memcpy(utf8_char, lastchar, c); /* Save the char */
3247     c |= 0x80; /* Flag c as a length */
3248     }
3249     else
3250     #endif
3251    
3252     /* Handle the case of a single byte - either with no UTF8 support, or
3253     with UTF-8 disabled, or for a UTF-8 character < 128. */
3254    
3255     {
3256     c = code[-1];
3257     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3258     }
3259    
3260 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3261     the line is something that cannot possibly match this character. If so,
3262     automatically possessifying this item gains some performance in the case
3263     where the match fails. */
3264    
3265     if (!possessive_quantifier &&
3266     repeat_max < 0 &&
3267     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3268     options, cd))
3269     {
3270     repeat_type = 0; /* Force greedy */
3271     possessive_quantifier = TRUE;
3272     }
3273    
3274 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3275     }
3276    
3277     /* If previous was a single negated character ([^a] or similar), we use
3278     one of the special opcodes, replacing it. The code is shared with single-
3279     character repeats by setting op_type to add a suitable offset into
3280 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3281     currently used only for single-byte chars. */
3282 nigel 77
3283     else if (*previous == OP_NOT)
3284     {
3285     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3286     c = previous[1];
3287 nigel 93 if (!possessive_quantifier &&
3288     repeat_max < 0 &&
3289     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3290     {
3291     repeat_type = 0; /* Force greedy */
3292     possessive_quantifier = TRUE;
3293     }
3294 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3295     }
3296    
3297     /* If previous was a character type match (\d or similar), abolish it and
3298     create a suitable repeat item. The code is shared with single-character
3299     repeats by setting op_type to add a suitable offset into repeat_type. Note
3300     that the Unicode property types will be present only when SUPPORT_UCP is
3301     defined, but we don't wrap the little bits of code here because it just
3302     makes it horribly messy. */
3303    
3304     else if (*previous < OP_EODN)
3305     {
3306     uschar *oldcode;
3307 nigel 87 int prop_type, prop_value;
3308 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3309     c = *previous;
3310    
3311 nigel 93 if (!possessive_quantifier &&
3312     repeat_max < 0 &&
3313     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3314     {
3315     repeat_type = 0; /* Force greedy */
3316     possessive_quantifier = TRUE;
3317     }
3318    
3319 nigel 77 OUTPUT_SINGLE_REPEAT:
3320 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3321     {
3322     prop_type = previous[1];
3323     prop_value = previous[2];
3324     }
3325     else prop_type = prop_value = -1;
3326 nigel 77
3327     oldcode = code;
3328     code = previous; /* Usually overwrite previous item */
3329    
3330     /* If the maximum is zero then the minimum must also be zero; Perl allows
3331     this case, so we do too - by simply omitting the item altogether. */
3332    
3333     if (repeat_max == 0) goto END_REPEAT;
3334    
3335     /* All real repeats make it impossible to handle partial matching (maybe
3336     one day we will be able to remove this restriction). */
3337    
3338     if (repeat_max != 1) cd->nopartial = TRUE;
3339    
3340     /* Combine the op_type with the repeat_type */
3341    
3342     repeat_type += op_type;
3343    
3344     /* A minimum of zero is handled either as the special case * or ?, or as
3345     an UPTO, with the maximum given. */
3346    
3347     if (repeat_min == 0)
3348     {
3349     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3350     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3351     else
3352     {
3353     *code++ = OP_UPTO + repeat_type;
3354     PUT2INC(code, 0, repeat_max);
3355     }
3356     }
3357    
3358     /* A repeat minimum of 1 is optimized into some special cases. If the
3359 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3360 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3361     one less than the maximum. */
3362    
3363     else if (repeat_min == 1)
3364     {
3365     if (repeat_max == -1)
3366     *code++ = OP_PLUS + repeat_type;
3367     else
3368     {
3369     code = oldcode; /* leave previous item in place */
3370     if (repeat_max == 1) goto END_REPEAT;
3371     *code++ = OP_UPTO + repeat_type;
3372     PUT2INC(code, 0, repeat_max - 1);
3373     }
3374     }
3375    
3376     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3377     handled as an EXACT followed by an UPTO. */
3378    
3379     else
3380     {
3381     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3382     PUT2INC(code, 0, repeat_min);
3383    
3384     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3385     we have to insert the character for the previous code. For a repeated
3386 nigel 87 Unicode property match, there are two extra bytes that define the
3387 nigel 77 required property. In UTF-8 mode, long characters have their length in
3388     c, with the 0x80 bit as a flag. */
3389    
3390     if (repeat_max < 0)
3391     {
3392     #ifdef SUPPORT_UTF8
3393     if (utf8 && c >= 128)
3394     {
3395     memcpy(code, utf8_char, c & 7);
3396     code += c & 7;
3397     }
3398     else
3399     #endif
3400     {
3401     *code++ = c;
3402 nigel 87 if (prop_type >= 0)
3403     {
3404     *code++ = prop_type;
3405     *code++ = prop_value;
3406     }
3407 nigel 77 }
3408     *code++ = OP_STAR + repeat_type;
3409     }
3410    
3411     /* Else insert an UPTO if the max is greater than the min, again
3412 nigel 93 preceded by the character, for the previously inserted code. If the
3413     UPTO is just for 1 instance, we can use QUERY instead. */
3414 nigel 77
3415     else if (repeat_max != repeat_min)
3416     {
3417     #ifdef SUPPORT_UTF8
3418     if (utf8 && c >= 128)
3419     {
3420     memcpy(code, utf8_char, c & 7);
3421     code += c & 7;
3422     }
3423     else
3424     #endif
3425     *code++ = c;
3426 nigel 87 if (prop_type >= 0)
3427     {
3428     *code++ = prop_type;
3429     *code++ = prop_value;
3430     }
3431 nigel 77 repeat_max -= repeat_min;
3432 nigel 93
3433     if (repeat_max == 1)
3434     {
3435     *code++ = OP_QUERY + repeat_type;
3436     }
3437     else
3438     {
3439     *code++ = OP_UPTO + repeat_type;
3440     PUT2INC(code, 0, repeat_max);
3441     }
3442 nigel 77 }
3443     }
3444    
3445     /* The character or character type itself comes last in all cases. */
3446    
3447     #ifdef SUPPORT_UTF8
3448     if (utf8 && c >= 128)
3449     {
3450     memcpy(code, utf8_char, c & 7);
3451     code += c & 7;
3452     }
3453     else
3454     #endif
3455     *code++ = c;
3456    
3457 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3458     define the required property. */
3459 nigel 77
3460     #ifdef SUPPORT_UCP
3461 nigel 87 if (prop_type >= 0)
3462     {
3463     *code++ = prop_type;
3464     *code++ = prop_value;
3465     }
3466 nigel 77 #endif
3467     }
3468    
3469     /* If previous was a character class or a back reference, we put the repeat
3470     stuff after it, but just skip the item if the repeat was {0,0}. */
3471    
3472     else if (*previous == OP_CLASS ||
3473     *previous == OP_NCLASS ||
3474     #ifdef SUPPORT_UTF8
3475     *previous == OP_XCLASS ||
3476     #endif
3477     *previous == OP_REF)
3478     {
3479     if (repeat_max == 0)
3480     {
3481     code = previous;
3482     goto END_REPEAT;
3483     }
3484    
3485     /* All real repeats make it impossible to handle partial matching (maybe
3486     one day we will be able to remove this restriction). */
3487    
3488     if (repeat_max != 1) cd->nopartial = TRUE;
3489    
3490     if (repeat_min == 0 && repeat_max == -1)
3491     *code++ = OP_CRSTAR + repeat_type;
3492     else if (repeat_min == 1 && repeat_max == -1)
3493     *code++ = OP_CRPLUS + repeat_type;
3494     else if (repeat_min == 0 && repeat_max == 1)
3495     *code++ = OP_CRQUERY + repeat_type;
3496     else
3497     {
3498     *code++ = OP_CRRANGE + repeat_type;
3499     PUT2INC(code, 0, repeat_min);
3500     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3501     PUT2INC(code, 0, repeat_max);
3502     }
3503     }
3504    
3505     /* If previous was a bracket group, we may have to replicate it in certain
3506     cases. */
3507    
3508 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3509     *previous == OP_ONCE || *previous == OP_COND)
3510 nigel 77 {
3511     register int i;
3512     int ketoffset = 0;
3513     int len = code - previous;
3514     uschar *bralink = NULL;
3515    
3516 nigel 93 /* Repeating a DEFINE group is pointless */
3517    
3518     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3519     {
3520     *errorcodeptr = ERR55;
3521     goto FAILED;
3522     }
3523    
3524     /* This is a paranoid check to stop integer overflow later on */
3525    
3526     if (len > MAX_DUPLENGTH)
3527     {
3528     *errorcodeptr = ERR50;
3529     goto FAILED;
3530     }
3531    
3532 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3533     by scanning through from the start, and compute the offset back to it
3534     from the current code pointer. There may be an OP_OPT setting following
3535     the final KET, so we can't find the end just by going back from the code
3536     pointer. */
3537    
3538     if (repeat_max == -1)
3539     {
3540     register uschar *ket = previous;
3541     do ket += GET(ket, 1); while (*ket != OP_KET);
3542     ketoffset = code - ket;
3543     }
3544    
3545     /* The case of a zero minimum is special because of the need to stick
3546     OP_BRAZERO in front of it, and because the group appears once in the
3547     data, whereas in other cases it appears the minimum number of times. For
3548     this reason, it is simplest to treat this case separately, as otherwise
3549     the code gets far too messy. There are several special subcases when the
3550     minimum is zero. */
3551    
3552     if (repeat_min == 0)
3553     {
3554     /* If the maximum is also zero, we just omit the group from the output
3555     altogether. */
3556    
3557     if (repeat_max == 0)
3558     {
3559     code = previous;
3560     goto END_REPEAT;
3561     }
3562    
3563     /* If the maximum is 1 or unlimited, we just have to stick in the
3564     BRAZERO and do no more at this point. However, we do need to adjust
3565     any OP_RECURSE calls inside the group that refer to the group itself or
3566 nigel 93 any internal or forward referenced group, because the offset is from
3567     the start of the whole regex. Temporarily terminate the pattern while
3568     doing this. */
3569 nigel 77
3570     if (repeat_max <= 1)
3571     {
3572     *code = OP_END;
3573 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3574 nigel 77 memmove(previous+1, previous, len);
3575     code++;
3576     *previous++ = OP_BRAZERO + repeat_type;
3577     }
3578    
3579     /* If the maximum is greater than 1 and limited, we have to replicate
3580     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3581     The first one has to be handled carefully because it's the original
3582     copy, which has to be moved up. The remainder can be handled by code
3583     that is common with the non-zero minimum case below. We have to
3584     adjust the value of repeat_max, since one less copy is required. Once
3585     again, we may have to adjust any OP_RECURSE calls inside the group. */
3586    
3587     else
3588     {
3589     int offset;
3590     *code = OP_END;
3591 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3592 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3593     code += 2 + LINK_SIZE;
3594     *previous++ = OP_BRAZERO + repeat_type;
3595     *previous++ = OP_BRA;
3596    
3597     /* We chain together the bracket offset fields that have to be
3598     filled in later when the ends of the brackets are reached. */
3599    
3600     offset = (bralink == NULL)? 0 : previous - bralink;
3601     bralink = previous;
3602     PUTINC(previous, 0, offset);
3603     }
3604    
3605     repeat_max--;
3606     }
3607    
3608     /* If the minimum is greater than zero, replicate the group as many
3609     times as necessary, and adjust the maximum to the number of subsequent
3610     copies that we need. If we set a first char from the group, and didn't
3611 nigel 93 set a required char, copy the latter from the former. If there are any
3612     forward reference subroutine calls in the group, there will be entries on
3613     the workspace list; replicate these with an appropriate increment. */
3614 nigel 77
3615     else
3616     {
3617     if (repeat_min > 1)
3618     {
3619 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3620     just adjust the length as if we had. */
3621    
3622     if (lengthptr != NULL)
3623     *lengthptr += (repeat_min - 1)*length_prevgroup;
3624    
3625     /* This is compiling for real */
3626    
3627     else
3628 nigel 77 {
3629 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3630     for (i = 1; i < repeat_min; i++)
3631     {
3632     uschar *hc;
3633     uschar *this_hwm = cd->hwm;
3634     memcpy(code, previous, len);
3635     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3636     {
3637     PUT(cd->hwm, 0, GET(hc, 0) + len);
3638     cd->hwm += LINK_SIZE;
3639     }
3640     save_hwm = this_hwm;
3641     code += len;
3642     }
3643 nigel 77 }
3644     }
3645 nigel 93
3646 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3647     }
3648    
3649     /* This code is common to both the zero and non-zero minimum cases. If
3650     the maximum is limited, it replicates the group in a nested fashion,
3651     remembering the bracket starts on a stack. In the case of a zero minimum,
3652     the first one was set up above. In all cases the repeat_max now specifies
3653 nigel 93 the number of additional copies needed. Again, we must remember to
3654     replicate entries on the forward reference list. */
3655 nigel 77
3656     if (repeat_max >= 0)
3657     {
3658 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3659     just adjust the length as if we had. For each repetition we must add 1
3660     to the length for BRAZERO and for all but the last repetition we must
3661     add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3662    
3663     if (lengthptr != NULL && repeat_max > 0)
3664     *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3665     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3666    
3667     /* This is compiling for real */
3668    
3669     else for (i = repeat_max - 1; i >= 0; i--)
3670 nigel 77 {
3671 nigel 93 uschar *hc;
3672     uschar *this_hwm = cd->hwm;
3673    
3674 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3675    
3676     /* All but the final copy start a new nesting, maintaining the
3677     chain of brackets outstanding. */
3678    
3679     if (i != 0)
3680     {
3681     int offset;
3682     *code++ = OP_BRA;
3683     offset = (bralink == NULL)? 0 : code - bralink;
3684     bralink = code;
3685     PUTINC(code, 0, offset);
3686     }
3687    
3688     memcpy(code, previous, len);
3689 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3690     {
3691     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3692     cd->hwm += LINK_SIZE;
3693     }
3694     save_hwm = this_hwm;
3695 nigel 77 code += len;
3696     }
3697    
3698     /* Now chain through the pending brackets, and fill in their length
3699     fields (which are holding the chain links pro tem). */
3700    
3701     while (bralink != NULL)
3702     {
3703     int oldlinkoffset;
3704     int offset = code - bralink + 1;
3705     uschar *bra = code - offset;
3706     oldlinkoffset = GET(bra, 1);
3707     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3708     *code++ = OP_KET;
3709     PUTINC(code, 0, offset);
3710     PUT(bra, 1, offset);
3711     }
3712     }
3713    
3714     /* If the maximum is unlimited, set a repeater in the final copy. We
3715     can't just offset backwards from the current code point, because we
3716     don't know if there's been an options resetting after the ket. The
3717 nigel 93 correct offset was computed above.
3718 nigel 77
3719 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3720     this group is a non-atomic one that could match an empty string. If so,
3721     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3722     that runtime checking can be done. [This check is also applied to
3723     atomic groups at runtime, but in a different way.] */
3724    
3725     else
3726     {
3727     uschar *ketcode = code - ketoffset;
3728     uschar *bracode = ketcode - GET(ketcode, 1);
3729     *ketcode = OP_KETRMAX + repeat_type;
3730     if (lengthptr == NULL && *bracode != OP_ONCE)
3731     {
3732     uschar *scode = bracode;
3733     do
3734     {
3735     if (could_be_empty_branch(scode, ketcode, utf8))
3736     {
3737     *bracode += OP_SBRA - OP_BRA;
3738     break;
3739     }
3740     scode += GET(scode, 1);
3741     }
3742     while (*scode == OP_ALT);
3743     }
3744     }
3745 nigel 77 }
3746    
3747     /* Else there's some kind of shambles */
3748    
3749     else
3750     {
3751     *errorcodeptr = ERR11;
3752     goto FAILED;
3753     }
3754    
3755 nigel 93 /* If the character following a repeat is '+', or if certain optimization
3756     tests above succeeded, possessive_quantifier is TRUE. For some of the
3757     simpler opcodes, there is a special alternative opcode for this. For
3758     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3759     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3760     but the special opcodes can optimize it a bit. The repeated item starts at
3761     tempcode, not at previous, which might be the first part of a string whose
3762     (former) last char we repeated.
3763 nigel 77
3764 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3765     an 'upto' may follow. We skip over an 'exact' item, and then test the
3766     length of what remains before proceeding. */
3767    
3768 nigel 77 if (possessive_quantifier)
3769     {
3770 nigel 93 int len;
3771     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3772     *tempcode == OP_NOTEXACT)
3773     tempcode += _pcre_OP_lengths[*tempcode];
3774     len = code - tempcode;
3775     if (len > 0) switch (*tempcode)
3776     {
3777     case OP_STAR: *tempcode = OP_POSSTAR; break;
3778     case OP_PLUS: *tempcode = OP_POSPLUS; break;
3779     case OP_QUERY: *tempcode = OP_POSQUERY; break;
3780     case OP_UPTO: *tempcode = OP_POSUPTO; break;
3781    
3782     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3783     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3784     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3785     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3786    
3787     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3788     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3789     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3790     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3791    
3792     default:
3793     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3794     code += 1 + LINK_SIZE;
3795     len += 1 + LINK_SIZE;
3796     tempcode[0] = OP_ONCE;
3797     *code++ = OP_KET;
3798     PUTINC(code, 0, len);
3799     PUT(tempcode, 1, len);
3800     break;
3801     }
3802 nigel 77 }
3803    
3804     /* In all cases we no longer have a previous item. We also set the
3805     "follows varying string" flag for subsequently encountered reqbytes if
3806     it isn't already set and we have just passed a varying length item. */
3807    
3808     END_REPEAT:
3809     previous = NULL;
3810     cd->req_varyopt |= reqvary;
3811     break;
3812    
3813    
3814 nigel 93 /* ===================================================================*/
3815     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3816     lookbehind or option setting or condition or all the other extended
3817     parenthesis forms. First deal with the specials; all are introduced by ?,
3818     and the appearance of any of them means that this is not a capturing
3819     group. */
3820 nigel 77
3821     case '(':
3822     newoptions = options;
3823     skipbytes = 0;
3824 nigel 93 bravalue = OP_CBRA;
3825     save_hwm = cd->hwm;
3826 ph10 180 reset_bracount = FALSE;
3827 nigel 77
3828     if (*(++ptr) == '?')
3829     {
3830 nigel 93 int i, set, unset, namelen;
3831 nigel 77 int *optset;
3832 nigel 93 const uschar *name;
3833     uschar *slot;
3834 nigel 77
3835     switch (*(++ptr))
3836     {
3837     case '#': /* Comment; skip to ket */
3838     ptr++;
3839 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
3840     if (*ptr == 0)
3841     {
3842     *errorcodeptr = ERR18;
3843     goto FAILED;
3844     }
3845 nigel 77 continue;
3846    
3847 nigel 93
3848     /* ------------------------------------------------------------ */
3849 ph10 175 case '|': /* Reset capture count for each branch */
3850     reset_bracount = TRUE;
3851 ph10 180 /* Fall through */
3852 ph10 175
3853     /* ------------------------------------------------------------ */
3854 nigel 93 case ':': /* Non-capturing bracket */
3855 nigel 77 bravalue = OP_BRA;
3856     ptr++;
3857     break;
3858    
3859 nigel 93
3860     /* ------------------------------------------------------------ */
3861 nigel 77 case '(':
3862     bravalue = OP_COND; /* Conditional group */
3863    
3864 nigel 93 /* A condition can be an assertion, a number (referring to a numbered
3865     group), a name (referring to a named group), or 'R', referring to
3866     recursion. R<digits> and R&name are also permitted for recursion tests.
3867 nigel 77
3868 nigel 93 There are several syntaxes for testing a named group: (?(name)) is used
3869     by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3870    
3871     There are two unfortunate ambiguities, caused by history. (a) 'R' can
3872     be the recursive thing or the name 'R' (and similarly for 'R' followed
3873     by digits), and (b) a number could be a name that consists of digits.
3874     In both cases, we look for a name first; if not found, we try the other
3875     cases. */
3876    
3877     /* For conditions that are assertions, check the syntax, and then exit
3878     the switch. This will take control down to where bracketed groups,
3879     including assertions, are processed. */
3880    
3881     if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3882     break;
3883    
3884     /* Most other conditions use OP_CREF (a couple change to OP_RREF
3885     below), and all need to skip 3 bytes at the start of the group. */
3886    
3887     code[1+LINK_SIZE] = OP_CREF;
3888     skipbytes = 3;
3889 ph10 172 refsign = -1;
3890 nigel 93
3891     /* Check for a test for recursion in a named group. */
3892    
3893     if (ptr[1] == 'R' && ptr[2] == '&')
3894 nigel 77 {
3895 nigel 93 terminator = -1;
3896     ptr += 2;
3897     code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3898     }
3899 nigel 91
3900 nigel 93 /* Check for a test for a named group's having been set, using the Perl
3901     syntax (?(<name>) or (?('name') */
3902 nigel 91
3903 nigel 93 else if (ptr[1] == '<')
3904     {
3905     terminator = '>';
3906     ptr++;
3907     }
3908     else if (ptr[1] == '\'')
3909     {
3910     terminator = '\'';
3911     ptr++;
3912     }
3913 ph10 172 else
3914 ph10 167 {
3915     terminator = 0;
3916 ph10 172 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
3917     }
3918 nigel 77
3919 nigel 93 /* We now expect to read a name; anything else is an error */
3920 nigel 77
3921 nigel 93 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3922     {
3923     ptr += 1; /* To get the right offset */
3924     *errorcodeptr = ERR28;
3925     goto FAILED;
3926     }
3927    
3928     /* Read the name, but also get it as a number if it's all digits */
3929    
3930     recno = 0;
3931     name = ++ptr;
3932     while ((cd->ctypes[*ptr] & ctype_word) != 0)
3933     {
3934     if (recno >= 0)
3935     recno = ((digitab[*ptr] & ctype_digit) != 0)?
3936     recno * 10 + *ptr - '0' : -1;
3937 nigel 91 ptr++;
3938 nigel 93 }
3939     namelen = ptr - name;
3940 nigel 91
3941 nigel 93 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3942     {
3943     ptr--; /* Error offset */
3944     *errorcodeptr = ERR26;
3945     goto FAILED;
3946     }
3947 nigel 91
3948 nigel 93 /* Do no further checking in the pre-compile phase. */
3949 nigel 91
3950 nigel 93 if (lengthptr != NULL) break;
3951 nigel 91
3952 nigel 93 /* In the real compile we do the work of looking for the actual
3953 ph10 167 reference. If the string started with "+" or "-" we require the rest to
3954     be digits, in which case recno will be set. */
3955 ph10 172
3956 ph10 167 if (refsign > 0)
3957     {
3958     if (recno <= 0)
3959     {
3960     *errorcodeptr = ERR58;
3961     goto FAILED;
3962 ph10 172 }
3963 ph10 167 if (refsign == '-')
3964     {
3965 ph10 172 recno = cd->bracount - recno + 1;
3966 ph10 167 if (recno <= 0)
3967     {
3968     *errorcodeptr = ERR15;
3969     goto FAILED;
3970 ph10 172 }
3971 ph10 167 }
3972 ph10 172 else recno += cd->bracount;
3973 ph10 167 PUT2(code, 2+LINK_SIZE, recno);
3974     break;
3975 ph10 172 }
3976 nigel 91
3977 ph10 167 /* Otherwise (did not start with "+" or "-"), start by looking for the
3978     name. */
3979 ph10 172
3980 nigel 93 slot = cd->name_table;
3981     for (i = 0; i < cd->names_found; i++)
3982     {
3983     if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3984     slot += cd->name_entry_size;
3985     }
3986 nigel 91
3987 nigel 93 /* Found a previous named subpattern */
3988 nigel 91
3989 nigel 93 if (i < cd->names_found)
3990     {
3991     recno = GET2(slot, 0);
3992     PUT2(code, 2+LINK_SIZE, recno);
3993     }
3994 nigel 91
3995 nigel 93 /* Search the pattern for a forward reference */
3996 nigel 91
3997 nigel 93 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3998     (options & PCRE_EXTENDED) != 0)) > 0)
3999     {
4000     PUT2(code, 2+LINK_SIZE, i);
4001     }
4002 nigel 91
4003 nigel 93 /* If terminator == 0 it means that the name followed directly after
4004     the opening parenthesis [e.g. (?(abc)...] and in this case there are
4005     some further alternatives to try. For the cases where terminator != 0
4006     [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4007     now checked all the possibilities, so give an error. */
4008 nigel 91
4009 nigel 93 else if (terminator != 0)
4010     {
4011     *errorcodeptr = ERR15;
4012     goto FAILED;
4013     }
4014    
4015     /* Check for (?(R) for recursion. Allow digits after R to specify a
4016     specific group number. */
4017    
4018     else if (*name == 'R')
4019     {
4020     recno = 0;
4021     for (i = 1; i < namelen; i++)
4022 nigel 91 {
4023 nigel 93 if ((digitab[name[i]] & ctype_digit) == 0)
4024     {
4025     *errorcodeptr = ERR15;
4026     goto FAILED;
4027     }
4028     recno = recno * 10 + name[i] - '0';
4029 nigel 77 }
4030 nigel 93 if (recno == 0) recno = RREF_ANY;
4031     code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4032     PUT2(code, 2+LINK_SIZE, recno);
4033 nigel 77 }
4034 nigel 91
4035 nigel 93 /* Similarly, check for the (?(DEFINE) "condition", which is always
4036     false. */
4037 nigel 91
4038 nigel 93 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4039     {
4040     code[1+LINK_SIZE] = OP_DEF;
4041     skipbytes = 1;
4042     }
4043    
4044     /* Check for the "name" actually being a subpattern number. */
4045    
4046     else if (recno > 0)
4047     {
4048     PUT2(code, 2+LINK_SIZE, recno);
4049     }
4050    
4051     /* Either an unidentified subpattern, or a reference to (?(0) */
4052    
4053     else
4054     {
4055     *errorcodeptr = (recno == 0)? ERR35: ERR15;
4056     goto FAILED;
4057     }
4058 nigel 77 break;
4059    
4060 nigel 93
4061     /* ------------------------------------------------------------ */
4062 nigel 77 case '=': /* Positive lookahead */
4063     bravalue = OP_ASSERT;
4064     ptr++;
4065     break;
4066    
4067 nigel 93
4068     /* ------------------------------------------------------------ */
4069 nigel 77 case '!': /* Negative lookahead */
4070     bravalue = OP_ASSERT_NOT;
4071     ptr++;
4072     break;
4073    
4074 nigel 93
4075     /* ------------------------------------------------------------ */
4076     case '<': /* Lookbehind or named define */
4077     switch (ptr[1])
4078 nigel 77 {
4079     case '=': /* Positive lookbehind */
4080     bravalue = OP_ASSERTBACK;
4081 nigel 93 ptr += 2;
4082 nigel 77 break;
4083    
4084     case '!': /* Negative lookbehind */
4085     bravalue = OP_ASSERTBACK_NOT;
4086 nigel 93 ptr += 2;
4087 nigel 77 break;
4088 nigel 93
4089     default: /* Could be name define, else bad */
4090     if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4091     ptr++; /* Correct offset for error */
4092     *errorcodeptr = ERR24;
4093     goto FAILED;
4094 nigel 77 }
4095     break;
4096    
4097 nigel 93
4098     /* ------------------------------------------------------------ */
4099 nigel 77 case '>': /* One-time brackets */
4100     bravalue = OP_ONCE;
4101     ptr++;
4102     break;
4103    
4104 nigel 93
4105     /* ------------------------------------------------------------ */
4106 nigel 77 case 'C': /* Callout - may be followed by digits; */
4107     previous_callout = code; /* Save for later completion */
4108     after_manual_callout = 1; /* Skip one item before completing */
4109 nigel 93 *code++ = OP_CALLOUT;
4110     {
4111 nigel 77 int n = 0;
4112     while ((digitab[*(++ptr)] & ctype_digit) != 0)
4113     n = n * 10 + *ptr - '0';
4114 nigel 93 if (*ptr != ')')
4115     {
4116     *errorcodeptr = ERR39;
4117     goto FAILED;
4118     }
4119 nigel 77 if (n > 255)
4120     {
4121     *errorcodeptr = ERR38;
4122     goto FAILED;
4123     }
4124     *code++ = n;
4125     PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4126     PUT(code, LINK_SIZE, 0); /* Default length */
4127     code += 2 * LINK_SIZE;
4128     }
4129     previous = NULL;
4130     continue;
4131    
4132 nigel 93
4133     /* ------------------------------------------------------------ */
4134     case 'P': /* Python-style named subpattern handling */
4135     if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4136 nigel 77 {
4137 nigel 93 is_recurse = *ptr == '>';
4138     terminator = ')';
4139     goto NAMED_REF_OR_RECURSE;
4140     }
4141     else if (*ptr != '<') /* Test for Python-style definition */
4142     {
4143     *errorcodeptr = ERR41;
4144     goto FAILED;
4145     }
4146     /* Fall through to handle (?P< as (?< is handled */
4147 nigel 77
4148    
4149 nigel 93 /* ------------------------------------------------------------ */
4150     DEFINE_NAME: /* Come here from (?< handling */
4151 &nb