/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 269 - (hide annotations) (download)
Fri Nov 16 16:22:24 2007 UTC (6 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 195027 byte(s)
Improve error messages for (?+-a) and (?-+a).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
64 ph10 178 /* Macro for setting individual bits in class bitmaps. */
65    
/* Set bit number b in the byte-array bitmap a. Both arguments and the whole
expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 1), index and shift by the intended value, and so that the
macro can be used safely inside a larger expression. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. It is set 20 below INT_MAX
to leave headroom for adding in group terminating bytes, so that those
additions do not each need their own overflow check. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is room for 1024 such entries. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
/* ASCII table. check_escape() indexes it with (c - '0'), so the first entry
is for '0' and the last for 'z'. A negative entry is minus an ESC_xxx code, a
positive entry is the data character to use (for example 7 for \a), and zero
sends the character on to the switch in check_escape(). */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC table. check_escape() indexes it with (c - 0x48), so each row of
eight entries starts at the code point shown (in hexadecimal) in the comment at
the left of the row. Entries have the same meaning as in the ASCII table:
negative for minus an ESC_xxx code, positive for a data character, zero for
"needs further processing or is invalid". */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
/* One triple per class, in the same order as posix_names: base map offset,
second map offset (or -1 for none), and the add/subtract-plus-tweak value
described above. */
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha - word minus digit, underscore removed */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
204     cbit_space, -1, 1, /* blank - a GNU extension; space minus vertical space */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written; XSTRING(s) lets the preprocessor expand s first, so
that XSTRING(MAX_NAME_SIZE) yields the macro's value as text rather than the
text "MAX_NAME_SIZE". This is used to build numbers into the error messages
below. */
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
/* All the messages, concatenated. Each message except the last is followed by
an explicit \0; the last relies on the terminator of the string literal itself.
The message for error number n is the n-th one, counting from zero, which is
how find_error_text() locates it. The numeric comments mark every fifth
message to make counting easier. */
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 269 "subpattern name expected after (?&\0"
305     "digit expected after (?+";
306 nigel 77
307    
308     /* Table to identify digits and hex digits. This is used when compiling
309     patterns. Note that the tables in chartables are dependent on the locale, and
310     may mark arbitrary characters as digits - but the PCRE compiling code expects
311     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312     a private table here. It costs 256 bytes, but it is a lot faster than doing
313     character value tests (at least in some simple cases I timed), and in some
314     applications one wants PCRE to compile efficiently as well as match
315     efficiently.
316    
317     For convenience, we use the same bit definitions as in chartables:
318    
319     0x04 decimal digit
320     0x08 hexadecimal digit
321    
322     Then we can use ctype_digit and ctype_xdigit in the code. */
323    
324 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII version, indexed by character code 0-255. The digits '0'-'9' are
marked 0x0c (both decimal and hexadecimal), the letters 'A'-'F' and 'a'-'f' are
marked 0x08 (hexadecimal only), and every other entry is zero. */
325 nigel 77 static const unsigned char digitab[] =
326     {
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359    
360 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC version, indexed by character code 0-255. The hexadecimal letters
are marked 0x08 in the rows starting at 0x80 (lower case) and 0xC0 (upper
case), and the digits are marked 0x0c in the rows starting at 0xF0. The
rightmost column of the comments gives the hexadecimal code of the first entry
of every second row. */
361 nigel 77 static const unsigned char digitab[] =
362     {
363     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395    
/* EBCDIC character-type table, indexed by character code 0-255. In this file
check_escape() tests the bits 0x0E of an entry to decide whether the character
after a backslash is alphameric. NOTE(review): the meaning of the individual
bits is presumably the same as the ctype_xxx bits of the main character
tables, of which the original comment says this is a partial duplicate -
confirm against pcre_internal.h. */
396     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429     #endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
/* Forward declaration only; the parameters are unnamed here. NOTE(review):
the definition, and the function it is mutually recursive with, are not in this
part of the file - take the meaning of each parameter from the header comment
of the definition. */
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
502     a table. A non-zero result is something that can be returned immediately.
503     Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
507     else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
511     else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
535     is an absolute backreference. If negative, it is a relative backreference.
536 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537     reference to a named group. This is part of Perl's movement towards a
538     unified syntax for back references. As this is synonymous with \k{name}, we
539 ph10 171 fudge it up by pretending it really was \k. */
540 nigel 93
541     case 'g':
542     if (ptr[1] == '{')
543     {
544 ph10 171 const uschar *p;
545     for (p = ptr+2; *p != 0 && *p != '}'; p++)
546     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 ph10 172 if (*p != 0 && *p != '}')
548 ph10 171 {
549     c = -ESC_k;
550     break;
551 ph10 172 }
552 nigel 93 braced = TRUE;
553     ptr++;
554     }
555     else braced = FALSE;
556    
557     if (ptr[1] == '-')
558     {
559     negated = TRUE;
560     ptr++;
561     }
562     else negated = FALSE;
563    
564     c = 0;
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567 ph10 220
568 ph10 213 if (c < 0)
569     {
570     *errorcodeptr = ERR61;
571     break;
572 ph10 220 }
573 nigel 93
574     if (c == 0 || (braced && *(++ptr) != '}'))
575     {
576     *errorcodeptr = ERR57;
577 ph10 213 break;
578 nigel 93 }
579    
580     if (negated)
581     {
582     if (c > bracount)
583     {
584     *errorcodeptr = ERR15;
585 ph10 213 break;
586 nigel 93 }
587     c = bracount - (c - 1);
588     }
589    
590     c = -(ESC_REF + c);
591     break;
592    
593 nigel 77 /* The handling of escape sequences consisting of a string of digits
594     starting with one that is not zero is not straightforward. By experiment,
595     the way Perl works seems to be as follows:
596    
597     Outside a character class, the digits are read as a decimal number. If the
598     number is less than 10, or if there are that many previous extracting
599     left brackets, then it is a back reference. Otherwise, up to three octal
600     digits are read to form an escaped byte. Thus \123 is likely to be octal
601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602     value is greater than 377, the least significant 8 bits are taken. Inside a
603     character class, \ followed by a digit is always an octal number. */
604    
605     case '1': case '2': case '3': case '4': case '5':
606     case '6': case '7': case '8': case '9':
607    
608     if (!isclass)
609     {
610     oldptr = ptr;
611     c -= '0';
612     while ((digitab[ptr[1]] & ctype_digit) != 0)
613     c = c * 10 + *(++ptr) - '0';
614 ph10 213 if (c < 0)
615     {
616     *errorcodeptr = ERR61;
617 ph10 220 break;
618     }
619 nigel 77 if (c < 10 || c <= bracount)
620     {
621     c = -(ESC_REF + c);
622     break;
623     }
624     ptr = oldptr; /* Put the pointer back and fall through */
625     }
626    
627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628     generates a binary zero byte and treats the digit as a following literal.
629     Thus we have to pull back the pointer by one. */
630    
631     if ((c = *ptr) >= '8')
632     {
633     ptr--;
634     c = 0;
635     break;
636     }
637    
638     /* \0 always starts an octal number, but we may drop through to here with a
639 nigel 91 larger first octal digit. The original code used just to take the least
640     significant 8 bits of octal numbers (I think this is what early Perls used
641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642     than 3 octal digits. */
643 nigel 77
644     case '0':
645     c -= '0';
646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647     c = c * 8 + *(++ptr) - '0';
648 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 nigel 77 break;
650    
651 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653     treated as a data character. */
654 nigel 77
655     case 'x':
656 nigel 87 if (ptr[1] == '{')
657 nigel 77 {
658     const uschar *pt = ptr + 2;
659 nigel 87 int count = 0;
660    
661 nigel 77 c = 0;
662     while ((digitab[*pt] & ctype_xdigit) != 0)
663     {
664 nigel 87 register int cc = *pt++;
665     if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 nigel 77 count++;
667 nigel 87
668 ph10 97 #ifndef EBCDIC /* ASCII coding */
669 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 ph10 97 #else /* EBCDIC coding */
672 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 nigel 77 #endif
675     }
676 nigel 87
677 nigel 77 if (*pt == '}')
678     {
679 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 nigel 77 ptr = pt;
681     break;
682     }
683 nigel 87
684 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
685     recognize this construct; fall through to the normal \x handling. */
686     }
687    
688 nigel 87 /* Read just a single-byte hex-defined char */
689 nigel 77
690     c = 0;
691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692     {
693     int cc; /* Some compilers don't like ++ */
694     cc = *(++ptr); /* in initializers */
695 ph10 97 #ifndef EBCDIC /* ASCII coding */
696 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 ph10 97 #else /* EBCDIC coding */
699 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
700     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701     #endif
702     }
703     break;
704    
705 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706     This coding is ASCII-specific, but then the whole concept of \cx is
707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708 nigel 77
709     case 'c':
710     c = *(++ptr);
711     if (c == 0)
712     {
713     *errorcodeptr = ERR2;
714 ph10 213 break;
715 nigel 77 }
716    
717 ph10 97 #ifndef EBCDIC /* ASCII coding */
718 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
719     c ^= 0x40;
720 ph10 97 #else /* EBCDIC coding */
721 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
722     c ^= 0xC0;
723     #endif
724     break;
725    
726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
728     for Perl compatibility, it is a literal. This code looks a bit odd, but
729     there used to be some cases other than the default, and there may be again
730     in future, so I haven't "optimized" it. */
731    
732     default:
733     if ((options & PCRE_EXTRA) != 0) switch(c)
734     {
735     default:
736     *errorcodeptr = ERR3;
737     break;
738     }
739     break;
740     }
741     }
742    
743     *ptrptr = ptr;
744     return c;
745     }
746    
747    
748    
749     #ifdef SUPPORT_UCP
750     /*************************************************
751     * Handle \P and \p *
752     *************************************************/
753    
754     /* This function is called after \P or \p has been encountered, provided that
755     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756     pointing at the P or p. On exit, it is pointing at the final character of the
757     escape sequence.
758    
759     Argument:
760     ptrptr points to the pattern position pointer
761     negptr points to a boolean that is set TRUE for negation else FALSE
762 nigel 87 dptr points to an int that is set to the detailed property value
763 nigel 77 errorcodeptr points to the error code variable
764    
765 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
766 nigel 77 */
767    
768     static int
769 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
770 nigel 77 {
771     int c, i, bot, top;
772     const uschar *ptr = *ptrptr;
773 nigel 87 char name[32];
774 nigel 77
775     c = *(++ptr);
776     if (c == 0) goto ERROR_RETURN;
777    
778     *negptr = FALSE;
779    
780 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
781     negation. */
782 nigel 77
783     if (c == '{')
784     {
785     if (ptr[1] == '^')
786     {
787     *negptr = TRUE;
788     ptr++;
789     }
790 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
791 nigel 77 {
792     c = *(++ptr);
793     if (c == 0) goto ERROR_RETURN;
794     if (c == '}') break;
795     name[i] = c;
796     }
797 nigel 87 if (c !='}') goto ERROR_RETURN;
798 nigel 77 name[i] = 0;
799     }
800    
801     /* Otherwise there is just one following character */
802    
803     else
804     {
805     name[0] = c;
806     name[1] = 0;
807     }
808    
809     *ptrptr = ptr;
810    
811     /* Search for a recognized property name using binary chop */
812    
813     bot = 0;
814     top = _pcre_utt_size;
815    
816     while (bot < top)
817     {
818 nigel 87 i = (bot + top) >> 1;
819 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
820 nigel 87 if (c == 0)
821     {
822     *dptr = _pcre_utt[i].value;
823     return _pcre_utt[i].type;
824     }
825 nigel 77 if (c > 0) bot = i + 1; else top = i;
826     }
827    
828     *errorcodeptr = ERR47;
829     *ptrptr = ptr;
830     return -1;
831    
832     ERROR_RETURN:
833     *errorcodeptr = ERR46;
834     *ptrptr = ptr;
835     return -1;
836     }
837     #endif
838    
839    
840    
841    
842     /*************************************************
843     * Check for counted repeat *
844     *************************************************/
845    
846     /* This function is called when a '{' is encountered in a place where it might
847     start a quantifier. It looks ahead to see if it really is a quantifier or not.
848     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849     where the ddds are digits.
850    
851     Arguments:
852     p pointer to the first char after '{'
853    
854     Returns: TRUE or FALSE
855     */
856    
857     static BOOL
858     is_counted_repeat(const uschar *p)
859     {
860     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861     while ((digitab[*p] & ctype_digit) != 0) p++;
862     if (*p == '}') return TRUE;
863    
864     if (*p++ != ',') return FALSE;
865     if (*p == '}') return TRUE;
866    
867     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868     while ((digitab[*p] & ctype_digit) != 0) p++;
869    
870     return (*p == '}');
871     }
872    
873    
874    
875     /*************************************************
876     * Read repeat counts *
877     *************************************************/
878    
879     /* Read an item of the form {n,m} and return the values. This is called only
880     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881     so the syntax is guaranteed to be correct, but we need to check the values.
882    
883     Arguments:
884     p pointer to first char after '{'
885     minp pointer to int for min
886     maxp pointer to int for max
887     returned as -1 if no max
888     errorcodeptr points to error code variable
889    
890     Returns: pointer to '}' on success;
891     current ptr on error, with errorcodeptr set non-zero
892     */
893    
894     static const uschar *
895     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896     {
897     int min = 0;
898     int max = -1;
899    
900 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
901     an integer overflow. */
902    
903 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 nigel 81 if (min < 0 || min > 65535)
905     {
906     *errorcodeptr = ERR5;
907     return p;
908     }
909 nigel 77
910 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
911     Also, max must not be less than min. */
912    
913 nigel 77 if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 81 if (max < 0 || max > 65535)
920     {
921     *errorcodeptr = ERR5;
922     return p;
923     }
924 nigel 77 if (max < min)
925     {
926     *errorcodeptr = ERR4;
927     return p;
928     }
929     }
930     }
931    
932 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
933     '}'. */
934 nigel 77
935 nigel 81 *minp = min;
936     *maxp = max;
937 nigel 77 return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 93 * Find forward referenced subpattern *
944 nigel 91 *************************************************/
945    
946 nigel 93 /* This function scans along a pattern's text looking for capturing
947     subpatterns, and counting them. If it finds a named pattern that matches the
948     name it is given, it returns its number. Alternatively, if the name is NULL, it
949     returns when it reaches a given numbered subpattern. This is used for forward
950     references to subpatterns. We know that if (?P< is encountered, the name will
951     be terminated by '>' because that is checked in the first pass.
952 nigel 91
953     Arguments:
954 nigel 93 ptr current position in the pattern
955     count current count of capturing parens so far encountered
956     name name to seek, or NULL if seeking a numbered subpattern
957     lorn name length, or subpattern number if name is NULL
958     xmode TRUE if we are in /x mode
959 nigel 91
960     Returns: the number of the named subpattern, or -1 if not found
961     */
962    
963     static int
964 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965     BOOL xmode)
966 nigel 91 {
967     const uschar *thisname;
968 nigel 93
969 nigel 91 for (; *ptr != 0; ptr++)
970     {
971 nigel 93 int term;
972    
973     /* Skip over backslashed characters and also entire \Q...\E */
974    
975     if (*ptr == '\\')
976     {
977     if (*(++ptr) == 0) return -1;
978     if (*ptr == 'Q') for (;;)
979     {
980     while (*(++ptr) != 0 && *ptr != '\\');
981     if (*ptr == 0) return -1;
982     if (*(++ptr) == 'E') break;
983     }
984     continue;
985     }
986    
987     /* Skip over character classes */
988    
989     if (*ptr == '[')
990     {
991     while (*(++ptr) != ']')
992     {
993 ph10 220 if (*ptr == 0) return -1;
994 nigel 93 if (*ptr == '\\')
995     {
996     if (*(++ptr) == 0) return -1;
997     if (*ptr == 'Q') for (;;)
998     {
999     while (*(++ptr) != 0 && *ptr != '\\');
1000     if (*ptr == 0) return -1;
1001     if (*(++ptr) == 'E') break;
1002     }
1003     continue;
1004     }
1005     }
1006     continue;
1007     }
1008    
1009     /* Skip comments in /x mode */
1010    
1011     if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017    
1018     /* An opening parens must now be a real metacharacter */
1019    
1020 nigel 91 if (*ptr != '(') continue;
1021 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1022 nigel 93 {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027    
1028     ptr += 2;
1029     if (*ptr == 'P') ptr++; /* Allow optional P */
1030    
1031     /* We have to disambiguate (?<! and (?<= from (?<name> */
1032    
1033     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034     *ptr != '\'')
1035     continue;
1036    
1037 nigel 91 count++;
1038 nigel 93
1039     if (name == NULL && count == lorn) return count;
1040     term = *ptr++;
1041     if (term == '<') term = '>';
1042 nigel 91 thisname = ptr;
1043 nigel 93 while (*ptr != term) ptr++;
1044     if (name != NULL && lorn == ptr - thisname &&
1045     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 nigel 91 return count;
1047     }
1048 nigel 93
1049 nigel 91 return -1;
1050     }
1051    
1052    
1053    
1054     /*************************************************
1055 nigel 77 * Find first significant op code *
1056     *************************************************/
1057    
1058     /* This is called by several functions that scan a compiled expression looking
1059     for a fixed first character, or an anchoring op code etc. It skips over things
1060     that do not influence this. For some calls, a change of option is important.
1061     For some calls, it makes sense to skip negative forward and all backward
1062     assertions, and also the \b assertion; for others it does not.
1063    
1064     Arguments:
1065     code pointer to the start of the group
1066     options pointer to external options
1067     optbit the option bit whose changing is significant, or
1068     zero if none are
1069     skipassert TRUE if certain assertions are to be skipped
1070    
1071     Returns: pointer to the first significant opcode
1072     */
1073    
1074     static const uschar*
1075     first_significant_code(const uschar *code, int *options, int optbit,
1076     BOOL skipassert)
1077     {
1078     for (;;)
1079     {
1080     switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084     *options = (int)code[1];
1085     code += 2;
1086     break;
1087    
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095    
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100    
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103 nigel 93 case OP_RREF:
1104     case OP_DEF:
1105 nigel 77 code += _pcre_OP_lengths[*code];
1106     break;
1107    
1108     default:
1109     return code;
1110     }
1111     }
1112     /* Control never reaches here */
1113     }
1114    
1115    
1116    
1117    
1118     /*************************************************
1119     * Find the fixed length of a pattern *
1120     *************************************************/
1121    
1122     /* Scan a pattern and compute the fixed length of subject that will match it,
1123     if the length is fixed. This is needed for dealing with backward assertions.
1124     In UTF8 mode, the result is in characters rather than bytes.
1125    
1126     Arguments:
1127     code points to the start of the pattern (the bracket)
1128     options the compiling options
1129    
1130     Returns: the fixed length, or -1 if there is no fixed length,
1131     or -2 if \C was encountered
1132     */
1133    
1134     static int
1135     find_fixedlength(uschar *code, int options)
1136     {
1137     int length = -1;
1138    
1139     register int branchlength = 0;
1140     register uschar *cc = code + 1 + LINK_SIZE;
1141    
1142     /* Scan along the opcodes for this branch. If we get to the end of the
1143     branch, check the length against that of the other branches. */
1144    
1145     for (;;)
1146     {
1147     int d;
1148     register int op = *cc;
1149     switch (op)
1150     {
1151 nigel 93 case OP_CBRA:
1152 nigel 77 case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 nigel 77 if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161    
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165    
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172     else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177    
1178     /* Skip over assertive subpatterns */
1179    
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186    
1187     /* Skip over things that don't match chars */
1188    
1189     case OP_REVERSE:
1190     case OP_CREF:
1191 nigel 93 case OP_RREF:
1192     case OP_DEF:
1193 nigel 77 case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205    
1206     /* Handle literal characters */
1207    
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210 nigel 91 case OP_NOT:
1211 nigel 77 branchlength++;
1212     cc += 2;
1213     #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215     {
1216     while ((*cc & 0xc0) == 0x80) cc++;
1217     }
1218     #endif
1219     break;
1220    
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode. */
1223    
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227     #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229     {
1230     while((*cc & 0x80) == 0x80) cc++;
1231     }
1232     #endif
1233     break;
1234    
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 nigel 77 cc += 4;
1239     break;
1240    
1241     /* Handle single-char matchers */
1242    
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245 nigel 87 cc += 2;
1246 nigel 77 /* Fall through */
1247    
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258    
1259     /* The single-byte matcher isn't allowed */
1260    
1261     case OP_ANYBYTE:
1262     return -2;
1263    
1264     /* Check a class for variable quantification */
1265    
1266     #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270     #endif
1271    
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275    
1276     switch (*cc)
1277     {
1278     case OP_CRSTAR:
1279     case OP_CRMINSTAR:
1280     case OP_CRQUERY:
1281     case OP_CRMINQUERY:
1282     return -1;
1283    
1284     case OP_CRRANGE:
1285     case OP_CRMINRANGE:
1286     if (GET2(cc,1) != GET2(cc,3)) return -1;
1287     branchlength += GET2(cc,1);
1288     cc += 5;
1289     break;
1290    
1291     default:
1292     branchlength++;
1293     }
1294     break;
1295    
1296     /* Anything else is variable length */
1297    
1298     default:
1299     return -1;
1300     }
1301     }
1302     /* Control never gets here */
1303     }
1304    
1305    
1306    
1307    
1308     /*************************************************
1309     * Scan compiled regex for numbered bracket *
1310     *************************************************/
1311    
1312     /* This little function scans through a compiled pattern until it finds a
1313     capturing bracket with the given number.
1314    
1315     Arguments:
1316     code points to start of expression
1317     utf8 TRUE in UTF-8 mode
1318     number the required bracket number
1319    
1320     Returns: pointer to the opcode for the bracket, or NULL if not found
1321     */
1322    
1323     static const uschar *
1324     find_bracket(const uschar *code, BOOL utf8, int number)
1325     {
1326     for (;;)
1327     {
1328     register int c = *code;
1329     if (c == OP_END) return NULL;
1330 nigel 91
1331     /* XCLASS is used for classes that cannot be represented just by a bit
1332     map. This includes negated single high-valued characters. The length in
1333     the table is zero; the actual length is stored in the compiled code. */
1334    
1335     if (c == OP_XCLASS) code += GET(code, 1);
1336    
1337 nigel 93 /* Handle capturing bracket */
1338 nigel 91
1339 nigel 93 else if (c == OP_CBRA)
1340 nigel 77 {
1341 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1342 nigel 77 if (n == number) return (uschar *)code;
1343 nigel 93 code += _pcre_OP_lengths[c];
1344 nigel 77 }
1345 nigel 91
1346 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1347     repeated character types, we have to test for \p and \P, which have an extra
1348 ph10 218 two bytes of parameters. */
1349 nigel 91
1350 nigel 77 else
1351     {
1352 ph10 218 switch(c)
1353     {
1354     case OP_TYPESTAR:
1355     case OP_TYPEMINSTAR:
1356     case OP_TYPEPLUS:
1357     case OP_TYPEMINPLUS:
1358     case OP_TYPEQUERY:
1359     case OP_TYPEMINQUERY:
1360     case OP_TYPEPOSSTAR:
1361     case OP_TYPEPOSPLUS:
1362     case OP_TYPEPOSQUERY:
1363     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 ph10 220 break;
1365 ph10 221
1366     case OP_TYPEUPTO:
1367     case OP_TYPEMINUPTO:
1368     case OP_TYPEEXACT:
1369     case OP_TYPEPOSUPTO:
1370     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371     break;
1372 ph10 220 }
1373    
1374 ph10 218 /* Add in the fixed length from the table */
1375 ph10 220
1376 nigel 77 code += _pcre_OP_lengths[c];
1377 ph10 220
1378 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379     a multi-byte character. The length in the table is a minimum, so we have to
1380     arrange to skip the extra bytes. */
1381 ph10 220
1382 ph10 107 #ifdef SUPPORT_UTF8
1383 nigel 77 if (utf8) switch(c)
1384     {
1385     case OP_CHAR:
1386     case OP_CHARNC:
1387     case OP_EXACT:
1388     case OP_UPTO:
1389     case OP_MINUPTO:
1390 nigel 93 case OP_POSUPTO:
1391 nigel 77 case OP_STAR:
1392     case OP_MINSTAR:
1393 nigel 93 case OP_POSSTAR:
1394 nigel 77 case OP_PLUS:
1395     case OP_MINPLUS:
1396 nigel 93 case OP_POSPLUS:
1397 nigel 77 case OP_QUERY:
1398     case OP_MINQUERY:
1399 nigel 93 case OP_POSQUERY:
1400     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 nigel 77 break;
1402     }
1403 ph10 111 #endif
1404 nigel 77 }
1405     }
1406     }
1407    
1408    
1409    
1410     /*************************************************
1411     * Scan compiled regex for recursion reference *
1412     *************************************************/
1413    
1414     /* This little function scans through a compiled pattern until it finds an
1415     instance of OP_RECURSE.
1416    
1417     Arguments:
1418     code points to start of expression
1419     utf8 TRUE in UTF-8 mode
1420    
1421     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_recurse(const uschar *code, BOOL utf8)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91 if (c == OP_RECURSE) return code;
1432 ph10 220
1433 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1434     map. This includes negated single high-valued characters. The length in
1435     the table is zero; the actual length is stored in the compiled code. */
1436    
1437     if (c == OP_XCLASS) code += GET(code, 1);
1438    
1439 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1440     repeated character types, we have to test for \p and \P, which have an extra
1441 ph10 218 two bytes of parameters. */
1442 nigel 91
1443 nigel 77 else
1444     {
1445 ph10 218 switch(c)
1446     {
1447     case OP_TYPESTAR:
1448     case OP_TYPEMINSTAR:
1449     case OP_TYPEPLUS:
1450     case OP_TYPEMINPLUS:
1451     case OP_TYPEQUERY:
1452     case OP_TYPEMINQUERY:
1453     case OP_TYPEPOSSTAR:
1454     case OP_TYPEPOSPLUS:
1455     case OP_TYPEPOSQUERY:
1456     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 ph10 220 break;
1458 ph10 221
1459     case OP_TYPEPOSUPTO:
1460     case OP_TYPEUPTO:
1461     case OP_TYPEMINUPTO:
1462     case OP_TYPEEXACT:
1463     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464     break;
1465 ph10 220 }
1466    
1467 ph10 218 /* Add in the fixed length from the table */
1468    
1469 nigel 77 code += _pcre_OP_lengths[c];
1470 ph10 220
1471 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474 ph10 220
1475 ph10 107 #ifdef SUPPORT_UTF8
1476 nigel 77 if (utf8) switch(c)
1477     {
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_EXACT:
1481     case OP_UPTO:
1482     case OP_MINUPTO:
1483 nigel 93 case OP_POSUPTO:
1484 nigel 77 case OP_STAR:
1485     case OP_MINSTAR:
1486 nigel 93 case OP_POSSTAR:
1487 nigel 77 case OP_PLUS:
1488     case OP_MINPLUS:
1489 nigel 93 case OP_POSPLUS:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 nigel 77 break;
1495     }
1496 ph10 111 #endif
1497 nigel 77 }
1498     }
1499     }
1500    
1501    
1502    
1503     /*************************************************
1504     * Scan compiled branch for non-emptiness *
1505     *************************************************/
1506    
1507     /* This function scans through a branch of a compiled pattern to see whether it
1508 nigel 93 can match the empty string or not. It is called from could_be_empty()
1509     below and from compile_branch() when checking for an unlimited repeat of a
1510     group that can match nothing. Note that first_significant_code() skips over
1511     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1512     struck an inner bracket whose current branch will already have been scanned.
1513 nigel 77
1514     Arguments:
1515     code points to start of search
1516     endcode points to where to stop
1517     utf8 TRUE if in UTF8 mode
1518    
1519     Returns: TRUE if what is matched could be empty
1520     */
1521    
1522     static BOOL
1523     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1524     {
1525     register int c;
1526 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1527 nigel 77 code < endcode;
1528     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1529     {
1530     const uschar *ccode;
1531    
1532     c = *code;
1533 ph10 172
1534 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1535 nigel 77
1536 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1537     {
1538 ph10 172 code += _pcre_OP_lengths[c];
1539 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1540     c = *code;
1541     continue;
1542     }
1543    
1544     /* For other groups, scan the branches. */
1545 ph10 172
1546 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1547 nigel 77 {
1548     BOOL empty_branch;
1549     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1550    
1551     /* Scan a closed bracket */
1552    
1553     empty_branch = FALSE;
1554     do
1555     {
1556     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1557     empty_branch = TRUE;
1558     code += GET(code, 1);
1559     }
1560     while (*code == OP_ALT);
1561     if (!empty_branch) return FALSE; /* All branches are non-empty */
1562 ph10 172 c = *code;
1563 nigel 93 continue;
1564 nigel 77 }
1565    
1566 nigel 93 /* Handle the other opcodes */
1567    
1568     switch (c)
1569 nigel 77 {
1570 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1571     cannot be represented just by a bit map. This includes negated single
1572     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1573 ph10 220 actual length is stored in the compiled code, so we must update "code"
1574 ph10 216 here. */
1575 nigel 77
1576     #ifdef SUPPORT_UTF8
1577     case OP_XCLASS:
1578 ph10 216 ccode = code += GET(code, 1);
1579 nigel 77 goto CHECK_CLASS_REPEAT;
1580     #endif
1581    
1582     case OP_CLASS:
1583     case OP_NCLASS:
1584     ccode = code + 33;
1585    
1586     #ifdef SUPPORT_UTF8
1587     CHECK_CLASS_REPEAT:
1588     #endif
1589    
1590     switch (*ccode)
1591     {
1592     case OP_CRSTAR: /* These could be empty; continue */
1593     case OP_CRMINSTAR:
1594     case OP_CRQUERY:
1595     case OP_CRMINQUERY:
1596     break;
1597    
1598     default: /* Non-repeat => class must match */
1599     case OP_CRPLUS: /* These repeats aren't empty */
1600     case OP_CRMINPLUS:
1601     return FALSE;
1602    
1603     case OP_CRRANGE:
1604     case OP_CRMINRANGE:
1605     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1606     break;
1607     }
1608     break;
1609    
1610     /* Opcodes that must match a character */
1611    
1612     case OP_PROP:
1613     case OP_NOTPROP:
1614     case OP_EXTUNI:
1615     case OP_NOT_DIGIT:
1616     case OP_DIGIT:
1617     case OP_NOT_WHITESPACE:
1618     case OP_WHITESPACE:
1619     case OP_NOT_WORDCHAR:
1620     case OP_WORDCHAR:
1621     case OP_ANY:
1622     case OP_ANYBYTE:
1623     case OP_CHAR:
1624     case OP_CHARNC:
1625     case OP_NOT:
1626     case OP_PLUS:
1627     case OP_MINPLUS:
1628 nigel 93 case OP_POSPLUS:
1629 nigel 77 case OP_EXACT:
1630     case OP_NOTPLUS:
1631     case OP_NOTMINPLUS:
1632 nigel 93 case OP_NOTPOSPLUS:
1633 nigel 77 case OP_NOTEXACT:
1634     case OP_TYPEPLUS:
1635     case OP_TYPEMINPLUS:
1636 nigel 93 case OP_TYPEPOSPLUS:
1637 nigel 77 case OP_TYPEEXACT:
1638     return FALSE;
1639 ph10 227
1640     /* These are going to continue, as they may be empty, but we have to
1641     fudge the length for the \p and \P cases. */
1642    
1643 ph10 224 case OP_TYPESTAR:
1644     case OP_TYPEMINSTAR:
1645     case OP_TYPEPOSSTAR:
1646     case OP_TYPEQUERY:
1647     case OP_TYPEMINQUERY:
1648     case OP_TYPEPOSQUERY:
1649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650 ph10 227 break;
1651    
1652 ph10 224 /* Same for these */
1653 ph10 227
1654 ph10 224 case OP_TYPEUPTO:
1655     case OP_TYPEMINUPTO:
1656     case OP_TYPEPOSUPTO:
1657     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1658     break;
1659 nigel 77
1660     /* End of branch */
1661    
1662     case OP_KET:
1663     case OP_KETRMAX:
1664     case OP_KETRMIN:
1665     case OP_ALT:
1666     return TRUE;
1667    
1668 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1669     MINUPTO, and POSUPTO may be followed by a multibyte character */
1670 nigel 77
1671     #ifdef SUPPORT_UTF8
1672     case OP_STAR:
1673     case OP_MINSTAR:
1674 nigel 93 case OP_POSSTAR:
1675 nigel 77 case OP_QUERY:
1676     case OP_MINQUERY:
1677 nigel 93 case OP_POSQUERY:
1678 nigel 77 case OP_UPTO:
1679     case OP_MINUPTO:
1680 nigel 93 case OP_POSUPTO:
1681 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1682     break;
1683     #endif
1684     }
1685     }
1686    
1687     return TRUE;
1688     }
1689    
1690    
1691    
1692     /*************************************************
1693     * Scan compiled regex for non-emptiness *
1694     *************************************************/
1695    
1696     /* This function is called to check for left recursive calls. We want to check
1697     the current branch of the current pattern to see if it could match the empty
1698     string. If it could, we must look outwards for branches at other levels,
1699     stopping when we pass beyond the bracket which is the subject of the recursion.
1700    
1701     Arguments:
1702     code points to start of the recursion
1703     endcode points to where to stop (current RECURSE item)
1704     bcptr points to the chain of current (unclosed) branch starts
1705     utf8 TRUE if in UTF-8 mode
1706    
1707     Returns: TRUE if what is matched could be empty
1708     */
1709    
1710     static BOOL
1711     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1712     BOOL utf8)
1713     {
1714     while (bcptr != NULL && bcptr->current >= code)
1715     {
1716     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1717     bcptr = bcptr->outer;
1718     }
1719     return TRUE;
1720     }
1721    
1722    
1723    
1724     /*************************************************
1725     * Check for POSIX class syntax *
1726     *************************************************/
1727    
1728     /* This function is called when the sequence "[:" or "[." or "[=" is
1729     encountered in a character class. It checks whether this is followed by an
1730     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1731     ".]" or "=]".
1732    
1733     Argument:
1734     ptr pointer to the initial [
1735     endptr where to return the end pointer
1736     cd pointer to compile data
1737    
1738     Returns: TRUE or FALSE
1739     */
1740    
1741     static BOOL
1742     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1743     {
1744     int terminator; /* Don't combine these lines; the Solaris cc */
1745     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1746     if (*(++ptr) == '^') ptr++;
1747     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1748     if (*ptr == terminator && ptr[1] == ']')
1749     {
1750     *endptr = ptr;
1751     return TRUE;
1752     }
1753     return FALSE;
1754     }
1755    
1756    
1757    
1758    
1759     /*************************************************
1760     * Check POSIX class name *
1761     *************************************************/
1762    
1763     /* This function is called to check the name given in a POSIX-style class entry
1764     such as [:alnum:].
1765    
1766     Arguments:
1767     ptr points to the first letter
1768     len the length of the name
1769    
1770     Returns: a value representing the name, or -1 if unknown
1771     */
1772    
1773     static int
1774     check_posix_name(const uschar *ptr, int len)
1775     {
1776 ph10 240 const char *pn = posix_names;
1777 nigel 77 register int yield = 0;
1778     while (posix_name_lengths[yield] != 0)
1779     {
1780     if (len == posix_name_lengths[yield] &&
1781 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1782 ph10 243 pn += posix_name_lengths[yield] + 1;
1783 nigel 77 yield++;
1784     }
1785     return -1;
1786     }
1787    
1788    
1789     /*************************************************
1790     * Adjust OP_RECURSE items in repeated group *
1791     *************************************************/
1792    
1793     /* OP_RECURSE items contain an offset from the start of the regex to the group
1794     that is referenced. This means that groups can be replicated for fixed
1795     repetition simply by copying (because the recursion is allowed to refer to
1796     earlier groups that are outside the current group). However, when a group is
1797     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1798     it, after it has been compiled. This means that any OP_RECURSE items within it
1799     that refer to the group itself or any contained groups have to have their
1800 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1801     the partially compiled regex must be temporarily terminated with OP_END.
1802 nigel 77
1803 nigel 93 This function has been extended with the possibility of forward references for
1804     recursions and subroutine calls. It must also check the list of such references
1805     for the group we are dealing with. If it finds that one of the recursions in
1806     the current group is on this list, it adjusts the offset in the list, not the
1807     value in the reference (which is a group number).
1808    
1809 nigel 77 Arguments:
1810     group points to the start of the group
1811     adjust the amount by which the group is to be moved
1812     utf8 TRUE in UTF-8 mode
1813     cd contains pointers to tables etc.
1814 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1815 nigel 77
1816     Returns: nothing
1817     */
1818    
1819     static void
1820 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1821     uschar *save_hwm)
1822 nigel 77 {
1823     uschar *ptr = group;
1824 ph10 224
1825 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1826     {
1827 nigel 93 int offset;
1828     uschar *hc;
1829    
1830     /* See if this recursion is on the forward reference list. If so, adjust the
1831     reference. */
1832    
1833     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1834     {
1835     offset = GET(hc, 0);
1836     if (cd->start_code + offset == ptr + 1)
1837     {
1838     PUT(hc, 0, offset + adjust);
1839     break;
1840     }
1841     }
1842    
1843     /* Otherwise, adjust the recursion offset if it's after the start of this
1844     group. */
1845    
1846     if (hc >= cd->hwm)
1847     {
1848     offset = GET(ptr, 1);
1849     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1850     }
1851    
1852 nigel 77 ptr += 1 + LINK_SIZE;
1853     }
1854     }
1855    
1856    
1857    
1858     /*************************************************
1859     * Insert an automatic callout point *
1860     *************************************************/
1861    
1862     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1863     callout points before each pattern item.
1864    
1865     Arguments:
1866     code current code pointer
1867     ptr current pattern pointer
1868     cd pointers to tables etc
1869    
1870     Returns: new code pointer
1871     */
1872    
1873     static uschar *
1874     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1875     {
1876     *code++ = OP_CALLOUT;
1877     *code++ = 255;
1878     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1879     PUT(code, LINK_SIZE, 0); /* Default length */
1880     return code + 2*LINK_SIZE;
1881     }
1882    
1883    
1884    
1885     /*************************************************
1886     * Complete a callout item *
1887     *************************************************/
1888    
1889     /* A callout item contains the length of the next item in the pattern, which
1890     we can't fill in till after we have reached the relevant point. This is used
1891     for both automatic and manual callouts.
1892    
1893     Arguments:
1894     previous_callout points to previous callout item
1895     ptr current pattern pointer
1896     cd pointers to tables etc
1897    
1898     Returns: nothing
1899     */
1900    
1901     static void
1902     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1903     {
1904     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1905     PUT(previous_callout, 2 + LINK_SIZE, length);
1906     }
1907    
1908    
1909    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A returned range is a maximal run of consecutive characters
whose other-case values are themselves consecutive.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has another case. */

while (c <= d && (first_other = _pcre_ucp_othercase(c)) == NOTACHAR) c++;

if (c > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the run for as long as the other-case values keep increasing by
one. */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;     /* Where the next call should resume */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1955    
1956    
1957 nigel 93
1958 nigel 77 /*************************************************
1959 nigel 93 * Check if auto-possessifying is possible *
1960     *************************************************/
1961    
1962     /* This function is called for unlimited repeats of certain items, to see
1963     whether the next thing could possibly match the repeated item. If not, it makes
1964     sense to automatically possessify the repeated item.
1965    
1966     Arguments:
1967     op_code the repeated op code
1968     this data for this item, depends on the opcode
1969     utf8 TRUE in UTF-8 mode
1970     utf8_char used for utf8 character bytes, NULL if not relevant
1971     ptr next character in pattern
1972     options options bits
1973     cd contains pointers to tables etc.
1974    
1975     Returns: TRUE if possessifying is wanted
1976     */
1977    
1978     static BOOL
1979     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1980     const uschar *ptr, int options, compile_data *cd)
1981     {
1982     int next;
1983    
1984     /* Skip whitespace and comments in extended mode */
1985    
1986     if ((options & PCRE_EXTENDED) != 0)
1987     {
1988     for (;;)
1989     {
1990     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1991     if (*ptr == '#')
1992     {
1993     while (*(++ptr) != 0)
1994     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1995     }
1996     else break;
1997     }
1998     }
1999    
2000     /* If the next item is one that we can handle, get its value. A non-negative
2001     value is a character, a negative value is an escape value. */
2002    
2003     if (*ptr == '\\')
2004     {
2005     int temperrorcode = 0;
2006     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2007     if (temperrorcode != 0) return FALSE;
2008     ptr++; /* Point after the escape sequence */
2009     }
2010    
2011     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2012     {
2013     #ifdef SUPPORT_UTF8
2014     if (utf8) { GETCHARINC(next, ptr); } else
2015     #endif
2016     next = *ptr++;
2017     }
2018    
2019     else return FALSE;
2020    
2021     /* Skip whitespace and comments in extended mode */
2022    
2023     if ((options & PCRE_EXTENDED) != 0)
2024     {
2025     for (;;)
2026     {
2027     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2028     if (*ptr == '#')
2029     {
2030     while (*(++ptr) != 0)
2031     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2032     }
2033     else break;
2034     }
2035     }
2036    
2037     /* If the next thing is itself optional, we have to give up. */
2038    
2039     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2040     return FALSE;
2041    
2042     /* Now compare the next item with the previous opcode. If the previous is a
2043     positive single character match, "item" either contains the character or, if
2044     "item" is greater than 127 in utf8 mode, the character's bytes are in
2045     utf8_char. */
2046    
2047    
2048     /* Handle cases when the next item is a character. */
2049    
2050     if (next >= 0) switch(op_code)
2051     {
2052     case OP_CHAR:
2053     #ifdef SUPPORT_UTF8
2054     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055     #endif
2056     return item != next;
2057    
2058     /* For CHARNC (caseless character) we must check the other case. If we have
2059     Unicode property support, we can use it to test the other case of
2060     high-valued characters. */
2061    
2062     case OP_CHARNC:
2063     #ifdef SUPPORT_UTF8
2064     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2065     #endif
2066     if (item == next) return FALSE;
2067     #ifdef SUPPORT_UTF8
2068     if (utf8)
2069     {
2070     unsigned int othercase;
2071     if (next < 128) othercase = cd->fcc[next]; else
2072     #ifdef SUPPORT_UCP
2073     othercase = _pcre_ucp_othercase((unsigned int)next);
2074     #else
2075     othercase = NOTACHAR;
2076     #endif
2077     return (unsigned int)item != othercase;
2078     }
2079     else
2080     #endif /* SUPPORT_UTF8 */
2081     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2082    
2083     /* For OP_NOT, "item" must be a single-byte character. */
2084    
2085     case OP_NOT:
2086     if (next < 0) return FALSE; /* Not a character */
2087     if (item == next) return TRUE;
2088     if ((options & PCRE_CASELESS) == 0) return FALSE;
2089     #ifdef SUPPORT_UTF8
2090     if (utf8)
2091     {
2092     unsigned int othercase;
2093     if (next < 128) othercase = cd->fcc[next]; else
2094     #ifdef SUPPORT_UCP
2095     othercase = _pcre_ucp_othercase(next);
2096     #else
2097     othercase = NOTACHAR;
2098     #endif
2099     return (unsigned int)item == othercase;
2100     }
2101     else
2102     #endif /* SUPPORT_UTF8 */
2103     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2104    
2105     case OP_DIGIT:
2106     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2107    
2108     case OP_NOT_DIGIT:
2109     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2110    
2111     case OP_WHITESPACE:
2112     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2113    
2114     case OP_NOT_WHITESPACE:
2115     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2116    
2117     case OP_WORDCHAR:
2118     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2119    
2120     case OP_NOT_WORDCHAR:
2121     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2122    
2123 ph10 180 case OP_HSPACE:
2124     case OP_NOT_HSPACE:
2125     switch(next)
2126     {
2127     case 0x09:
2128     case 0x20:
2129     case 0xa0:
2130     case 0x1680:
2131     case 0x180e:
2132     case 0x2000:
2133     case 0x2001:
2134     case 0x2002:
2135     case 0x2003:
2136     case 0x2004:
2137     case 0x2005:
2138     case 0x2006:
2139     case 0x2007:
2140     case 0x2008:
2141     case 0x2009:
2142     case 0x200A:
2143     case 0x202f:
2144     case 0x205f:
2145     case 0x3000:
2146     return op_code != OP_HSPACE;
2147     default:
2148     return op_code == OP_HSPACE;
2149     }
2150    
2151     case OP_VSPACE:
2152     case OP_NOT_VSPACE:
2153     switch(next)
2154     {
2155     case 0x0a:
2156     case 0x0b:
2157     case 0x0c:
2158     case 0x0d:
2159     case 0x85:
2160     case 0x2028:
2161     case 0x2029:
2162     return op_code != OP_VSPACE;
2163     default:
2164     return op_code == OP_VSPACE;
2165     }
2166    
2167 nigel 93 default:
2168     return FALSE;
2169     }
2170    
2171    
2172     /* Handle the case when the next item is \d, \s, etc. */
2173    
2174     switch(op_code)
2175     {
2176     case OP_CHAR:
2177     case OP_CHARNC:
2178     #ifdef SUPPORT_UTF8
2179     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2180     #endif
2181     switch(-next)
2182     {
2183     case ESC_d:
2184     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2185    
2186     case ESC_D:
2187     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2188    
2189     case ESC_s:
2190     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2191    
2192     case ESC_S:
2193     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2194    
2195     case ESC_w:
2196     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2197    
2198     case ESC_W:
2199     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2200 ph10 182
2201 ph10 180 case ESC_h:
2202     case ESC_H:
2203     switch(item)
2204     {
2205     case 0x09:
2206     case 0x20:
2207     case 0xa0:
2208     case 0x1680:
2209     case 0x180e:
2210     case 0x2000:
2211     case 0x2001:
2212     case 0x2002:
2213     case 0x2003:
2214     case 0x2004:
2215     case 0x2005:
2216     case 0x2006:
2217     case 0x2007:
2218     case 0x2008:
2219     case 0x2009:
2220     case 0x200A:
2221     case 0x202f:
2222     case 0x205f:
2223     case 0x3000:
2224     return -next != ESC_h;
2225     default:
2226     return -next == ESC_h;
2227 ph10 182 }
2228    
2229 ph10 180 case ESC_v:
2230     case ESC_V:
2231     switch(item)
2232     {
2233     case 0x0a:
2234     case 0x0b:
2235     case 0x0c:
2236     case 0x0d:
2237     case 0x85:
2238     case 0x2028:
2239     case 0x2029:
2240     return -next != ESC_v;
2241     default:
2242     return -next == ESC_v;
2243 ph10 182 }
2244 nigel 93
2245     default:
2246     return FALSE;
2247     }
2248    
2249     case OP_DIGIT:
2250 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2251     next == -ESC_h || next == -ESC_v;
2252 nigel 93
2253     case OP_NOT_DIGIT:
2254     return next == -ESC_d;
2255    
2256     case OP_WHITESPACE:
2257     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2258    
2259     case OP_NOT_WHITESPACE:
2260 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2261 nigel 93
2262 ph10 180 case OP_HSPACE:
2263     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2264    
2265     case OP_NOT_HSPACE:
2266     return next == -ESC_h;
2267 ph10 182
2268 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2269 ph10 182 case OP_VSPACE:
2270 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2271    
2272     case OP_NOT_VSPACE:
2273 ph10 182 return next == -ESC_v;
2274 ph10 180
2275 nigel 93 case OP_WORDCHAR:
2276 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2277 nigel 93
2278     case OP_NOT_WORDCHAR:
2279     return next == -ESC_w || next == -ESC_d;
2280 ph10 182
2281 nigel 93 default:
2282     return FALSE;
2283     }
2284    
2285     /* Control does not reach here */
2286     }
2287    
2288    
2289    
2290     /*************************************************
2291 nigel 77 * Compile one branch *
2292     *************************************************/
2293    
2294 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2295 nigel 77 changed during the branch, the pointer is used to change the external options
2296 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2297     to find out the amount of memory needed, as well as during the real compile
2298     phase. The value of lengthptr distinguishes the two phases.
2299 nigel 77
2300     Arguments:
2301     optionsptr pointer to the option bits
2302     codeptr points to the pointer to the current code point
2303     ptrptr points to the current pattern pointer
2304     errorcodeptr points to error code variable
2305     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2306     reqbyteptr set to the last literal character required, else < 0
2307     bcptr points to current branch chain
2308     cd contains pointers to tables etc.
2309 nigel 93 lengthptr NULL during the real compile phase
2310     points to length accumulator during pre-compile phase
2311 nigel 77
2312     Returns: TRUE on success
2313     FALSE, with *errorcodeptr set non-zero on error
2314     */
2315    
2316     static BOOL
2317 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2318     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2319     compile_data *cd, int *lengthptr)
2320 nigel 77 {
2321     int repeat_type, op_type;
2322     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2323     int bravalue = 0;
2324     int greedy_default, greedy_non_default;
2325     int firstbyte, reqbyte;
2326     int zeroreqbyte, zerofirstbyte;
2327     int req_caseopt, reqvary, tempreqvary;
2328     int options = *optionsptr;
2329     int after_manual_callout = 0;
2330 nigel 93 int length_prevgroup = 0;
2331 nigel 77 register int c;
2332     register uschar *code = *codeptr;
2333 nigel 93 uschar *last_code = code;
2334     uschar *orig_code = code;
2335 nigel 77 uschar *tempcode;
2336     BOOL inescq = FALSE;
2337     BOOL groupsetfirstbyte = FALSE;
2338     const uschar *ptr = *ptrptr;
2339     const uschar *tempptr;
2340     uschar *previous = NULL;
2341     uschar *previous_callout = NULL;
2342 nigel 93 uschar *save_hwm = NULL;
2343 nigel 77 uschar classbits[32];
2344    
2345     #ifdef SUPPORT_UTF8
2346     BOOL class_utf8;
2347     BOOL utf8 = (options & PCRE_UTF8) != 0;
2348     uschar *class_utf8data;
2349     uschar utf8_char[6];
2350     #else
2351     BOOL utf8 = FALSE;
2352 nigel 93 uschar *utf8_char = NULL;
2353 nigel 77 #endif
2354    
2355 nigel 93 #ifdef DEBUG
2356     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2357     #endif
2358    
2359 nigel 77 /* Set up the default and non-default settings for greediness */
2360    
2361     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2362     greedy_non_default = greedy_default ^ 1;
2363    
2364     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2365     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2366     matches a non-fixed char first char; reqbyte just remains unset if we never
2367     find one.
2368    
2369     When we hit a repeat whose minimum is zero, we may have to adjust these values
2370     to take the zero repeat into account. This is implemented by setting them to
2371     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2372     item types that can be repeated set these backoff variables appropriately. */
2373    
2374     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2375    
2376     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2377     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2378     value > 255. It is added into the firstbyte or reqbyte variables to record the
2379     case status of the value. This is used only for ASCII characters. */
2380    
2381     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2382    
2383     /* Switch on next character until the end of the branch */
2384    
2385     for (;; ptr++)
2386     {
2387     BOOL negate_class;
2388 ph10 264 BOOL should_flip_negation;
2389 nigel 77 BOOL possessive_quantifier;
2390     BOOL is_quantifier;
2391 nigel 93 BOOL is_recurse;
2392 ph10 180 BOOL reset_bracount;
2393 nigel 77 int class_charcount;
2394     int class_lastchar;
2395     int newoptions;
2396     int recno;
2397 ph10 172 int refsign;
2398 nigel 77 int skipbytes;
2399     int subreqbyte;
2400     int subfirstbyte;
2401 nigel 93 int terminator;
2402 nigel 77 int mclength;
2403     uschar mcbuffer[8];
2404    
2405 nigel 93 /* Get next byte in the pattern */
2406 nigel 77
2407     c = *ptr;
2408    
2409 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2410     previous cycle of this loop. */
2411    
2412     if (lengthptr != NULL)
2413     {
2414     #ifdef DEBUG
2415     if (code > cd->hwm) cd->hwm = code; /* High water info */
2416     #endif
2417     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2418     {
2419     *errorcodeptr = ERR52;
2420     goto FAILED;
2421     }
2422    
2423     /* There is at least one situation where code goes backwards: this is the
2424     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2425     the class is simply eliminated. However, it is created first, so we have to
2426     allow memory for it. Therefore, don't ever reduce the length at this point.
2427     */
2428    
2429     if (code < last_code) code = last_code;
2430 ph10 202
2431     /* Paranoid check for integer overflow */
2432    
2433     if (OFLOW_MAX - *lengthptr < code - last_code)
2434     {
2435     *errorcodeptr = ERR20;
2436     goto FAILED;
2437     }
2438    
2439 nigel 93 *lengthptr += code - last_code;
2440     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2441    
2442     /* If "previous" is set and it is not at the start of the work space, move
2443     it back to there, in order to avoid filling up the work space. Otherwise,
2444     if "previous" is NULL, reset the current code pointer to the start. */
2445    
2446     if (previous != NULL)
2447     {
2448     if (previous > orig_code)
2449     {
2450     memmove(orig_code, previous, code - previous);
2451     code -= previous - orig_code;
2452     previous = orig_code;
2453     }
2454     }
2455     else code = orig_code;
2456    
2457     /* Remember where this code item starts so we can pick up the length
2458     next time round. */
2459    
2460     last_code = code;
2461     }
2462    
2463     /* In the real compile phase, just check the workspace used by the forward
2464     reference list. */
2465    
2466     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2467     {
2468     *errorcodeptr = ERR52;
2469     goto FAILED;
2470     }
2471    
2472 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2473    
2474     if (inescq && c != 0)
2475     {
2476     if (c == '\\' && ptr[1] == 'E')
2477     {
2478     inescq = FALSE;
2479     ptr++;
2480     continue;
2481     }
2482     else
2483     {
2484     if (previous_callout != NULL)
2485     {
2486 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2487     complete_callout(previous_callout, ptr, cd);
2488 nigel 77 previous_callout = NULL;
2489     }
2490     if ((options & PCRE_AUTO_CALLOUT) != 0)
2491     {
2492     previous_callout = code;
2493     code = auto_callout(code, ptr, cd);
2494     }
2495     goto NORMAL_CHAR;
2496     }
2497     }
2498    
2499     /* Fill in length of a previous callout, except when the next thing is
2500     a quantifier. */
2501    
2502     is_quantifier = c == '*' || c == '+' || c == '?' ||
2503     (c == '{' && is_counted_repeat(ptr+1));
2504    
2505     if (!is_quantifier && previous_callout != NULL &&
2506     after_manual_callout-- <= 0)
2507     {
2508 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2509     complete_callout(previous_callout, ptr, cd);
2510 nigel 77 previous_callout = NULL;
2511     }
2512    
2513     /* In extended mode, skip white space and comments */
2514    
2515     if ((options & PCRE_EXTENDED) != 0)
2516     {
2517     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2518     if (c == '#')
2519     {
2520 nigel 93 while (*(++ptr) != 0)
2521 nigel 91 {
2522 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2523 nigel 91 }
2524 nigel 93 if (*ptr != 0) continue;
2525    
2526 nigel 91 /* Else fall through to handle end of string */
2527     c = 0;
2528 nigel 77 }
2529     }
2530    
2531     /* No auto callout for quantifiers. */
2532    
2533     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2534     {
2535     previous_callout = code;
2536     code = auto_callout(code, ptr, cd);
2537     }
2538    
2539     switch(c)
2540     {
2541 nigel 93 /* ===================================================================*/
2542     case 0: /* The branch terminates at string end */
2543     case '|': /* or | or ) */
2544 nigel 77 case ')':
2545     *firstbyteptr = firstbyte;
2546     *reqbyteptr = reqbyte;
2547     *codeptr = code;
2548     *ptrptr = ptr;
2549 nigel 93 if (lengthptr != NULL)
2550     {
2551 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2552     {
2553     *errorcodeptr = ERR20;
2554     goto FAILED;
2555     }
2556 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2557     DPRINTF((">> end branch\n"));
2558     }
2559 nigel 77 return TRUE;
2560    
2561 nigel 93
2562     /* ===================================================================*/
2563 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2564     the setting of any following char as a first character. */
2565    
2566     case '^':
2567     if ((options & PCRE_MULTILINE) != 0)
2568     {
2569     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2570     }
2571     previous = NULL;
2572     *code++ = OP_CIRC;
2573     break;
2574    
2575     case '$':
2576     previous = NULL;
2577     *code++ = OP_DOLL;
2578     break;
2579    
2580     /* There can never be a first char if '.' is first, whatever happens about
2581     repeats. The value of reqbyte doesn't change either. */
2582    
2583     case '.':
2584     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2585     zerofirstbyte = firstbyte;
2586     zeroreqbyte = reqbyte;
2587     previous = code;
2588     *code++ = OP_ANY;
2589     break;
2590    
2591 nigel 93
2592     /* ===================================================================*/
2593 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2594     32-byte bitmap of the permitted characters, except in the special case
2595     where there is only one such character. For negated classes, we build the
2596     map as usual, then invert it at the end. However, we use a different opcode
2597     so that data characters > 255 can be handled correctly.
2598 nigel 77
2599     If the class contains characters outside the 0-255 range, a different
2600     opcode is compiled. It may optionally have a bit map for characters < 256,
2601     but those above are are explicitly listed afterwards. A flag byte tells
2602     whether the bitmap is present, and whether this is a negated class or not.
2603     */
2604    
2605     case '[':
2606     previous = code;
2607    
2608     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2609     they are encountered at the top level, so we'll do that too. */
2610    
2611     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2612     check_posix_syntax(ptr, &tempptr, cd))
2613     {
2614     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2615     goto FAILED;
2616     }
2617    
2618 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2619 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2620 ph10 205 skip them too. This makes for compatibility with Perl. */
2621 ph10 208
2622 ph10 205 negate_class = FALSE;
2623     for (;;)
2624 nigel 77 {
2625     c = *(++ptr);
2626 ph10 205 if (c == '\\')
2627     {
2628 ph10 208 if (ptr[1] == 'E') ptr++;
2629 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2630 ph10 208 else break;
2631 ph10 205 }
2632     else if (!negate_class && c == '^')
2633     negate_class = TRUE;
2634     else break;
2635 ph10 208 }
2636 nigel 77
2637 ph10 264 /* If a class contains a negative special such as \S, we need to flip the
2638     negation flag at the end, so that support for characters > 255 works
2639     correctly (they are all included in the class). */
2640    
2641     should_flip_negation = FALSE;
2642    
2643 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2644 nigel 93 of just a single character (as long as it's < 256). However, For higher
2645     valued UTF-8 characters, we don't yet do any optimization. */
2646 nigel 77
2647     class_charcount = 0;
2648     class_lastchar = -1;
2649    
2650 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2651     temporary bit of memory, in case the class contains only 1 character (less
2652     than 256), because in that case the compiled code doesn't use the bit map.
2653     */
2654    
2655     memset(classbits, 0, 32 * sizeof(uschar));
2656    
2657 nigel 77 #ifdef SUPPORT_UTF8
2658     class_utf8 = FALSE; /* No chars >= 256 */
2659 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2660 nigel 77 #endif
2661    
2662     /* Process characters until ] is reached. By writing this as a "do" it
2663 nigel 93 means that an initial ] is taken as a data character. At the start of the
2664     loop, c contains the first byte of the character. */
2665 nigel 77
2666 nigel 93 if (c != 0) do
2667 nigel 77 {
2668 nigel 93 const uschar *oldptr;
2669    
2670 nigel 77 #ifdef SUPPORT_UTF8
2671     if (utf8 && c > 127)
2672     { /* Braces are required because the */
2673     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2674     }
2675     #endif
2676    
2677     /* Inside \Q...\E everything is literal except \E */
2678    
2679     if (inescq)
2680     {
2681 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2682 nigel 77 {
2683 nigel 93 inescq = FALSE; /* Reset literal state */
2684     ptr++; /* Skip the 'E' */
2685     continue; /* Carry on with next */
2686 nigel 77 }
2687 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2688 nigel 77 }
2689    
2690     /* Handle POSIX class names. Perl allows a negation extension of the
2691     form [:^name:]. A square bracket that doesn't match the syntax is
2692     treated as a literal. We also recognize the POSIX constructions
2693     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2694     5.6 and 5.8 do. */
2695    
2696     if (c == '[' &&
2697     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2698     check_posix_syntax(ptr, &tempptr, cd))
2699     {
2700     BOOL local_negate = FALSE;
2701 nigel 87 int posix_class, taboffset, tabopt;
2702 nigel 77 register const uschar *cbits = cd->cbits;
2703 nigel 87 uschar pbits[32];
2704 nigel 77
2705     if (ptr[1] != ':')
2706     {
2707     *errorcodeptr = ERR31;
2708     goto FAILED;
2709     }
2710    
2711     ptr += 2;
2712     if (*ptr == '^')
2713     {
2714     local_negate = TRUE;
2715 ph10 265 should_flip_negation = TRUE; /* Note negative special */
2716 nigel 77 ptr++;
2717     }
2718    
2719     posix_class = check_posix_name(ptr, tempptr - ptr);
2720     if (posix_class < 0)
2721     {
2722     *errorcodeptr = ERR30;
2723     goto FAILED;
2724     }
2725    
2726     /* If matching is caseless, upper and lower are converted to
2727     alpha. This relies on the fact that the class table starts with
2728     alpha, lower, upper as the first 3 entries. */
2729    
2730     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2731     posix_class = 0;
2732    
2733 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2734     because we may be adding and subtracting from it, and we don't want to
2735     subtract bits that may be in the main map already. At the end we or the
2736     result into the bit map that is being built. */
2737 nigel 77
2738     posix_class *= 3;
2739 nigel 87
2740     /* Copy in the first table (always present) */
2741    
2742     memcpy(pbits, cbits + posix_class_maps[posix_class],
2743     32 * sizeof(uschar));
2744    
2745     /* If there is a second table, add or remove it as required. */
2746    
2747     taboffset = posix_class_maps[posix_class + 1];
2748     tabopt = posix_class_maps[posix_class + 2];
2749    
2750     if (taboffset >= 0)
2751 nigel 77 {
2752 nigel 87 if (tabopt >= 0)
2753     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2754 nigel 77 else
2755 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2756 nigel 77 }
2757    
2758 nigel 87 /* Now see if we need to remove any special characters. An option
2759     value of 1 removes vertical space and 2 removes underscore. */
2760    
2761     if (tabopt < 0) tabopt = -tabopt;
2762     if (tabopt == 1) pbits[1] &= ~0x3c;
2763     else if (tabopt == 2) pbits[11] &= 0x7f;
2764    
2765     /* Add the POSIX table or its complement into the main table that is
2766     being built and we are done. */
2767    
2768     if (local_negate)
2769     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2770     else
2771     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2772    
2773 nigel 77 ptr = tempptr + 1;
2774     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2775     continue; /* End of POSIX syntax handling */
2776     }
2777    
2778     /* Backslash may introduce a single character, or it may introduce one
2779 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2780     case. Inside a class (and only there) it is treated as backspace.
2781     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2782 ph10 205 to 'or' into the one we are building. We assume they have more than one
2783 nigel 77 character in them, so set class_charcount bigger than one. */
2784    
2785     if (c == '\\')
2786     {
2787 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2788     if (*errorcodeptr != 0) goto FAILED;
2789 nigel 77
2790     if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2791     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2792 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2793 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2794     {
2795     if (ptr[1] == '\\' && ptr[2] == 'E')
2796     {
2797     ptr += 2; /* avoid empty string */
2798     }
2799     else inescq = TRUE;
2800     continue;
2801     }
2802 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2803 nigel 77
2804     if (c < 0)
2805     {
2806     register const uschar *cbits = cd->cbits;
2807     class_charcount += 2; /* Greater than 1 is what matters */
2808 nigel 93
2809     /* Save time by not doing this in the pre-compile phase. */
2810    
2811     if (lengthptr == NULL) switch (-c)
2812 nigel 77 {
2813     case ESC_d:
2814     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2815     continue;
2816    
2817     case ESC_D:
2818 ph10 264 should_flip_negation = TRUE;
2819 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2820     continue;
2821    
2822     case ESC_w:
2823     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2824     continue;
2825    
2826     case ESC_W:
2827 ph10 264 should_flip_negation = TRUE;
2828 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2829     continue;
2830    
2831     case ESC_s:
2832     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2833     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2834     continue;
2835    
2836     case ESC_S:
2837 ph10 264 should_flip_negation = TRUE;
2838 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2839     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2840     continue;
2841    
2842 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2843     continue;
2844 ph10 180
2845 nigel 93 default: /* Not recognized; fall through */
2846     break; /* Need "default" setting to stop compiler warning. */
2847     }
2848    
2849     /* In the pre-compile phase, just do the recognition. */
2850    
2851     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2852     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2853 ph10 180
2854 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2855     they use extra memory. */
2856 ph10 180
2857 ph10 178 if (-c == ESC_h)
2858     {
2859     SETBIT(classbits, 0x09); /* VT */
2860     SETBIT(classbits, 0x20); /* SPACE */
2861 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2862 ph10 178 #ifdef SUPPORT_UTF8
2863     if (utf8)
2864 ph10 180 {
2865 ph10 178 class_utf8 = TRUE;
2866     *class_utf8data++ = XCL_SINGLE;
2867 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2868 ph10 178 *class_utf8data++ = XCL_SINGLE;
2869 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2870     *class_utf8data++ = XCL_RANGE;
2871     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2872     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2873 ph10 178 *class_utf8data++ = XCL_SINGLE;
2874 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2875 ph10 178 *class_utf8data++ = XCL_SINGLE;
2876 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2877 ph10 178 *class_utf8data++ = XCL_SINGLE;
2878 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2879     }
2880     #endif
2881     continue;
2882     }
2883 nigel 93
2884 ph10 178 if (-c == ESC_H)
2885     {
2886     for (c = 0; c < 32; c++)
2887     {
2888     int x = 0xff;
2889     switch (c)
2890 ph10 180 {
2891 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2892     case 0x20/8: x ^= 1 << (0x20%8); break;
2893     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2894     default: break;
2895     }
2896     classbits[c] |= x;
2897 ph10 180 }
2898    
2899 ph10 178 #ifdef SUPPORT_UTF8
2900     if (utf8)
2901 ph10 180 {
2902 ph10 178 class_utf8 = TRUE;
2903 ph10 180 *class_utf8data++ = XCL_RANGE;
2904     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2905     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2906     *class_utf8data++ = XCL_RANGE;
2907     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2908     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2909     *class_utf8data++ = XCL_RANGE;
2910     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2911     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2912     *class_utf8data++ = XCL_RANGE;
2913     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2914     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2915     *class_utf8data++ = XCL_RANGE;
2916     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2917     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2918     *class_utf8data++ = XCL_RANGE;
2919     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2920     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2921     *class_utf8data++ = XCL_RANGE;
2922     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2923     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2924     }
2925     #endif
2926     continue;
2927     }
2928 ph10 178
2929     if (-c == ESC_v)
2930     {
2931     SETBIT(classbits, 0x0a); /* LF */
2932     SETBIT(classbits, 0x0b); /* VT */
2933 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2934     SETBIT(classbits, 0x0d); /* CR */
2935     SETBIT(classbits, 0x85); /* NEL */
2936 ph10 178 #ifdef SUPPORT_UTF8
2937     if (utf8)
2938 ph10 180 {
2939 ph10 178 class_utf8 = TRUE;
2940 ph10 180 *class_utf8data++ = XCL_RANGE;
2941     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2942     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2943     }
2944     #endif
2945     continue;
2946     }
2947 ph10 178
2948     if (-c == ESC_V)
2949     {
2950     for (c = 0; c < 32; c++)
2951     {
2952     int x = 0xff;
2953     switch (c)
2954 ph10 180 {
2955 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2956     x ^= 1 << (0x0b%8);
2957     x ^= 1 << (0x0c%8);
2958 ph10 180 x ^= 1 << (0x0d%8);
2959 ph10 178 break;
2960     case 0x85/8: x ^= 1 << (0x85%8); break;
2961     default: break;
2962     }
2963     classbits[c] |= x;
2964 ph10 180 }
2965    
2966 ph10 178 #ifdef SUPPORT_UTF8
2967     if (utf8)
2968 ph10 180 {
2969 ph10 178 class_utf8 = TRUE;
2970 ph10 180 *class_utf8data++ = XCL_RANGE;
2971     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2972     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2973     *class_utf8data++ = XCL_RANGE;
2974     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2975     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2976     }
2977     #endif
2978     continue;
2979     }
2980 ph10 178
2981 nigel 93 /* We need to deal with \P and \p in both phases. */
2982    
2983 nigel 77 #ifdef SUPPORT_UCP
2984 nigel 93 if (-c == ESC_p || -c == ESC_P)
2985     {
2986     BOOL negated;
2987     int pdata;
2988     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2989     if (ptype < 0) goto FAILED;
2990     class_utf8 = TRUE;
2991     *class_utf8data++ = ((-c == ESC_p) != negated)?
2992     XCL_PROP : XCL_NOTPROP;
2993     *class_utf8data++ = ptype;
2994     *class_utf8data++ = pdata;
2995     class_charcount -= 2; /* Not a < 256 character */
2996 nigel 77 continue;
2997 nigel 93 }
2998 nigel 77 #endif
2999 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3000     strict mode. By default, for compatibility with Perl, they are
3001     treated as literals. */
3002 nigel 77
3003 nigel 93 if ((options & PCRE_EXTRA) != 0)
3004     {
3005     *errorcodeptr = ERR7;
3006     goto FAILED;
3007     }
3008 nigel 77
3009 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3010     c = *ptr; /* Get the final character and fall through */
3011 nigel 77 }
3012    
3013     /* Fall through if we have a single character (c >= 0). This may be
3014 nigel 93 greater than 256 in UTF-8 mode. */
3015 nigel 77
3016     } /* End of backslash handling */
3017    
3018     /* A single character may be followed by '-' to form a range. However,
3019     Perl does not permit ']' to be the end of the range. A '-' character
3020 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3021     entirely. The code for handling \Q and \E is messy. */
3022 nigel 77
3023 nigel 93 CHECK_RANGE:
3024     while (ptr[1] == '\\' && ptr[2] == 'E')
3025 nigel 77 {
3026 nigel 93 inescq = FALSE;
3027     ptr += 2;
3028     }
3029    
3030     oldptr = ptr;
3031 ph10 231
3032 ph10 230 /* Remember \r or \n */
3033 ph10 231
3034     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3035    
3036 ph10 230 /* Check for range */
3037 nigel 93
3038     if (!inescq && ptr[1] == '-')
3039     {
3040 nigel 77 int d;
3041     ptr += 2;
3042 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3043 nigel 77
3044 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3045     mode. */
3046    
3047     while (*ptr == '\\' && ptr[1] == 'Q')
3048     {
3049     ptr += 2;
3050     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3051     inescq = TRUE;
3052     break;
3053     }
3054    
3055     if (*ptr == 0 || (!inescq && *ptr == ']'))
3056     {
3057     ptr = oldptr;
3058     goto LONE_SINGLE_CHARACTER;
3059     }
3060    
3061 nigel 77 #ifdef SUPPORT_UTF8
3062     if (utf8)
3063     { /* Braces are required because the */
3064     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3065     }
3066     else
3067     #endif
3068     d = *ptr; /* Not UTF-8 mode */
3069    
3070     /* The second part of a range can be a single-character escape, but
3071     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3072     in such circumstances. */
3073    
3074 nigel 93 if (!inescq && d == '\\')
3075 nigel 77 {
3076 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3077     if (*errorcodeptr != 0) goto FAILED;
3078 nigel 77
3079 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
3080     special means the '-' was literal */
3081 nigel 77
3082     if (d < 0)
3083     {
3084     if (d == -ESC_b) d = '\b';
3085 nigel 93 else if (d == -ESC_X) d = 'X';
3086     else if (d == -ESC_R) d = 'R'; else
3087 nigel 77 {
3088 nigel 93 ptr = oldptr;
3089 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3090     }
3091     }
3092     }
3093    
3094 nigel 93 /* Check that the two values are in the correct order. Optimize
3095     one-character ranges */
3096 nigel 77
3097 nigel 93 if (d < c)
3098     {
3099     *errorcodeptr = ERR8;
3100     goto FAILED;
3101     }
3102    
3103 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3104    
3105 ph10 230 /* Remember \r or \n */
3106 ph10 231
3107     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3108    
3109 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3110     matching, we have to use an XCLASS with extra data items. Caseless
3111     matching for characters > 127 is available only if UCP support is
3112     available. */
3113    
3114     #ifdef SUPPORT_UTF8
3115     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3116     {
3117     class_utf8 = TRUE;
3118    
3119     /* With UCP support, we can find the other case equivalents of
3120     the relevant characters. There may be several ranges. Optimize how
3121     they fit with the basic range. */
3122    
3123     #ifdef SUPPORT_UCP
3124     if ((options & PCRE_CASELESS) != 0)
3125     {
3126 nigel 93 unsigned int occ, ocd;
3127     unsigned int cc = c;
3128     unsigned int origd = d;
3129 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3130     {
3131 ph10 180 if (occ >= (unsigned int)c &&
3132     ocd <= (unsigned int)d)
3133 ph10 176 continue; /* Skip embedded ranges */
3134 nigel 77
3135 ph10 180 if (occ < (unsigned int)c &&
3136 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3137 nigel 77 { /* if there is overlap, */
3138     c = occ; /* noting that if occ < c */
3139     continue; /* we can't have ocd > d */
3140     } /* because a subrange is */
3141 ph10 180 if (ocd > (unsigned int)d &&
3142 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3143 nigel 77 { /* the basic range. */
3144     d = ocd;
3145     continue;
3146     }
3147    
3148     if (occ == ocd)
3149     {
3150     *class_utf8data++ = XCL_SINGLE;
3151     }
3152     else
3153     {
3154     *class_utf8data++ = XCL_RANGE;
3155     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3156     }
3157     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3158     }
3159     }
3160     #endif /* SUPPORT_UCP */
3161    
3162     /* Now record the original range, possibly modified for UCP caseless
3163     overlapping ranges. */
3164    
3165     *class_utf8data++ = XCL_RANGE;
3166     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3167     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3168    
3169     /* With UCP support, we are done. Without UCP support, there is no
3170     caseless matching for UTF-8 characters > 127; we can use the bit map
3171     for the smaller ones. */
3172    
3173     #ifdef SUPPORT_UCP
3174     continue; /* With next character in the class */
3175     #else
3176     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3177    
3178     /* Adjust upper limit and fall through to set up the map */
3179    
3180     d = 127;
3181    
3182     #endif /* SUPPORT_UCP */
3183     }
3184     #endif /* SUPPORT_UTF8 */
3185    
3186     /* We use the bit map for all cases when not in UTF-8 mode; else
3187     ranges that lie entirely within 0-127 when there is UCP support; else
3188     for partial ranges without UCP support. */
3189    
3190 nigel 93 class_charcount += d - c + 1;
3191     class_lastchar = d;
3192    
3193     /* We can save a bit of time by skipping this in the pre-compile. */
3194    
3195     if (lengthptr == NULL) for (; c <= d; c++)
3196 nigel 77 {
3197     classbits[c/8] |= (1 << (c&7));
3198     if ((options & PCRE_CASELESS) != 0)
3199     {
3200     int uc = cd->fcc[c]; /* flip case */
3201     classbits[uc/8] |= (1 << (uc&7));
3202     }
3203     }
3204    
3205     continue; /* Go get the next char in the class */
3206     }
3207    
3208     /* Handle a lone single character - we can get here for a normal
3209     non-escape char, or after \ that introduces a single character or for an
3210     apparent range that isn't. */
3211    
3212     LONE_SINGLE_CHARACTER:
3213 ph10 231
3214 nigel 77 /* Handle a character that cannot go in the bit map */
3215    
3216     #ifdef SUPPORT_UTF8
3217     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3218     {
3219     class_utf8 = TRUE;
3220     *class_utf8data++ = XCL_SINGLE;
3221     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3222    
3223     #ifdef SUPPORT_UCP
3224     if ((options & PCRE_CASELESS) != 0)
3225     {
3226 nigel 93 unsigned int othercase;
3227     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3228 nigel 77 {
3229     *class_utf8data++ = XCL_SINGLE;
3230     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3231     }
3232     }
3233     #endif /* SUPPORT_UCP */
3234    
3235     }
3236     else
3237     #endif /* SUPPORT_UTF8 */
3238    
3239     /* Handle a single-byte character */
3240     {
3241     classbits[c/8] |= (1 << (c&7));
3242     if ((options & PCRE_CASELESS) != 0)
3243     {
3244     c = cd->fcc[c]; /* flip case */
3245     classbits[c/8] |= (1 << (c&7));
3246     }
3247     class_charcount++;
3248     class_lastchar = c;
3249     }
3250     }
3251    
3252 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3253 nigel 77
3254 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3255 nigel 77
3256 nigel 93 if (c == 0) /* Missing terminating ']' */
3257     {
3258     *errorcodeptr = ERR6;
3259     goto FAILED;
3260     }
3261 ph10 231
3262    
3263 ph10 230 /* This code has been disabled because it would mean that \s counts as
3264     an explicit \r or \n reference, and that's not really what is wanted. Now
3265     we set the flag only if there is a literal "\r" or "\n" in the class. */
3266 ph10 227
3267 ph10 230 #if 0
3268 ph10 226 /* Remember whether \r or \n are in this class */
3269 ph10 227
3270 ph10 226 if (negate_class)
3271     {
3272 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3273 ph10 226 }
3274     else
3275     {
3276 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3277 ph10 227 }
3278 ph10 230 #endif
3279 ph10 227
3280 ph10 231
3281 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3282 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3283     use of \p or \P, in other words, no use of any XCLASS features, we can
3284     optimize.
3285    
3286 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3287     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3288     operate on single-bytes only. This is an historical hangover. Maybe one day
3289     we can tidy these opcodes to handle multi-byte characters.
3290 nigel 77
3291     The optimization throws away the bit map. We turn the item into a
3292     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3293     that OP_NOT does not support multibyte characters. In the positive case, it
3294     can cause firstbyte to be set. Otherwise, there can be no first char if
3295     this item is first, whatever repeat count may follow. In the case of
3296     reqbyte, save the previous value for reinstating. */
3297    
3298     #ifdef SUPPORT_UTF8
3299 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3300 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3301 nigel 77 #else
3302     if (class_charcount == 1)
3303     #endif
3304     {
3305     zeroreqbyte = reqbyte;
3306    
3307     /* The OP_NOT opcode works on one-byte characters only. */
3308    
3309     if (negate_class)
3310     {
3311     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3312     zerofirstbyte = firstbyte;
3313     *code++ = OP_NOT;
3314     *code++ = class_lastchar;
3315     break;
3316     }
3317    
3318     /* For a single, positive character, get the value into mcbuffer, and
3319     then we can handle this with the normal one-character code. */
3320    
3321     #ifdef SUPPORT_UTF8
3322     if (utf8 && class_lastchar > 127)
3323     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3324     else
3325     #endif
3326     {
3327     mcbuffer[0] = class_lastchar;
3328     mclength = 1;
3329     }
3330     goto ONE_CHAR;
3331     } /* End of 1-char optimization */
3332    
3333     /* The general case - not the one-char optimization. If this is the first
3334     thing in the branch, there can be no first char setting, whatever the
3335     repeat count. Any reqbyte setting must remain unchanged after any kind of
3336     repeat. */
3337    
3338     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3339     zerofirstbyte = firstbyte;
3340     zeroreqbyte = reqbyte;
3341    
3342     /* If there are characters with values > 255, we have to compile an
3343 ph10 264 extended class, with its own opcode, unless there was a negated special
3344     such as \S in the class, because in that case all characters > 255 are in
3345     the class, so any that were explicitly given as well can be ignored. If
3346     (when there are explicit characters > 255 that must be listed) there are no
3347     characters < 256, we can omit the bitmap in the actual compiled code. */
3348 nigel 77
3349     #ifdef SUPPORT_UTF8
3350 ph10 264 if (class_utf8 && !should_flip_negation)
3351 nigel 77 {
3352     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3353     *code++ = OP_XCLASS;
3354     code += LINK_SIZE;
3355     *code = negate_class? XCL_NOT : 0;
3356    
3357 nigel 93 /* If the map is required, move up the extra data to make room for it;
3358     otherwise just move the code pointer to the end of the extra data. */
3359 nigel 77
3360     if (class_charcount > 0)
3361     {
3362     *code++ |= XCL_MAP;
3363 nigel 93 memmove(code + 32, code, class_utf8data - code);
3364 nigel 77 memcpy(code, classbits, 32);
3365 nigel 93 code = class_utf8data + 32;
3366 nigel 77 }
3367 nigel 93 else code = class_utf8data;
3368 nigel 77
3369     /* Now fill in the complete length of the item */
3370    
3371     PUT(previous, 1, code - previous);
3372     break; /* End of class handling */
3373     }
3374     #endif
3375    
3376 ph10 264 /* If there are no characters > 255, set the opcode to OP_CLASS or
3377     OP_NCLASS, depending on whether the whole class was negated and whether
3378     there were negative specials such as \S in the class. Then copy the 32-byte
3379     map into the code vector, negating it if necessary. */
3380    
3381     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3382 nigel 77 if (negate_class)
3383     {
3384 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3385     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3386 nigel 77 }
3387     else
3388     {
3389     memcpy(code, classbits, 32);
3390     }
3391     code += 32;
3392     break;
3393    
3394 nigel 93
3395     /* ===================================================================*/
3396 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3397     has been tested above. */
3398    
3399     case '{':
3400     if (!is_quantifier) goto NORMAL_CHAR;
3401     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3402     if (*errorcodeptr != 0) goto FAILED;
3403     goto REPEAT;
3404    
3405     case '*':
3406     repeat_min = 0;
3407     repeat_max = -1;
3408     goto REPEAT;
3409    
3410     case '+':
3411     repeat_min = 1;
3412     repeat_max = -1;
3413     goto REPEAT;
3414    
3415     case '?':
3416     repeat_min = 0;
3417     repeat_max = 1;
3418    
3419     REPEAT:
3420     if (previous == NULL)
3421     {
3422     *errorcodeptr = ERR9;
3423     goto FAILED;
3424     }
3425    
3426     if (repeat_min == 0)
3427     {
3428     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3429     reqbyte = zeroreqbyte; /* Ditto */
3430     }
3431    
3432     /* Remember whether this is a variable length repeat */
3433    
3434     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3435    
3436     op_type = 0; /* Default single-char op codes */
3437     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3438    
3439     /* Save start of previous item, in case we have to move it up to make space
3440     for an inserted OP_ONCE for the additional '+' extension. */
3441    
3442     tempcode = previous;
3443    
3444     /* If the next character is '+', we have a possessive quantifier. This
3445     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3446     If the next character is '?' this is a minimizing repeat, by default,
3447     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3448     repeat type to the non-default. */
3449    
3450     if (ptr[1] == '+')
3451     {
3452     repeat_type = 0; /* Force greedy */
3453     possessive_quantifier = TRUE;
3454     ptr++;
3455     }
3456     else if (ptr[1] == '?')
3457     {
3458     repeat_type = greedy_non_default;
3459     ptr++;
3460     }
3461     else repeat_type = greedy_default;
3462    
3463     /* If previous was a character match, abolish the item and generate a
3464     repeat item instead. If a char item has a minimum of more than one, ensure
3465     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3466     the first thing in a branch because the x will have gone into firstbyte
3467     instead. */
3468    
3469     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3470     {
3471     /* Deal with UTF-8 characters that take up more than one byte. It's
3472     easier to write this out separately than try to macrify it. Use c to
3473     hold the length of the character in bytes, plus 0x80 to flag that it's a
3474     length rather than a small character. */
3475    
3476     #ifdef SUPPORT_UTF8
3477     if (utf8 && (code[-1] & 0x80) != 0)
3478     {
3479     uschar *lastchar = code - 1;
3480     while((*lastchar & 0xc0) == 0x80) lastchar--;
3481     c = code - lastchar; /* Length of UTF-8 character */
3482     memcpy(utf8_char, lastchar, c); /* Save the char */
3483     c |= 0x80; /* Flag c as a length */
3484     }
3485     else
3486     #endif
3487    
3488     /* Handle the case of a single byte - either with no UTF8 support, or
3489     with UTF-8 disabled, or for a UTF-8 character < 128. */
3490    
3491     {
3492     c = code[-1];
3493     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3494     }
3495    
3496 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3497     the line is something that cannot possibly match this character. If so,
3498     automatically possessifying this item gains some performance in the case
3499     where the match fails. */
3500    
3501     if (!possessive_quantifier &&
3502     repeat_max < 0 &&
3503     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3504     options, cd))
3505     {
3506     repeat_type = 0; /* Force greedy */
3507     possessive_quantifier = TRUE;
3508     }
3509    
3510 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3511     }
3512    
3513     /* If previous was a single negated character ([^a] or similar), we use
3514     one of the special opcodes, replacing it. The code is shared with single-
3515     character repeats by setting op_type to add a suitable offset into
3516 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3517     currently used only for single-byte chars. */
3518 nigel 77
3519     else if (*previous == OP_NOT)
3520     {
3521     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3522     c = previous[1];
3523 nigel 93 if (!possessive_quantifier &&
3524     repeat_max < 0 &&
3525     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3526     {
3527     repeat_type = 0; /* Force greedy */
3528     possessive_quantifier = TRUE;
3529     }
3530 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3531     }
3532    
3533     /* If previous was a character type match (\d or similar), abolish it and
3534     create a suitable repeat item. The code is shared with single-character
3535     repeats by setting op_type to add a suitable offset into repeat_type. Note
3536     that the Unicode property types will be present only when SUPPORT_UCP is
3537     defined, but we don't wrap the little bits of code here because it just
3538     makes it horribly messy. */
3539    
3540     else if (*previous < OP_EODN)
3541     {
3542     uschar *oldcode;
3543 nigel 87 int prop_type, prop_value;
3544 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3545     c = *previous;
3546    
3547 nigel 93 if (!possessive_quantifier &&
3548     repeat_max < 0 &&
3549     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3550     {
3551     repeat_type = 0; /* Force greedy */
3552     possessive_quantifier = TRUE;
3553     }
3554    
3555 nigel 77 OUTPUT_SINGLE_REPEAT:
3556 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3557     {
3558     prop_type = previous[1];
3559     prop_value = previous[2];
3560     }
3561     else prop_type = prop_value = -1;
3562 nigel 77
3563     oldcode = code;
3564     code = previous; /* Usually overwrite previous item */
3565    
3566     /* If the maximum is zero then the minimum must also be zero; Perl allows
3567     this case, so we do too - by simply omitting the item altogether. */
3568    
3569     if (repeat_max == 0) goto END_REPEAT;
3570    
3571     /* All real repeats make it impossible to handle partial matching (maybe
3572     one day we will be able to remove this restriction). */
3573    
3574 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3575 nigel 77
3576     /* Combine the op_type with the repeat_type */
3577    
3578     repeat_type += op_type;
3579    
3580     /* A minimum of zero is handled either as the special case * or ?, or as
3581     an UPTO, with the maximum given. */
3582    
3583     if (repeat_min == 0)
3584     {
3585     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3586     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3587     else
3588     {
3589     *code++ = OP_UPTO + repeat_type;
3590     PUT2INC(code, 0, repeat_max);
3591     }
3592     }
3593    
3594     /* A repeat minimum of 1 is optimized into some special cases. If the
3595 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3596 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3597     one less than the maximum. */
3598    
3599     else if (repeat_min == 1)
3600     {
3601     if (repeat_max == -1)
3602     *code++ = OP_PLUS + repeat_type;
3603     else
3604     {
3605     code = oldcode; /* leave previous item in place */
3606     if (repeat_max == 1) goto END_REPEAT;
3607     *code++ = OP_UPTO + repeat_type;
3608     PUT2INC(code, 0, repeat_max - 1);
3609     }
3610     }
3611    
3612     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3613     handled as an EXACT followed by an UPTO. */
3614    
3615     else
3616     {
3617     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3618     PUT2INC(code, 0, repeat_min);
3619    
3620     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3621     we have to insert the character for the previous code. For a repeated
3622 nigel 87 Unicode property match, there are two extra bytes that define the
3623 nigel 77 required property. In UTF-8 mode, long characters have their length in
3624     c, with the 0x80 bit as a flag. */
3625    
3626     if (repeat_max < 0)
3627     {
3628     #ifdef SUPPORT_UTF8
3629     if (utf8 && c >= 128)
3630     {
3631     memcpy(code, utf8_char, c & 7);
3632     code += c & 7;
3633     }
3634     else
3635     #endif
3636     {
3637     *code++ = c;
3638 nigel 87 if (prop_type >= 0)
3639     {
3640     *code++ = prop_type;
3641     *code++ = prop_value;
3642     }
3643 nigel 77 }
3644     *code++ = OP_STAR + repeat_type;
3645     }
3646    
3647     /* Else insert an UPTO if the max is greater than the min, again
3648 nigel 93 preceded by the character, for the previously inserted code. If the
3649     UPTO is just for 1 instance, we can use QUERY instead. */
3650 nigel 77
3651     else if (repeat_max != repeat_min)
3652     {
3653     #ifdef SUPPORT_UTF8
3654     if (utf8 && c >= 128)
3655     {
3656     memcpy(code, utf8_char, c & 7);
3657     code += c & 7;
3658     }
3659     else
3660     #endif
3661     *code++ = c;
3662 nigel 87 if (prop_type >= 0)
3663     {
3664     *code++ = prop_type;
3665     *code++ = prop_value;
3666     }
3667 nigel 77 repeat_max -= repeat_min;
3668 nigel 93
3669     if (repeat_max == 1)
3670     {
3671     *code++ = OP_QUERY + repeat_type;
3672     }
3673     else
3674     {
3675     *code++ = OP_UPTO + repeat_type;
3676     PUT2INC(code, 0, repeat_max);
3677     }
3678 nigel 77 }
3679     }
3680    
3681     /* The character or character type itself comes last in all cases. */
3682    
3683     #ifdef SUPPORT_UTF8
3684     if (utf8 && c >= 128)
3685     {
3686     memcpy(code, utf8_char, c & 7);
3687     code += c & 7;
3688     }
3689     else
3690     #endif
3691     *code++ = c;
3692    
3693 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3694     define the required property. */
3695 nigel 77
3696     #ifdef SUPPORT_UCP
3697 nigel 87 if (prop_type >= 0)
3698     {
3699     *code++ = prop_type;
3700     *code++ = prop_value;
3701     }
3702 nigel 77 #endif
3703     }
3704    
3705     /* If previous was a character class or a back reference, we put the repeat
3706     stuff after it, but just skip the item if the repeat was {0,0}. */
3707    
3708     else if (*previous == OP_CLASS ||
3709     *previous == OP_NCLASS ||
3710     #ifdef SUPPORT_UTF8
3711     *previous == OP_XCLASS ||
3712     #endif
3713     *previous == OP_REF)
3714     {
3715     if (repeat_max == 0)
3716     {
3717     code = previous;
3718     goto END_REPEAT;
3719     }
3720    
3721     /* All real repeats make it impossible to handle partial matching (maybe
3722     one day we will be able to remove this restriction). */
3723    
3724 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3725 nigel 77
3726     if (repeat_min == 0 && repeat_max == -1)
3727     *code++ = OP_CRSTAR + repeat_type;
3728     else if (repeat_min == 1 && repeat_max == -1)
3729     *code++ = OP_CRPLUS + repeat_type;
3730     else if (repeat_min == 0 && repeat_max == 1)
3731     *code++ = OP_CRQUERY + repeat_type;
3732     else
3733     {
3734     *code++ = OP_CRRANGE + repeat_type;
3735     PUT2INC(code, 0, repeat_min);
3736     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3737     PUT2INC(code, 0, repeat_max);
3738     }
3739     }
3740    
3741     /* If previous was a bracket group, we may have to replicate it in certain
3742     cases. */
3743    
3744 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3745     *previous == OP_ONCE || *previous == OP_COND)
3746 nigel 77 {
3747     register int i;
3748     int ketoffset = 0;
3749     int len = code - previous;
3750     uschar *bralink = NULL;
3751    
3752 nigel 93 /* Repeating a DEFINE group is pointless */
3753    
3754     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3755     {
3756     *errorcodeptr = ERR55;
3757     goto FAILED;
3758     }
3759    
3760 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3761     by scanning through from the start, and compute the offset back to it
3762     from the current code pointer. There may be an OP_OPT setting following
3763     the final KET, so we can't find the end just by going back from the code
3764     pointer. */
3765    
3766     if (repeat_max == -1)
3767     {
3768     register uschar *ket = previous;
3769     do ket += GET(ket, 1); while (*ket != OP_KET);
3770     ketoffset = code - ket;
3771     }
3772    
3773     /* The case of a zero minimum is special because of the need to stick
3774     OP_BRAZERO in front of it, and because the group appears once in the
3775     data, whereas in other cases it appears the minimum number of times. For
3776     this reason, it is simplest to treat this case separately, as otherwise
3777     the code gets far too messy. There are several special subcases when the
3778     minimum is zero. */
3779    
3780     if (repeat_min == 0)
3781     {
3782     /* If the maximum is also zero, we just omit the group from the output
3783     altogether. */
3784    
3785     if (repeat_max == 0)
3786     {
3787     code = previous;
3788     goto END_REPEAT;
3789     }
3790    
3791     /* If the maximum is 1 or unlimited, we just have to stick in the
3792     BRAZERO and do no more at this point. However, we do need to adjust
3793     any OP_RECURSE calls inside the group that refer to the group itself or
3794 nigel 93 any internal or forward referenced group, because the offset is from
3795     the start of the whole regex. Temporarily terminate the pattern while
3796     doing this. */
3797 nigel 77
3798     if (repeat_max <= 1)
3799     {
3800     *code = OP_END;
3801 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3802 nigel 77 memmove(previous+1, previous, len);
3803     code++;
3804     *previous++ = OP_BRAZERO + repeat_type;
3805     }
3806    
3807     /* If the maximum is greater than 1 and limited, we have to replicate
3808     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3809     The first one has to be handled carefully because it's the original
3810     copy, which has to be moved up. The remainder can be handled by code
3811     that is common with the non-zero minimum case below. We have to
3812     adjust the value of repeat_max, since one less copy is required. Once
3813     again, we may have to adjust any OP_RECURSE calls inside the group. */
3814    
3815     else
3816     {
3817     int offset;
3818     *code = OP_END;
3819 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3820 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3821     code += 2 + LINK_SIZE;
3822     *previous++ = OP_BRAZERO + repeat_type;
3823     *previous++ = OP_BRA;
3824    
3825     /* We chain together the bracket offset fields that have to be
3826     filled in later when the ends of the brackets are reached. */
3827    
3828     offset = (bralink == NULL)? 0 : previous - bralink;
3829     bralink = previous;
3830     PUTINC(previous, 0, offset);
3831     }
3832    
3833     repeat_max--;
3834     }
3835    
3836     /* If the minimum is greater than zero, replicate the group as many
3837     times as necessary, and adjust the maximum to the number of subsequent
3838     copies that we need. If we set a first char from the group, and didn't
3839 nigel 93 set a required char, copy the latter from the former. If there are any
3840     forward reference subroutine calls in the group, there will be entries on
3841     the workspace list; replicate these with an appropriate increment. */
3842 nigel 77
3843     else
3844     {
3845     if (repeat_min > 1)
3846     {
3847 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3848 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3849     potential integer overflow. */
3850 nigel 93
3851     if (lengthptr != NULL)
3852 ph10 202 {
3853     int delta = (repeat_min - 1)*length_prevgroup;
3854     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3855     (double)INT_MAX ||
3856     OFLOW_MAX - *lengthptr < delta)
3857     {
3858     *errorcodeptr = ERR20;
3859     goto FAILED;
3860     }
3861     *lengthptr += delta;
3862     }
3863 nigel 93
3864     /* This is compiling for real */
3865    
3866     else
3867 nigel 77 {
3868 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3869     for (i = 1; i < repeat_min; i++)
3870     {
3871     uschar *hc;
3872     uschar *this_hwm = cd->hwm;
3873     memcpy(code, previous, len);
3874     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3875     {
3876     PUT(cd->hwm, 0, GET(hc, 0) + len);
3877     cd->hwm += LINK_SIZE;
3878     }
3879     save_hwm = this_hwm;
3880     code += len;
3881     }
3882 nigel 77 }
3883     }
3884 nigel 93
3885 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3886     }
3887    
3888     /* This code is common to both the zero and non-zero minimum cases. If
3889     the maximum is limited, it replicates the group in a nested fashion,
3890     remembering the bracket starts on a stack. In the case of a zero minimum,
3891     the first one was set up above. In all cases the repeat_max now specifies
3892 nigel 93 the number of additional copies needed. Again, we must remember to
3893     replicate entries on the forward reference list. */
3894 nigel 77
3895     if (repeat_max >= 0)
3896     {
3897 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3898     just adjust the length as if we had. For each repetition we must add 1
3899     to the length for BRAZERO and for all but the last repetition we must
3900 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3901     paranoid checks to avoid integer overflow. */
3902 nigel 93
3903     if (lengthptr != NULL && repeat_max > 0)
3904 ph10 202 {
3905     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3906     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3907     if ((double)repeat_max *
3908     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3909     > (double)INT_MAX ||
3910     OFLOW_MAX - *lengthptr < delta)
3911     {
3912     *errorcodeptr = ERR20;
3913     goto FAILED;
3914     }
3915     *lengthptr += delta;
3916     }
3917 nigel 93
3918     /* This is compiling for real */
3919    
3920     else for (i = repeat_max - 1; i >= 0; i--)
3921 nigel 77 {
3922 nigel 93 uschar *hc;
3923     uschar *this_hwm = cd->hwm;
3924    
3925 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3926    
3927     /* All but the final copy start a new nesting, maintaining the
3928     chain of brackets outstanding. */
3929    
3930     if (i != 0)
3931     {
3932     int offset;
3933     *code++ = OP_BRA;
3934     offset = (bralink == NULL)? 0 : code - bralink;
3935     bralink = code;
3936     PUTINC(code, 0, offset);
3937     }
3938    
3939     memcpy(code, previous, len);
3940 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3941     {
3942     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3943     cd->hwm += LINK_SIZE;
3944     }
3945     save_hwm = this_hwm;
3946 nigel 77 code += len;
3947     }
3948    
3949     /* Now chain through the pending brackets, and fill in their length
3950     fields (which are holding the chain links pro tem). */
3951    
3952     while (bralink != NULL)
3953     {
3954     int oldlinkoffset;
3955     int offset = code - bralink + 1;
3956     uschar *bra = code - offset;
3957     oldlinkoffset = GET(bra, 1);
3958     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3959     *code++ = OP_KET;
3960     PUTINC(code, 0, offset);
3961     PUT(bra, 1, offset);
3962     }
3963     }
3964    
3965     /* If the maximum is unlimited, set a repeater in the final copy. We
3966     can't just offset backwards from the current code point, because we
3967     don't know if there's been an options resetting after the ket. The
3968 nigel 93 correct offset was computed above.
3969 nigel 77
3970 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3971     this group is a non-atomic one that could match an empty string. If so,
3972     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3973     that runtime checking can be done. [This check is also applied to
3974     atomic groups at runtime, but in a different way.] */
3975    
3976     else
3977     {
3978     uschar *ketcode = code - ketoffset;
3979     uschar *bracode = ketcode - GET(ketcode, 1);
3980     *ketcode = OP_KETRMAX + repeat_type;
3981     if (lengthptr == NULL && *bracode != OP_ONCE)
3982     {
3983     uschar *scode = bracode;
3984     do
3985     {
3986     if (could_be_empty_branch(scode, ketcode, utf8))
3987     {
3988     *bracode += OP_SBRA - OP_BRA;
3989     break;
3990     }
3991     scode += GET(scode, 1);
3992     }
3993     while (*scode == OP_ALT);
3994     }
3995     }
3996 nigel 77 }
3997    
3998     /* Else there's some kind of shambles */
3999    
4000     else
4001     {
4002     *errorcodeptr = ERR11;
4003     goto FAILED;
4004     }
4005    
4006 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4007     tests above succeeded, possessive_quantifier is TRUE. For some of the
4008     simpler opcodes, there is a special alternative opcode for this. For
4009     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4010     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4011     but the special opcodes can optimize it a bit. The repeated item starts at
4012     tempcode, not at previous, which might be the first part of a string whose
4013     (former) last char we repeated.
4014 nigel 77
4015 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4016     an 'upto' may follow. We skip over an 'exact' item, and then test the
4017     length of what remains before proceeding. */
4018    
4019 nigel 77 if (possessive_quantifier)
4020     {
4021 nigel 93 int len;
4022     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4023     *tempcode == OP_NOTEXACT)
4024     tempcode += _pcre_OP_lengths[*tempcode];
4025     len = code - tempcode;
4026     if (len > 0) switch (*tempcode)
4027     {
4028     case OP_STAR: *tempcode = OP_POSSTAR; break;
4029     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4030     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4031     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4032    
4033     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4034     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4035     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4036     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4037    
4038     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4039     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4040     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4041     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4042    
4043     default:
4044     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4045     code += 1 + LINK_SIZE;
4046     len += 1 + LINK_SIZE;
4047     tempcode[0] = OP_ONCE;
4048     *code++ = OP_KET;
4049     PUTINC(code, 0, len);
4050     PUT(tempcode, 1, len);
4051     break;
4052     }
4053 nigel 77 }
4054    
4055     /* In all cases we no longer have a previous item. We also set the
4056     "follows varying string" flag for subsequently encountered reqbytes if
4057     it isn't already set and we have just passed a varying length item. */
4058    
4059     END_REPEAT:
4060     previous = NULL;
4061     cd->req_varyopt |= reqvary;
4062     break;
4063    
4064    
4065 nigel 93 /* ===================================================================*/
4066     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4067     lookbehind or option setting or condition or all the other extended
4068 ph10 210 parenthesis forms. */
4069 nigel 77
4070     case '(':
4071     newoptions = options;
4072     skipbytes = 0;
4073 nigel 93 bravalue = OP_CBRA;
4074     save_hwm = cd->hwm;
4075 ph10 180 reset_bracount = FALSE;
4076 ph10 211
4077 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
4078 ph10 211
4079 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4080     {
4081 ph10 211 int i, namelen;
4082 ph10 243 const char *vn = verbnames;
4083 ph10 210 const uschar *name = ++ptr;
4084     previous = NULL;
4085     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4086     if (*ptr == ':')
4087     {
4088     *errorcodeptr = ERR59; /* Not supported */
4089 ph10 211 goto FAILED;
4090     }
4091 ph10 210 if (*ptr != ')')
4092     {
4093     *errorcodeptr = ERR60;
4094     goto FAILED;
4095     }
4096 ph10 211 namelen = ptr - name;
4097 ph10 210 for (i = 0; i < verbcount; i++)
4098 ph10 211 {
4099 ph10 210 if (namelen == verbs[i].len &&
4100 ph10 240 strncmp((char *)name, vn, namelen) == 0)
4101 ph10 210 {
4102     *code = verbs[i].op;
4103     if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4104     break;
4105 ph10 211 }
4106 ph10 243 vn += verbs[i].len + 1;
4107 ph10 211 }
4108     if (i < verbcount) continue;
4109 ph10 210 *errorcodeptr = ERR60;
4110 ph10 211 goto FAILED;
4111     }
4112    
4113 ph10 210 /* Deal with the extended parentheses; all are introduced by '?', and the
4114     appearance of any of them means that this is not a capturing group. */
4115 nigel 77
4116 ph10 210 else if (*ptr == '?')
4117 nigel 77 {
4118 nigel 93 int i, set, unset, namelen;
4119 nigel 77 int *optset;
4120 nigel 93 const uschar *name;
4121     uschar *slot;
4122 nigel 77
4123     switch (*(++ptr))
4124     {
4125     case '#': /* Comment; skip to ket */
4126     ptr++;
4127 nigel 93 while (*ptr != 0 && *ptr != ')') ptr++;
4128     if (*ptr == 0)
4129     {
4130     *errorcodeptr = ERR18;