/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 341 - (hide annotations) (download)
Sat Apr 19 16:41:04 2008 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 202840 byte(s)
Fix DFA (?!) bug; add support for JavaScript empty classes.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte array a (byte b/8, bit b%8 within that byte). Both arguments and the
whole expansion are parenthesized so that expression arguments such as
SETBIT(map, base + 1) address the intended bit. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
/* check_escape() indexes this table with (c - '0') in the ASCII build and with
(c - 0x48) in the EBCDIC build, where c is the character that follows the
backslash. A non-zero entry is returned to the caller as it stands; a zero
entry sends check_escape() on to its switch for further processing. */
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
/* One verbitem per verb name: len is the length of the name and op is the
OP_xxx value associated with it. The entries in verbs[] appear in the same
order as the NUL-separated names in verbnames[], and each len matches the
length of the corresponding name ("F" and "FAIL" both map to OP_FAIL).
verbcount is the number of entries in verbs[]. */
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 327 static const verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172 ph10 210
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
/* posix_names holds the class names back to back, separated by NUL bytes;
posix_name_lengths[i] is the length of the i-th name, and the trailing 0 marks
the end of the list. */
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
/* Three ints per class, one row per class, in the same order as the names in
posix_names: base map offset, second map offset (or -1), and the tweak/sign
value described above. */
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification: STRING(a) turns its argument into a string
literal exactly as written, whereas XSTRING(s) lets s be macro-expanded first
and then stringifies the result. XSTRING is what the error texts below use to
embed the values of MAX_NAME_SIZE and MAX_NAME_COUNT. */
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
/* All messages are concatenated into one string, each terminated by \0 (the
last one by the string literal's own terminator). Message number n is located
by find_error_text(n), which skips over n terminators from the start. The
numeric markers between groups give the number of the message that follows. */
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299     "a numbered reference must not be zero\0"
300 ph10 240 "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 336 "digit expected after (?+\0"
306     "] is an invalid data character in JavaScript compatibility mode";
307 nigel 77
308    
309     /* Table to identify digits and hex digits. This is used when compiling
310     patterns. Note that the tables in chartables are dependent on the locale, and
311     may mark arbitrary characters as digits - but the PCRE compiling code expects
312     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
313     a private table here. It costs 256 bytes, but it is a lot faster than doing
314     character value tests (at least in some simple cases I timed), and in some
315     applications one wants PCRE to compile efficiently as well as match
316     efficiently.
317    
318     For convenience, we use the same bit definitions as in chartables:
319    
320     0x04 decimal digit
321     0x08 hexadecimal digit
322    
323     Then we can use ctype_digit and ctype_xdigit in the code. */
324    
/* digitab is indexed directly by a byte value; check_escape() tests entries
against ctype_digit and ctype_xdigit. In the EBCDIC build a second table,
ebcdic_chartab, is also defined; check_escape() uses (ebcdic_chartab[c] & 0x0E)
to decide whether the character after a backslash is alphanumeric. */
325 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
326 nigel 77 static const unsigned char digitab[] =
327     {
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
362 nigel 77 static const unsigned char digitab[] =
363     {
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396    
397     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430     #endif
431    
432    
433     /* Definition to allow mutual recursion */
434    
435     static BOOL
436 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
438 nigel 77
439    
440    
441     /*************************************************
442 ph10 240 * Find an error text *
443     *************************************************/
444    
445 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
446     some of the text is of unknown length, we can't use a table of offsets.
447     Instead, just count through the strings. This is not a performance issue
448 ph10 240 because it happens only when there has been a compilation error.
449    
450     Argument: the error number
451     Returns: pointer to the error string
452     */
453    
454     static const char *
455     find_error_text(int n)
456     {
457     const char *s = error_texts;
458 ph10 243 for (; n > 0; n--) while (*s++ != 0);
459 ph10 240 return s;
460     }
461    
462    
463     /*************************************************
464 nigel 77 * Handle escapes *
465     *************************************************/
466    
467     /* This function is called when a \ has been encountered. It either returns a
468     positive value for a simple escape such as \n, or a negative value which
469 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
470     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472     ptr is pointing at the \. On exit, it is on the final character of the escape
473     sequence.
474 nigel 77
475     Arguments:
476     ptrptr points to the pattern position pointer
477     errorcodeptr points to the errorcode variable
478     bracount number of previous extracting brackets
479     options the options bits
480     isclass TRUE if inside a character class
481    
482     Returns: zero or positive => a data character
483     negative => a special escape sequence
484 ph10 213 on error, errorcodeptr is set
485 nigel 77 */
486    
487     static int
488     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489     int options, BOOL isclass)
490     {
491 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
492     const uschar *ptr = *ptrptr + 1;
493 nigel 77 int c, i;
494    
495 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496     ptr--; /* Set pointer back to the last byte */
497    
498 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
499    
500     if (c == 0) *errorcodeptr = ERR1;
501    
502 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503     in a table. A non-zero result is something that can be returned immediately.
504 nigel 77 Otherwise further processing may be required. */
505    
506 ph10 97 #ifndef EBCDIC /* ASCII coding */
507 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
509    
510 ph10 97 #else /* EBCDIC coding */
511 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
513     #endif
514    
515     /* Escapes that need further processing, or are illegal. */
516    
517     else
518     {
519     const uschar *oldptr;
520 nigel 93 BOOL braced, negated;
521    
522 nigel 77 switch (c)
523     {
524     /* A number of Perl escapes are not handled by PCRE. We give an explicit
525     error. */
526    
527     case 'l':
528     case 'L':
529     case 'N':
530     case 'u':
531     case 'U':
532     *errorcodeptr = ERR37;
533     break;
534    
535 ph10 333 /* \g must be followed by one of a number of specific things:
536    
537     (1) A number, either plain or braced. If positive, it is an absolute
538     backreference. If negative, it is a relative backreference. This is a Perl
539     5.10 feature.
540    
541     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542     is part of Perl's movement towards a unified syntax for back references. As
543     this is synonymous with \k{name}, we fudge it up by pretending it really
544     was \k.
545    
546     (3) For Oniguruma compatibility we also support \g followed by a name or a
547     number either in angle brackets or in single quotes. However, these are
548     (possibly recursive) subroutine calls, _not_ backreferences. Just return
549     the -ESC_g code (cf \k). */
550 nigel 93
551     case 'g':
552 ph10 333 if (ptr[1] == '<' || ptr[1] == '\'')
553     {
554     c = -ESC_g;
555     break;
556     }
557    
558     /* Handle the Perl-compatible cases */
559    
560 nigel 93 if (ptr[1] == '{')
561     {
562 ph10 171 const uschar *p;
563     for (p = ptr+2; *p != 0 && *p != '}'; p++)
564     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 ph10 172 if (*p != 0 && *p != '}')
566 ph10 171 {
567     c = -ESC_k;
568     break;
569 ph10 172 }
570 nigel 93 braced = TRUE;
571     ptr++;
572     }
573     else braced = FALSE;
574    
575     if (ptr[1] == '-')
576     {
577     negated = TRUE;
578     ptr++;
579     }
580     else negated = FALSE;
581    
582     c = 0;
583     while ((digitab[ptr[1]] & ctype_digit) != 0)
584     c = c * 10 + *(++ptr) - '0';
585 ph10 220
586 ph10 333 if (c < 0) /* Integer overflow */
587 ph10 213 {
588     *errorcodeptr = ERR61;
589     break;
590 ph10 220 }
591 ph10 333
592     if (braced && *(++ptr) != '}')
593 nigel 93 {
594     *errorcodeptr = ERR57;
595 ph10 213 break;
596 nigel 93 }
597 ph10 333
598     if (c == 0)
599     {
600     *errorcodeptr = ERR58;
601     break;
602     }
603 nigel 93
604     if (negated)
605     {
606     if (c > bracount)
607     {
608     *errorcodeptr = ERR15;
609 ph10 213 break;
610 nigel 93 }
611     c = bracount - (c - 1);
612     }
613    
614     c = -(ESC_REF + c);
615     break;
616    
617 nigel 77 /* The handling of escape sequences consisting of a string of digits
618     starting with one that is not zero is not straightforward. By experiment,
619     the way Perl works seems to be as follows:
620    
621     Outside a character class, the digits are read as a decimal number. If the
622     number is less than 10, or if there are that many previous extracting
623     left brackets, then it is a back reference. Otherwise, up to three octal
624     digits are read to form an escaped byte. Thus \123 is likely to be octal
625     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626     value is greater than 377, the least significant 8 bits are taken. Inside a
627     character class, \ followed by a digit is always an octal number. */
628    
629     case '1': case '2': case '3': case '4': case '5':
630     case '6': case '7': case '8': case '9':
631    
632     if (!isclass)
633     {
634     oldptr = ptr;
635     c -= '0';
636     while ((digitab[ptr[1]] & ctype_digit) != 0)
637     c = c * 10 + *(++ptr) - '0';
638 ph10 333 if (c < 0) /* Integer overflow */
639 ph10 213 {
640     *errorcodeptr = ERR61;
641 ph10 220 break;
642     }
643 nigel 77 if (c < 10 || c <= bracount)
644     {
645     c = -(ESC_REF + c);
646     break;
647     }
648     ptr = oldptr; /* Put the pointer back and fall through */
649     }
650    
651     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652     generates a binary zero byte and treats the digit as a following literal.
653     Thus we have to pull back the pointer by one. */
654    
655     if ((c = *ptr) >= '8')
656     {
657     ptr--;
658     c = 0;
659     break;
660     }
661    
662     /* \0 always starts an octal number, but we may drop through to here with a
663 nigel 91 larger first octal digit. The original code used just to take the least
664     significant 8 bits of octal numbers (I think this is what early Perls used
665     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666     than 3 octal digits. */
667 nigel 77
668     case '0':
669     c -= '0';
670     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671     c = c * 8 + *(++ptr) - '0';
672 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 nigel 77 break;
674    
675 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
676     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677     treated as a data character. */
678 nigel 77
679     case 'x':
680 nigel 87 if (ptr[1] == '{')
681 nigel 77 {
682     const uschar *pt = ptr + 2;
683 nigel 87 int count = 0;
684    
685 nigel 77 c = 0;
686     while ((digitab[*pt] & ctype_xdigit) != 0)
687     {
688 nigel 87 register int cc = *pt++;
689     if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 nigel 77 count++;
691 nigel 87
692 ph10 97 #ifndef EBCDIC /* ASCII coding */
693 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 ph10 97 #else /* EBCDIC coding */
696 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 nigel 77 #endif
699     }
700 nigel 87
701 nigel 77 if (*pt == '}')
702     {
703 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 nigel 77 ptr = pt;
705     break;
706     }
707 nigel 87
708 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
709     recognize this construct; fall through to the normal \x handling. */
710     }
711    
712 nigel 87 /* Read just a single-byte hex-defined char */
713 nigel 77
714     c = 0;
715     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716     {
717     int cc; /* Some compilers don't like ++ */
718     cc = *(++ptr); /* in initializers */
719 ph10 97 #ifndef EBCDIC /* ASCII coding */
720 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 ph10 97 #else /* EBCDIC coding */
723 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
724     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725     #endif
726     }
727     break;
728    
729 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730     This coding is ASCII-specific, but then the whole concept of \cx is
731     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732 nigel 77
733     case 'c':
734     c = *(++ptr);
735     if (c == 0)
736     {
737     *errorcodeptr = ERR2;
738 ph10 213 break;
739 nigel 77 }
740    
741 ph10 97 #ifndef EBCDIC /* ASCII coding */
742 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
743     c ^= 0x40;
744 ph10 97 #else /* EBCDIC coding */
745 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
746     c ^= 0xC0;
747     #endif
748     break;
749    
750     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752     otherwise, for Perl compatibility, it is a literal. This code looks a bit
753     odd, but there used to be some cases other than the default, and there may
754     be again in future, so I haven't "optimized" it. */
755 nigel 77
756     default:
757     if ((options & PCRE_EXTRA) != 0) switch(c)
758     {
759     default:
760     *errorcodeptr = ERR3;
761     break;
762     }
763     break;
764     }
765     }
766    
767     *ptrptr = ptr;
768     return c;
769     }
770    
771    
772    
773     #ifdef SUPPORT_UCP
774     /*************************************************
775     * Handle \P and \p *
776     *************************************************/
777    
778     /* This function is called after \P or \p has been encountered, provided that
779     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780     pointing at the P or p. On exit, it is pointing at the final character of the
781     escape sequence.
782    
783     Argument:
784     ptrptr points to the pattern position pointer
785     negptr points to a boolean that is set TRUE for negation else FALSE
786 nigel 87 dptr points to an int that is set to the detailed property value
787 nigel 77 errorcodeptr points to the error code variable
788    
789 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
790 nigel 77 */
791    
792     static int
793 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794 nigel 77 {
795     int c, i, bot, top;
796     const uschar *ptr = *ptrptr;
797 nigel 87 char name[32];
798 nigel 77
799     c = *(++ptr);
800     if (c == 0) goto ERROR_RETURN;
801    
802     *negptr = FALSE;
803    
804 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805     negation. */
806 nigel 77
807     if (c == '{')
808     {
809     if (ptr[1] == '^')
810     {
811     *negptr = TRUE;
812     ptr++;
813     }
814 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
815 nigel 77 {
816     c = *(++ptr);
817     if (c == 0) goto ERROR_RETURN;
818     if (c == '}') break;
819     name[i] = c;
820     }
821 nigel 87 if (c !='}') goto ERROR_RETURN;
822 nigel 77 name[i] = 0;
823     }
824    
825     /* Otherwise there is just one following character */
826    
827     else
828     {
829     name[0] = c;
830     name[1] = 0;
831     }
832    
833     *ptrptr = ptr;
834    
835     /* Search for a recognized property name using binary chop */
836    
837     bot = 0;
838     top = _pcre_utt_size;
839    
840     while (bot < top)
841     {
842 nigel 87 i = (bot + top) >> 1;
843 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844 nigel 87 if (c == 0)
845     {
846     *dptr = _pcre_utt[i].value;
847     return _pcre_utt[i].type;
848     }
849 nigel 77 if (c > 0) bot = i + 1; else top = i;
850     }
851    
852     *errorcodeptr = ERR47;
853     *ptrptr = ptr;
854     return -1;
855    
856     ERROR_RETURN:
857     *errorcodeptr = ERR46;
858     *ptrptr = ptr;
859     return -1;
860     }
861     #endif
862    
863    
864    
865    
866     /*************************************************
867     * Check for counted repeat *
868     *************************************************/
869    
870     /* This function is called when a '{' is encountered in a place where it might
871     start a quantifier. It looks ahead to see if it really is a quantifier or not.
872     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873     where the ddds are digits.
874    
875     Arguments:
876     p pointer to the first char after '{'
877    
878     Returns: TRUE or FALSE
879     */
880    
881     static BOOL
882     is_counted_repeat(const uschar *p)
883     {
884     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885     while ((digitab[*p] & ctype_digit) != 0) p++;
886     if (*p == '}') return TRUE;
887    
888     if (*p++ != ',') return FALSE;
889     if (*p == '}') return TRUE;
890    
891     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892     while ((digitab[*p] & ctype_digit) != 0) p++;
893    
894     return (*p == '}');
895     }
896    
897    
898    
899     /*************************************************
900     * Read repeat counts *
901     *************************************************/
902    
903     /* Read an item of the form {n,m} and return the values. This is called only
904     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905     so the syntax is guaranteed to be correct, but we need to check the values.
906    
907     Arguments:
908     p pointer to first char after '{'
909     minp pointer to int for min
910     maxp pointer to int for max
911     returned as -1 if no max
912     errorcodeptr points to error code variable
913    
914     Returns: pointer to '}' on success;
915     current ptr on error, with errorcodeptr set non-zero
916     */
917    
918     static const uschar *
919     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920     {
921     int min = 0;
922     int max = -1;
923    
924 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
925     an integer overflow. */
926    
927 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 nigel 81 if (min < 0 || min > 65535)
929     {
930     *errorcodeptr = ERR5;
931     return p;
932     }
933 nigel 77
934 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
935     Also, max must not be less than min. */
936    
937 nigel 77 if (*p == '}') max = min; else
938     {
939     if (*(++p) != '}')
940     {
941     max = 0;
942     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 nigel 81 if (max < 0 || max > 65535)
944     {
945     *errorcodeptr = ERR5;
946     return p;
947     }
948 nigel 77 if (max < min)
949     {
950     *errorcodeptr = ERR4;
951     return p;
952     }
953     }
954     }
955    
956 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
957     '}'. */
958 nigel 77
959 nigel 81 *minp = min;
960     *maxp = max;
961 nigel 77 return p;
962     }
963    
964    
965    
966     /*************************************************
967 nigel 93 * Find forward referenced subpattern *
968 nigel 91 *************************************************/
969    
970 nigel 93 /* This function scans along a pattern's text looking for capturing
971     subpatterns, and counting them. If it finds a named pattern that matches the
972     name it is given, it returns its number. Alternatively, if the name is NULL, it
973     returns when it reaches a given numbered subpattern. This is used for forward
974     references to subpatterns. We know that if (?P< is encountered, the name will
975     be terminated by '>' because that is checked in the first pass.
976 nigel 91
977     Arguments:
978 nigel 93 ptr current position in the pattern
979 ph10 341 cd compile background data
980 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
981     lorn name length, or subpattern number if name is NULL
982     xmode TRUE if we are in /x mode
983 nigel 91
984     Returns: the number of the named subpattern, or -1 if not found
985     */
986    
987     static int
988 ph10 341 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
989 nigel 93 BOOL xmode)
990 nigel 91 {
991     const uschar *thisname;
992 ph10 341 int count = cd->bracount;
993 nigel 93
994 nigel 91 for (; *ptr != 0; ptr++)
995     {
996 nigel 93 int term;
997    
998     /* Skip over backslashed characters and also entire \Q...\E */
999    
1000     if (*ptr == '\\')
1001     {
1002     if (*(++ptr) == 0) return -1;
1003     if (*ptr == 'Q') for (;;)
1004     {
1005     while (*(++ptr) != 0 && *ptr != '\\');
1006     if (*ptr == 0) return -1;
1007     if (*(++ptr) == 'E') break;
1008     }
1009     continue;
1010     }
1011    
1012 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1013     are handled for real. If the first character is '^', skip it. Also, if the
1014     first few characters (either before or after ^) are \Q\E or \E we skip them
1015     too. This makes for compatibility with Perl. */
1016 nigel 93
1017     if (*ptr == '[')
1018     {
1019 ph10 340 BOOL negate_class = FALSE;
1020     for (;;)
1021     {
1022     int c = *(++ptr);
1023     if (c == '\\')
1024     {
1025     if (ptr[1] == 'E') ptr++;
1026     else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1027     else break;
1028     }
1029     else if (!negate_class && c == '^')
1030     negate_class = TRUE;
1031     else break;
1032     }
1033    
1034     /* If the next character is ']', it is a data character that must be
1035 ph10 341 skipped, except in JavaScript compatibility mode. */
1036 ph10 340
1037 ph10 341 if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1038     ptr++;
1039 ph10 340
1040 nigel 93 while (*(++ptr) != ']')
1041     {
1042 ph10 220 if (*ptr == 0) return -1;
1043 nigel 93 if (*ptr == '\\')
1044     {
1045     if (*(++ptr) == 0) return -1;
1046     if (*ptr == 'Q') for (;;)
1047     {
1048     while (*(++ptr) != 0 && *ptr != '\\');
1049     if (*ptr == 0) return -1;
1050     if (*(++ptr) == 'E') break;
1051     }
1052     continue;
1053     }
1054     }
1055     continue;
1056     }
1057    
1058     /* Skip comments in /x mode */
1059    
1060     if (xmode && *ptr == '#')
1061     {
1062     while (*(++ptr) != 0 && *ptr != '\n');
1063     if (*ptr == 0) return -1;
1064     continue;
1065     }
1066    
1067     /* An opening parens must now be a real metacharacter */
1068    
1069 nigel 91 if (*ptr != '(') continue;
1070 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1071 nigel 93 {
1072     count++;
1073     if (name == NULL && count == lorn) return count;
1074     continue;
1075     }
1076    
1077     ptr += 2;
1078     if (*ptr == 'P') ptr++; /* Allow optional P */
1079    
1080     /* We have to disambiguate (?<! and (?<= from (?<name> */
1081    
1082     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1083     *ptr != '\'')
1084     continue;
1085    
1086 nigel 91 count++;
1087 nigel 93
1088     if (name == NULL && count == lorn) return count;
1089     term = *ptr++;
1090     if (term == '<') term = '>';
1091 nigel 91 thisname = ptr;
1092 nigel 93 while (*ptr != term) ptr++;
1093     if (name != NULL && lorn == ptr - thisname &&
1094     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1095 nigel 91 return count;
1096     }
1097 nigel 93
1098 nigel 91 return -1;
1099     }
1100    
1101    
1102    
1103     /*************************************************
1104 nigel 77 * Find first significant op code *
1105     *************************************************/
1106    
1107     /* This is called by several functions that scan a compiled expression looking
1108     for a fixed first character, or an anchoring op code etc. It skips over things
1109     that do not influence this. For some calls, a change of option is important.
1110     For some calls, it makes sense to skip negative forward and all backward
1111     assertions, and also the \b assertion; for others it does not.
1112    
1113     Arguments:
1114     code pointer to the start of the group
1115     options pointer to external options
1116     optbit the option bit whose changing is significant, or
1117     zero if none are
1118     skipassert TRUE if certain assertions are to be skipped
1119    
1120     Returns: pointer to the first significant opcode
1121     */
1122    
1123     static const uschar*
1124     first_significant_code(const uschar *code, int *options, int optbit,
1125     BOOL skipassert)
1126     {
1127     for (;;)
1128     {
1129     switch ((int)*code)
1130     {
1131     case OP_OPT:
1132     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1133     *options = (int)code[1];
1134     code += 2;
1135     break;
1136    
1137     case OP_ASSERT_NOT:
1138     case OP_ASSERTBACK:
1139     case OP_ASSERTBACK_NOT:
1140     if (!skipassert) return code;
1141     do code += GET(code, 1); while (*code == OP_ALT);
1142     code += _pcre_OP_lengths[*code];
1143     break;
1144    
1145     case OP_WORD_BOUNDARY:
1146     case OP_NOT_WORD_BOUNDARY:
1147     if (!skipassert) return code;
1148     /* Fall through */
1149    
1150     case OP_CALLOUT:
1151     case OP_CREF:
1152 nigel 93 case OP_RREF:
1153     case OP_DEF:
1154 nigel 77 code += _pcre_OP_lengths[*code];
1155     break;
1156    
1157     default:
1158     return code;
1159     }
1160     }
1161     /* Control never reaches here */
1162     }
1163    
1164    
1165    
1166    
1167     /*************************************************
1168     * Find the fixed length of a pattern *
1169     *************************************************/
1170    
1171     /* Scan a pattern and compute the fixed length of subject that will match it,
1172     if the length is fixed. This is needed for dealing with backward assertions.
1173     In UTF8 mode, the result is in characters rather than bytes.
1174    
1175     Arguments:
1176     code points to the start of the pattern (the bracket)
1177     options the compiling options
1178    
1179     Returns: the fixed length, or -1 if there is no fixed length,
1180     or -2 if \C was encountered
1181     */
1182    
1183     static int
1184     find_fixedlength(uschar *code, int options)
1185     {
1186     int length = -1;
1187    
1188     register int branchlength = 0;
1189     register uschar *cc = code + 1 + LINK_SIZE;
1190    
1191     /* Scan along the opcodes for this branch. If we get to the end of the
1192     branch, check the length against that of the other branches. */
1193    
1194     for (;;)
1195     {
1196     int d;
1197     register int op = *cc;
1198     switch (op)
1199     {
1200 nigel 93 case OP_CBRA:
1201 nigel 77 case OP_BRA:
1202     case OP_ONCE:
1203     case OP_COND:
1204 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1205 nigel 77 if (d < 0) return d;
1206     branchlength += d;
1207     do cc += GET(cc, 1); while (*cc == OP_ALT);
1208     cc += 1 + LINK_SIZE;
1209     break;
1210    
1211     /* Reached end of a branch; if it's a ket it is the end of a nested
1212     call. If it's ALT it is an alternation in a nested call. If it is
1213     END it's the end of the outer call. All can be handled by the same code. */
1214    
1215     case OP_ALT:
1216     case OP_KET:
1217     case OP_KETRMAX:
1218     case OP_KETRMIN:
1219     case OP_END:
1220     if (length < 0) length = branchlength;
1221     else if (length != branchlength) return -1;
1222     if (*cc != OP_ALT) return length;
1223     cc += 1 + LINK_SIZE;
1224     branchlength = 0;
1225     break;
1226    
1227     /* Skip over assertive subpatterns */
1228    
1229     case OP_ASSERT:
1230     case OP_ASSERT_NOT:
1231     case OP_ASSERTBACK:
1232     case OP_ASSERTBACK_NOT:
1233     do cc += GET(cc, 1); while (*cc == OP_ALT);
1234     /* Fall through */
1235    
1236     /* Skip over things that don't match chars */
1237    
1238     case OP_REVERSE:
1239     case OP_CREF:
1240 nigel 93 case OP_RREF:
1241     case OP_DEF:
1242 nigel 77 case OP_OPT:
1243     case OP_CALLOUT:
1244     case OP_SOD:
1245     case OP_SOM:
1246     case OP_EOD:
1247     case OP_EODN:
1248     case OP_CIRC:
1249     case OP_DOLL:
1250     case OP_NOT_WORD_BOUNDARY:
1251     case OP_WORD_BOUNDARY:
1252     cc += _pcre_OP_lengths[*cc];
1253     break;
1254    
1255     /* Handle literal characters */
1256    
1257     case OP_CHAR:
1258     case OP_CHARNC:
1259 nigel 91 case OP_NOT:
1260 nigel 77 branchlength++;
1261     cc += 2;
1262     #ifdef SUPPORT_UTF8
1263     if ((options & PCRE_UTF8) != 0)
1264     {
1265     while ((*cc & 0xc0) == 0x80) cc++;
1266     }
1267     #endif
1268     break;
1269    
1270     /* Handle exact repetitions. The count is already in characters, but we
1271     need to skip over a multibyte character in UTF8 mode. */
1272    
1273     case OP_EXACT:
1274     branchlength += GET2(cc,1);
1275     cc += 4;
1276     #ifdef SUPPORT_UTF8
1277     if ((options & PCRE_UTF8) != 0)
1278     {
1279     while((*cc & 0x80) == 0x80) cc++;
1280     }
1281     #endif
1282     break;
1283    
1284     case OP_TYPEEXACT:
1285     branchlength += GET2(cc,1);
1286 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1287 nigel 77 cc += 4;
1288     break;
1289    
1290     /* Handle single-char matchers */
1291    
1292     case OP_PROP:
1293     case OP_NOTPROP:
1294 nigel 87 cc += 2;
1295 nigel 77 /* Fall through */
1296    
1297     case OP_NOT_DIGIT:
1298     case OP_DIGIT:
1299     case OP_NOT_WHITESPACE:
1300     case OP_WHITESPACE:
1301     case OP_NOT_WORDCHAR:
1302     case OP_WORDCHAR:
1303     case OP_ANY:
1304     branchlength++;
1305     cc++;
1306     break;
1307    
1308     /* The single-byte matcher isn't allowed */
1309    
1310     case OP_ANYBYTE:
1311     return -2;
1312    
1313     /* Check a class for variable quantification */
1314    
1315     #ifdef SUPPORT_UTF8
1316     case OP_XCLASS:
1317     cc += GET(cc, 1) - 33;
1318     /* Fall through */
1319     #endif
1320    
1321     case OP_CLASS:
1322     case OP_NCLASS:
1323     cc += 33;
1324    
1325     switch (*cc)
1326     {
1327     case OP_CRSTAR:
1328     case OP_CRMINSTAR:
1329     case OP_CRQUERY:
1330     case OP_CRMINQUERY:
1331     return -1;
1332    
1333     case OP_CRRANGE:
1334     case OP_CRMINRANGE:
1335     if (GET2(cc,1) != GET2(cc,3)) return -1;
1336     branchlength += GET2(cc,1);
1337     cc += 5;
1338     break;
1339    
1340     default:
1341     branchlength++;
1342     }
1343     break;
1344    
1345     /* Anything else is variable length */
1346    
1347     default:
1348     return -1;
1349     }
1350     }
1351     /* Control never gets here */
1352     }
1353    
1354    
1355    
1356    
1357     /*************************************************
1358     * Scan compiled regex for numbered bracket *
1359     *************************************************/
1360    
1361     /* This little function scans through a compiled pattern until it finds a
1362     capturing bracket with the given number.
1363    
1364     Arguments:
1365     code points to start of expression
1366     utf8 TRUE in UTF-8 mode
1367     number the required bracket number
1368    
1369     Returns: pointer to the opcode for the bracket, or NULL if not found
1370     */
1371    
1372     static const uschar *
1373     find_bracket(const uschar *code, BOOL utf8, int number)
1374     {
1375     for (;;)
1376     {
1377     register int c = *code;
1378     if (c == OP_END) return NULL;
1379 nigel 91
1380     /* XCLASS is used for classes that cannot be represented just by a bit
1381     map. This includes negated single high-valued characters. The length in
1382     the table is zero; the actual length is stored in the compiled code. */
1383    
1384     if (c == OP_XCLASS) code += GET(code, 1);
1385    
1386 nigel 93 /* Handle capturing bracket */
1387 nigel 91
1388 nigel 93 else if (c == OP_CBRA)
1389 nigel 77 {
1390 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1391 nigel 77 if (n == number) return (uschar *)code;
1392 nigel 93 code += _pcre_OP_lengths[c];
1393 nigel 77 }
1394 nigel 91
1395 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1396     repeated character types, we have to test for \p and \P, which have an extra
1397 ph10 218 two bytes of parameters. */
1398 nigel 91
1399 nigel 77 else
1400     {
1401 ph10 218 switch(c)
1402     {
1403     case OP_TYPESTAR:
1404     case OP_TYPEMINSTAR:
1405     case OP_TYPEPLUS:
1406     case OP_TYPEMINPLUS:
1407     case OP_TYPEQUERY:
1408     case OP_TYPEMINQUERY:
1409     case OP_TYPEPOSSTAR:
1410     case OP_TYPEPOSPLUS:
1411     case OP_TYPEPOSQUERY:
1412     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1413 ph10 220 break;
1414 ph10 221
1415     case OP_TYPEUPTO:
1416     case OP_TYPEMINUPTO:
1417     case OP_TYPEEXACT:
1418     case OP_TYPEPOSUPTO:
1419     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1420     break;
1421 ph10 220 }
1422    
1423 ph10 218 /* Add in the fixed length from the table */
1424 ph10 220
1425 nigel 77 code += _pcre_OP_lengths[c];
1426 ph10 220
1427 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1428     a multi-byte character. The length in the table is a minimum, so we have to
1429     arrange to skip the extra bytes. */
1430 ph10 220
1431 ph10 107 #ifdef SUPPORT_UTF8
1432 nigel 77 if (utf8) switch(c)
1433     {
1434     case OP_CHAR:
1435     case OP_CHARNC:
1436     case OP_EXACT:
1437     case OP_UPTO:
1438     case OP_MINUPTO:
1439 nigel 93 case OP_POSUPTO:
1440 nigel 77 case OP_STAR:
1441     case OP_MINSTAR:
1442 nigel 93 case OP_POSSTAR:
1443 nigel 77 case OP_PLUS:
1444     case OP_MINPLUS:
1445 nigel 93 case OP_POSPLUS:
1446 nigel 77 case OP_QUERY:
1447     case OP_MINQUERY:
1448 nigel 93 case OP_POSQUERY:
1449     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1450 nigel 77 break;
1451     }
1452 ph10 111 #endif
1453 nigel 77 }
1454     }
1455     }
1456    
1457    
1458    
1459     /*************************************************
1460     * Scan compiled regex for recursion reference *
1461     *************************************************/
1462    
1463     /* This little function scans through a compiled pattern until it finds an
1464     instance of OP_RECURSE.
1465    
1466     Arguments:
1467     code points to start of expression
1468     utf8 TRUE in UTF-8 mode
1469    
1470     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1471     */
1472    
1473     static const uschar *
1474     find_recurse(const uschar *code, BOOL utf8)
1475     {
1476     for (;;)
1477     {
1478     register int c = *code;
1479     if (c == OP_END) return NULL;
1480 nigel 91 if (c == OP_RECURSE) return code;
1481 ph10 220
1482 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1483     map. This includes negated single high-valued characters. The length in
1484     the table is zero; the actual length is stored in the compiled code. */
1485    
1486     if (c == OP_XCLASS) code += GET(code, 1);
1487    
1488 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1489     repeated character types, we have to test for \p and \P, which have an extra
1490 ph10 218 two bytes of parameters. */
1491 nigel 91
1492 nigel 77 else
1493     {
1494 ph10 218 switch(c)
1495     {
1496     case OP_TYPESTAR:
1497     case OP_TYPEMINSTAR:
1498     case OP_TYPEPLUS:
1499     case OP_TYPEMINPLUS:
1500     case OP_TYPEQUERY:
1501     case OP_TYPEMINQUERY:
1502     case OP_TYPEPOSSTAR:
1503     case OP_TYPEPOSPLUS:
1504     case OP_TYPEPOSQUERY:
1505     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1506 ph10 220 break;
1507 ph10 221
1508     case OP_TYPEPOSUPTO:
1509     case OP_TYPEUPTO:
1510     case OP_TYPEMINUPTO:
1511     case OP_TYPEEXACT:
1512     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1513     break;
1514 ph10 220 }
1515    
1516 ph10 218 /* Add in the fixed length from the table */
1517    
1518 nigel 77 code += _pcre_OP_lengths[c];
1519 ph10 220
1520 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1521     by a multi-byte character. The length in the table is a minimum, so we have
1522     to arrange to skip the extra bytes. */
1523 ph10 220
1524 ph10 107 #ifdef SUPPORT_UTF8
1525 nigel 77 if (utf8) switch(c)
1526     {
1527     case OP_CHAR:
1528     case OP_CHARNC:
1529     case OP_EXACT:
1530     case OP_UPTO:
1531     case OP_MINUPTO:
1532 nigel 93 case OP_POSUPTO:
1533 nigel 77 case OP_STAR:
1534     case OP_MINSTAR:
1535 nigel 93 case OP_POSSTAR:
1536 nigel 77 case OP_PLUS:
1537     case OP_MINPLUS:
1538 nigel 93 case OP_POSPLUS:
1539 nigel 77 case OP_QUERY:
1540     case OP_MINQUERY:
1541 nigel 93 case OP_POSQUERY:
1542     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1543 nigel 77 break;
1544     }
1545 ph10 111 #endif
1546 nigel 77 }
1547     }
1548     }
1549    
1550    
1551    
1552     /*************************************************
1553     * Scan compiled branch for non-emptiness *
1554     *************************************************/
1555    
1556     /* This function scans through a branch of a compiled pattern to see whether it
1557 nigel 93 can match the empty string or not. It is called from could_be_empty()
1558     below and from compile_branch() when checking for an unlimited repeat of a
1559     group that can match nothing. Note that first_significant_code() skips over
1560 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1561     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1562     bracket whose current branch will already have been scanned.
1563 nigel 77
1564     Arguments:
1565     code points to start of search
1566     endcode points to where to stop
1567     utf8 TRUE if in UTF8 mode
1568    
1569     Returns: TRUE if what is matched could be empty
1570     */
1571    
1572     static BOOL
1573     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1574     {
1575     register int c;
1576 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1577 nigel 77 code < endcode;
1578     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1579     {
1580     const uschar *ccode;
1581    
1582     c = *code;
1583 ph10 286
1584     /* Skip over forward assertions; the other assertions are skipped by
1585 ph10 282 first_significant_code() with a TRUE final argument. */
1586 ph10 286
1587 ph10 282 if (c == OP_ASSERT)
1588 ph10 286 {
1589 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1590     c = *code;
1591     continue;
1592 ph10 286 }
1593 ph10 172
1594 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1595 nigel 77
1596 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1597 ph10 170 {
1598 ph10 172 code += _pcre_OP_lengths[c];
1599 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1600     c = *code;
1601     continue;
1602     }
1603    
1604     /* For other groups, scan the branches. */
1605 ph10 172
1606 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1607 nigel 77 {
1608     BOOL empty_branch;
1609     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1610    
1611     /* Scan a closed bracket */
1612    
1613     empty_branch = FALSE;
1614     do
1615     {
1616     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1617     empty_branch = TRUE;
1618     code += GET(code, 1);
1619     }
1620     while (*code == OP_ALT);
1621     if (!empty_branch) return FALSE; /* All branches are non-empty */
1622 ph10 172 c = *code;
1623 nigel 93 continue;
1624 nigel 77 }
1625    
1626 nigel 93 /* Handle the other opcodes */
1627    
1628     switch (c)
1629 nigel 77 {
1630 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1631     cannot be represented just by a bit map. This includes negated single
1632     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1633 ph10 220 actual length is stored in the compiled code, so we must update "code"
1634 ph10 216 here. */
1635 nigel 77
1636     #ifdef SUPPORT_UTF8
1637     case OP_XCLASS:
1638 ph10 216 ccode = code += GET(code, 1);
1639 nigel 77 goto CHECK_CLASS_REPEAT;
1640     #endif
1641    
1642     case OP_CLASS:
1643     case OP_NCLASS:
1644     ccode = code + 33;
1645    
1646     #ifdef SUPPORT_UTF8
1647     CHECK_CLASS_REPEAT:
1648     #endif
1649    
1650     switch (*ccode)
1651     {
1652     case OP_CRSTAR: /* These could be empty; continue */
1653     case OP_CRMINSTAR:
1654     case OP_CRQUERY:
1655     case OP_CRMINQUERY:
1656     break;
1657    
1658     default: /* Non-repeat => class must match */
1659     case OP_CRPLUS: /* These repeats aren't empty */
1660     case OP_CRMINPLUS:
1661     return FALSE;
1662    
1663     case OP_CRRANGE:
1664     case OP_CRMINRANGE:
1665     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1666     break;
1667     }
1668     break;
1669    
1670     /* Opcodes that must match a character */
1671    
1672     case OP_PROP:
1673     case OP_NOTPROP:
1674     case OP_EXTUNI:
1675     case OP_NOT_DIGIT:
1676     case OP_DIGIT:
1677     case OP_NOT_WHITESPACE:
1678     case OP_WHITESPACE:
1679     case OP_NOT_WORDCHAR:
1680     case OP_WORDCHAR:
1681     case OP_ANY:
1682     case OP_ANYBYTE:
1683     case OP_CHAR:
1684     case OP_CHARNC:
1685     case OP_NOT:
1686     case OP_PLUS:
1687     case OP_MINPLUS:
1688 nigel 93 case OP_POSPLUS:
1689 nigel 77 case OP_EXACT:
1690     case OP_NOTPLUS:
1691     case OP_NOTMINPLUS:
1692 nigel 93 case OP_NOTPOSPLUS:
1693 nigel 77 case OP_NOTEXACT:
1694     case OP_TYPEPLUS:
1695     case OP_TYPEMINPLUS:
1696 nigel 93 case OP_TYPEPOSPLUS:
1697 nigel 77 case OP_TYPEEXACT:
1698     return FALSE;
1699 ph10 227
1700     /* These are going to continue, as they may be empty, but we have to
1701     fudge the length for the \p and \P cases. */
1702    
1703 ph10 224 case OP_TYPESTAR:
1704     case OP_TYPEMINSTAR:
1705     case OP_TYPEPOSSTAR:
1706     case OP_TYPEQUERY:
1707     case OP_TYPEMINQUERY:
1708     case OP_TYPEPOSQUERY:
1709     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1710 ph10 227 break;
1711    
1712 ph10 224 /* Same for these */
1713 ph10 227
1714 ph10 224 case OP_TYPEUPTO:
1715     case OP_TYPEMINUPTO:
1716     case OP_TYPEPOSUPTO:
1717     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1718     break;
1719 nigel 77
1720     /* End of branch */
1721    
1722     case OP_KET:
1723     case OP_KETRMAX:
1724     case OP_KETRMIN:
1725     case OP_ALT:
1726     return TRUE;
1727    
1728 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1729     MINUPTO, and POSUPTO may be followed by a multibyte character */
1730 nigel 77
1731     #ifdef SUPPORT_UTF8
1732     case OP_STAR:
1733     case OP_MINSTAR:
1734 nigel 93 case OP_POSSTAR:
1735 nigel 77 case OP_QUERY:
1736     case OP_MINQUERY:
1737 nigel 93 case OP_POSQUERY:
1738 nigel 77 case OP_UPTO:
1739     case OP_MINUPTO:
1740 nigel 93 case OP_POSUPTO:
1741 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1742     break;
1743     #endif
1744     }
1745     }
1746    
1747     return TRUE;
1748     }
1749    
1750    
1751    
1752     /*************************************************
1753     * Scan compiled regex for non-emptiness *
1754     *************************************************/
1755    
1756     /* This function is called to check for left recursive calls. We want to check
1757     the current branch of the current pattern to see if it could match the empty
1758     string. If it could, we must look outwards for branches at other levels,
1759     stopping when we pass beyond the bracket which is the subject of the recursion.
1760    
1761     Arguments:
1762     code points to start of the recursion
1763     endcode points to where to stop (current RECURSE item)
1764     bcptr points to the chain of current (unclosed) branch starts
1765     utf8 TRUE if in UTF-8 mode
1766    
1767     Returns: TRUE if what is matched could be empty
1768     */
1769    
1770     static BOOL
1771     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1772     BOOL utf8)
1773     {
1774     while (bcptr != NULL && bcptr->current >= code)
1775     {
1776     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1777     bcptr = bcptr->outer;
1778     }
1779     return TRUE;
1780     }
1781    
1782    
1783    
1784     /*************************************************
1785     * Check for POSIX class syntax *
1786     *************************************************/
1787    
1788     /* This function is called when the sequence "[:" or "[." or "[=" is
1789 ph10 295 encountered in a character class. It checks whether this is followed by a
1790 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1791 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1792 nigel 77
1793 ph10 298 Originally, this function only recognized a sequence of letters between the
1794     terminators, but it seems that Perl recognizes any sequence of characters,
1795     though of course unknown POSIX names are subsequently rejected. Perl gives an
1796     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1797     didn't consider this to be a POSIX class. Likewise for [:1234:].
1798 ph10 295
1799 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1800     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1801     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1802     below handles the special case of \], but does not try to do any other escape
1803     processing. This makes it different from Perl for cases such as [:l\ower:]
1804 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1805 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1806 ph10 295 I think.
1807    
1808     Arguments:
1809 nigel 77 ptr pointer to the initial [
1810     endptr where to return the end pointer
1811    
1812     Returns: TRUE or FALSE
1813     */
1814    
1815     static BOOL
1816 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1817 nigel 77 {
1818     int terminator; /* Don't combine these lines; the Solaris cc */
1819     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1820 ph10 295 for (++ptr; *ptr != 0; ptr++)
1821 nigel 77 {
1822 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1823 ph10 298 {
1824     if (*ptr == ']') return FALSE;
1825 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1826     {
1827     *endptr = ptr;
1828     return TRUE;
1829 ph10 298 }
1830     }
1831     }
1832 nigel 77 return FALSE;
1833     }
1834    
1835    
1836    
1837    
1838     /*************************************************
1839     * Check POSIX class name *
1840     *************************************************/
1841    
1842     /* This function is called to check the name given in a POSIX-style class entry
1843     such as [:alnum:].
1844    
1845     Arguments:
1846     ptr points to the first letter
1847     len the length of the name
1848    
1849     Returns: a value representing the name, or -1 if unknown
1850     */
1851    
1852     static int
1853     check_posix_name(const uschar *ptr, int len)
1854     {
1855 ph10 240 const char *pn = posix_names;
1856 nigel 77 register int yield = 0;
1857     while (posix_name_lengths[yield] != 0)
1858     {
1859     if (len == posix_name_lengths[yield] &&
1860 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1861 ph10 243 pn += posix_name_lengths[yield] + 1;
1862 nigel 77 yield++;
1863     }
1864     return -1;
1865     }
1866    
1867    
1868     /*************************************************
1869     * Adjust OP_RECURSE items in repeated group *
1870     *************************************************/
1871    
1872     /* OP_RECURSE items contain an offset from the start of the regex to the group
1873     that is referenced. This means that groups can be replicated for fixed
1874     repetition simply by copying (because the recursion is allowed to refer to
1875     earlier groups that are outside the current group). However, when a group is
1876 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1877     inserted before it, after it has been compiled. This means that any OP_RECURSE
1878     items within it that refer to the group itself or any contained groups have to
1879     have their offsets adjusted. That one of the jobs of this function. Before it
1880     is called, the partially compiled regex must be temporarily terminated with
1881     OP_END.
1882 nigel 77
1883 nigel 93 This function has been extended with the possibility of forward references for
1884     recursions and subroutine calls. It must also check the list of such references
1885     for the group we are dealing with. If it finds that one of the recursions in
1886     the current group is on this list, it adjusts the offset in the list, not the
1887     value in the reference (which is a group number).
1888    
1889 nigel 77 Arguments:
1890     group points to the start of the group
1891     adjust the amount by which the group is to be moved
1892     utf8 TRUE in UTF-8 mode
1893     cd contains pointers to tables etc.
1894 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1895 nigel 77
1896     Returns: nothing
1897     */
1898    
1899     static void
1900 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1901     uschar *save_hwm)
1902 nigel 77 {
1903     uschar *ptr = group;
1904 ph10 224
1905 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1906     {
1907 nigel 93 int offset;
1908     uschar *hc;
1909    
1910     /* See if this recursion is on the forward reference list. If so, adjust the
1911     reference. */
1912 ph10 334
1913 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1914     {
1915     offset = GET(hc, 0);
1916     if (cd->start_code + offset == ptr + 1)
1917     {
1918     PUT(hc, 0, offset + adjust);
1919     break;
1920     }
1921     }
1922    
1923     /* Otherwise, adjust the recursion offset if it's after the start of this
1924     group. */
1925    
1926     if (hc >= cd->hwm)
1927     {
1928     offset = GET(ptr, 1);
1929     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1930     }
1931    
1932 nigel 77 ptr += 1 + LINK_SIZE;
1933     }
1934     }
1935    
1936    
1937    
1938     /*************************************************
1939     * Insert an automatic callout point *
1940     *************************************************/
1941    
1942     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1943     callout points before each pattern item.
1944    
1945     Arguments:
1946     code current code pointer
1947     ptr current pattern pointer
1948     cd pointers to tables etc
1949    
1950     Returns: new code pointer
1951     */
1952    
1953     static uschar *
1954     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1955     {
1956     *code++ = OP_CALLOUT;
1957     *code++ = 255;
1958     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1959     PUT(code, LINK_SIZE, 0); /* Default length */
1960     return code + 2*LINK_SIZE;
1961     }
1962    
1963    
1964    
1965     /*************************************************
1966     * Complete a callout item *
1967     *************************************************/
1968    
1969     /* A callout item contains the length of the next item in the pattern, which
1970     we can't fill in till after we have reached the relevant point. This is used
1971     for both automatic and manual callouts.
1972    
1973     Arguments:
1974     previous_callout points to previous callout item
1975     ptr current pattern pointer
1976     cd pointers to tables etc
1977    
1978     Returns: nothing
1979     */
1980    
1981     static void
1982     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1983     {
1984     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1985     PUT(previous_callout, 2 + LINK_SIZE, length);
1986     }
1987    
1988    
1989    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A returned range begins at the other case of the first
character that has one, and is extended for as long as the following
characters map to consecutive other-case values.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case. */

while (c <= d)
  {
  first_other = _pcre_ucp_othercase(c);
  if (first_other != NOTACHAR) break;
  c++;
  }

if (c > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the range while successive characters have successive other-case
values. */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2035    
2036    
2037 nigel 93
2038 nigel 77 /*************************************************
2039 nigel 93 * Check if auto-possessifying is possible *
2040     *************************************************/
2041    
2042     /* This function is called for unlimited repeats of certain items, to see
2043     whether the next thing could possibly match the repeated item. If not, it makes
2044     sense to automatically possessify the repeated item.
2045    
2046     Arguments:
2047     op_code the repeated op code
2048     this data for this item, depends on the opcode
2049     utf8 TRUE in UTF-8 mode
2050     utf8_char used for utf8 character bytes, NULL if not relevant
2051     ptr next character in pattern
2052     options options bits
2053     cd contains pointers to tables etc.
2054    
2055     Returns: TRUE if possessifying is wanted
2056     */
2057    
2058     static BOOL
2059     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2060     const uschar *ptr, int options, compile_data *cd)
2061     {
2062     int next;
2063    
2064     /* Skip whitespace and comments in extended mode */
2065    
2066     if ((options & PCRE_EXTENDED) != 0)
2067     {
2068     for (;;)
2069     {
2070     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2071     if (*ptr == '#')
2072     {
2073     while (*(++ptr) != 0)
2074     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2075     }
2076     else break;
2077     }
2078     }
2079    
2080     /* If the next item is one that we can handle, get its value. A non-negative
2081     value is a character, a negative value is an escape value. */
2082    
2083     if (*ptr == '\\')
2084     {
2085     int temperrorcode = 0;
2086     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2087     if (temperrorcode != 0) return FALSE;
2088     ptr++; /* Point after the escape sequence */
2089     }
2090    
2091     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2092     {
2093     #ifdef SUPPORT_UTF8
2094     if (utf8) { GETCHARINC(next, ptr); } else
2095     #endif
2096     next = *ptr++;
2097     }
2098    
2099     else return FALSE;
2100    
2101     /* Skip whitespace and comments in extended mode */
2102    
2103     if ((options & PCRE_EXTENDED) != 0)
2104     {
2105     for (;;)
2106     {
2107     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2108     if (*ptr == '#')
2109     {
2110     while (*(++ptr) != 0)
2111     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2112     }
2113     else break;
2114     }
2115     }
2116    
2117     /* If the next thing is itself optional, we have to give up. */
2118    
2119     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2120     return FALSE;
2121    
2122     /* Now compare the next item with the previous opcode. If the previous is a
2123     positive single character match, "item" either contains the character or, if
2124     "item" is greater than 127 in utf8 mode, the character's bytes are in
2125     utf8_char. */
2126    
2127    
2128     /* Handle cases when the next item is a character. */
2129    
2130     if (next >= 0) switch(op_code)
2131     {
2132     case OP_CHAR:
2133     #ifdef SUPPORT_UTF8
2134     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2135     #endif
2136     return item != next;
2137    
2138     /* For CHARNC (caseless character) we must check the other case. If we have
2139     Unicode property support, we can use it to test the other case of
2140     high-valued characters. */
2141    
2142     case OP_CHARNC:
2143     #ifdef SUPPORT_UTF8
2144     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2145     #endif
2146     if (item == next) return FALSE;
2147     #ifdef SUPPORT_UTF8
2148     if (utf8)
2149     {
2150     unsigned int othercase;
2151     if (next < 128) othercase = cd->fcc[next]; else
2152     #ifdef SUPPORT_UCP
2153     othercase = _pcre_ucp_othercase((unsigned int)next);
2154     #else
2155     othercase = NOTACHAR;
2156     #endif
2157     return (unsigned int)item != othercase;
2158     }
2159     else
2160     #endif /* SUPPORT_UTF8 */
2161     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2162    
2163     /* For OP_NOT, "item" must be a single-byte character. */
2164    
2165     case OP_NOT:
2166     if (item == next) return TRUE;
2167     if ((options & PCRE_CASELESS) == 0) return FALSE;
2168     #ifdef SUPPORT_UTF8
2169     if (utf8)
2170     {
2171     unsigned int othercase;
2172     if (next < 128) othercase = cd->fcc[next]; else
2173     #ifdef SUPPORT_UCP
2174     othercase = _pcre_ucp_othercase(next);
2175     #else
2176     othercase = NOTACHAR;
2177     #endif
2178     return (unsigned int)item == othercase;
2179     }
2180     else
2181     #endif /* SUPPORT_UTF8 */
2182     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2183    
2184     case OP_DIGIT:
2185     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2186    
2187     case OP_NOT_DIGIT:
2188     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2189    
2190     case OP_WHITESPACE:
2191     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2192    
2193     case OP_NOT_WHITESPACE:
2194     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2195    
2196     case OP_WORDCHAR:
2197     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2198    
2199     case OP_NOT_WORDCHAR:
2200     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2201    
2202 ph10 180 case OP_HSPACE:
2203     case OP_NOT_HSPACE:
2204     switch(next)
2205     {
2206     case 0x09:
2207     case 0x20:
2208     case 0xa0:
2209     case 0x1680:
2210     case 0x180e:
2211     case 0x2000:
2212     case 0x2001:
2213     case 0x2002:
2214     case 0x2003:
2215     case 0x2004:
2216     case 0x2005:
2217     case 0x2006:
2218     case 0x2007:
2219     case 0x2008:
2220     case 0x2009:
2221     case 0x200A:
2222     case 0x202f:
2223     case 0x205f:
2224     case 0x3000:
2225     return op_code != OP_HSPACE;
2226     default:
2227     return op_code == OP_HSPACE;
2228     }
2229    
2230     case OP_VSPACE:
2231     case OP_NOT_VSPACE:
2232     switch(next)
2233     {
2234     case 0x0a:
2235     case 0x0b:
2236     case 0x0c:
2237     case 0x0d:
2238     case 0x85:
2239     case 0x2028:
2240     case 0x2029:
2241     return op_code != OP_VSPACE;
2242     default:
2243     return op_code == OP_VSPACE;
2244     }
2245    
2246 nigel 93 default:
2247     return FALSE;
2248     }
2249    
2250    
2251     /* Handle the case when the next item is \d, \s, etc. */
2252    
2253     switch(op_code)
2254     {
2255     case OP_CHAR:
2256     case OP_CHARNC:
2257     #ifdef SUPPORT_UTF8
2258     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2259     #endif
2260     switch(-next)
2261     {
2262     case ESC_d:
2263     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2264    
2265     case ESC_D:
2266     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2267    
2268     case ESC_s:
2269     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2270    
2271     case ESC_S:
2272     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2273    
2274     case ESC_w:
2275     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2276    
2277     case ESC_W:
2278     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2279 ph10 182
2280 ph10 180 case ESC_h:
2281     case ESC_H:
2282     switch(item)
2283     {
2284     case 0x09:
2285     case 0x20:
2286     case 0xa0:
2287     case 0x1680:
2288     case 0x180e:
2289     case 0x2000:
2290     case 0x2001:
2291     case 0x2002:
2292     case 0x2003:
2293     case 0x2004:
2294     case 0x2005:
2295     case 0x2006:
2296     case 0x2007:
2297     case 0x2008:
2298     case 0x2009:
2299     case 0x200A:
2300     case 0x202f:
2301     case 0x205f:
2302     case 0x3000:
2303     return -next != ESC_h;
2304     default:
2305     return -next == ESC_h;
2306 ph10 182 }
2307    
2308 ph10 180 case ESC_v:
2309     case ESC_V:
2310     switch(item)
2311     {
2312     case 0x0a:
2313     case 0x0b:
2314     case 0x0c:
2315     case 0x0d:
2316     case 0x85:
2317     case 0x2028:
2318     case 0x2029:
2319     return -next != ESC_v;
2320     default:
2321     return -next == ESC_v;
2322 ph10 182 }
2323 nigel 93
2324     default:
2325     return FALSE;
2326     }
2327    
2328     case OP_DIGIT:
2329 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2330     next == -ESC_h || next == -ESC_v;
2331 nigel 93
2332     case OP_NOT_DIGIT:
2333     return next == -ESC_d;
2334    
2335     case OP_WHITESPACE:
2336     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2337    
2338     case OP_NOT_WHITESPACE:
2339 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2340 nigel 93
2341 ph10 180 case OP_HSPACE:
2342     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2343    
2344     case OP_NOT_HSPACE:
2345     return next == -ESC_h;
2346 ph10 182
2347 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2348 ph10 182 case OP_VSPACE:
2349 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2350    
2351     case OP_NOT_VSPACE:
2352 ph10 182 return next == -ESC_v;
2353 ph10 180
2354 nigel 93 case OP_WORDCHAR:
2355 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2356 nigel 93
2357     case OP_NOT_WORDCHAR:
2358     return next == -ESC_w || next == -ESC_d;
2359 ph10 182
2360 nigel 93 default:
2361     return FALSE;
2362     }
2363    
2364     /* Control does not reach here */
2365     }
2366    
2367    
2368    
2369     /*************************************************
2370 nigel 77 * Compile one branch *
2371     *************************************************/
2372    
2373 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2374 nigel 77 changed during the branch, the pointer is used to change the external options
2375 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2376     to find out the amount of memory needed, as well as during the real compile
2377     phase. The value of lengthptr distinguishes the two phases.
2378 nigel 77
2379     Arguments:
2380     optionsptr pointer to the option bits
2381     codeptr points to the pointer to the current code point
2382     ptrptr points to the current pattern pointer
2383     errorcodeptr points to error code variable
2384     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2385     reqbyteptr set to the last literal character required, else < 0
2386     bcptr points to current branch chain
2387     cd contains pointers to tables etc.
2388 nigel 93 lengthptr NULL during the real compile phase
2389     points to length accumulator during pre-compile phase
2390 nigel 77
2391     Returns: TRUE on success
2392     FALSE, with *errorcodeptr set non-zero on error
2393     */
2394    
2395     static BOOL
2396 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2397     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2398     compile_data *cd, int *lengthptr)
2399 nigel 77 {
2400     int repeat_type, op_type;
2401     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2402     int bravalue = 0;
2403     int greedy_default, greedy_non_default;
2404     int firstbyte, reqbyte;
2405     int zeroreqbyte, zerofirstbyte;
2406     int req_caseopt, reqvary, tempreqvary;
2407     int options = *optionsptr;
2408     int after_manual_callout = 0;
2409 nigel 93 int length_prevgroup = 0;
2410 nigel 77 register int c;
2411     register uschar *code = *codeptr;
2412 nigel 93 uschar *last_code = code;
2413     uschar *orig_code = code;
2414 nigel 77 uschar *tempcode;
2415     BOOL inescq = FALSE;
2416     BOOL groupsetfirstbyte = FALSE;
2417     const uschar *ptr = *ptrptr;
2418     const uschar *tempptr;
2419     uschar *previous = NULL;
2420     uschar *previous_callout = NULL;
2421 nigel 93 uschar *save_hwm = NULL;
2422 nigel 77 uschar classbits[32];
2423    
2424     #ifdef SUPPORT_UTF8
2425     BOOL class_utf8;
2426     BOOL utf8 = (options & PCRE_UTF8) != 0;
2427     uschar *class_utf8data;
2428 ph10 300 uschar *class_utf8data_base;
2429 nigel 77 uschar utf8_char[6];
2430     #else
2431     BOOL utf8 = FALSE;
2432 nigel 93 uschar *utf8_char = NULL;
2433 nigel 77 #endif
2434    
2435 nigel 93 #ifdef DEBUG
2436     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2437     #endif
2438    
2439 nigel 77 /* Set up the default and non-default settings for greediness */
2440    
2441     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2442     greedy_non_default = greedy_default ^ 1;
2443    
2444     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2445     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2446     matches a non-fixed char first char; reqbyte just remains unset if we never
2447     find one.
2448    
2449     When we hit a repeat whose minimum is zero, we may have to adjust these values
2450     to take the zero repeat into account. This is implemented by setting them to
2451     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2452     item types that can be repeated set these backoff variables appropriately. */
2453    
2454     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2455    
2456     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2457     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2458     value > 255. It is added into the firstbyte or reqbyte variables to record the
2459     case status of the value. This is used only for ASCII characters. */
2460    
2461     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2462    
2463     /* Switch on next character until the end of the branch */
2464    
2465     for (;; ptr++)
2466     {
2467     BOOL negate_class;
2468 ph10 286 BOOL should_flip_negation;
2469 nigel 77 BOOL possessive_quantifier;
2470     BOOL is_quantifier;
2471 nigel 93 BOOL is_recurse;
2472 ph10 180 BOOL reset_bracount;
2473 nigel 77 int class_charcount;
2474     int class_lastchar;
2475     int newoptions;
2476     int recno;
2477 ph10 172 int refsign;
2478 nigel 77 int skipbytes;
2479     int subreqbyte;
2480     int subfirstbyte;
2481 nigel 93 int terminator;
2482 nigel 77 int mclength;
2483     uschar mcbuffer[8];
2484    
2485 nigel 93 /* Get next byte in the pattern */
2486 nigel 77
2487     c = *ptr;
2488 ph10 334
2489 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2490     previous cycle of this loop. */
2491    
2492     if (lengthptr != NULL)
2493     {
2494     #ifdef DEBUG
2495     if (code > cd->hwm) cd->hwm = code; /* High water info */
2496     #endif
2497     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2498     {
2499     *errorcodeptr = ERR52;
2500     goto FAILED;
2501     }
2502    
2503     /* There is at least one situation where code goes backwards: this is the
2504     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2505     the class is simply eliminated. However, it is created first, so we have to
2506     allow memory for it. Therefore, don't ever reduce the length at this point.
2507     */
2508    
2509     if (code < last_code) code = last_code;
2510 ph10 202
2511     /* Paranoid check for integer overflow */
2512    
2513     if (OFLOW_MAX - *lengthptr < code - last_code)
2514     {
2515     *errorcodeptr = ERR20;
2516     goto FAILED;
2517     }
2518    
2519 nigel 93 *lengthptr += code - last_code;
2520     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2521    
2522     /* If "previous" is set and it is not at the start of the work space, move
2523     it back to there, in order to avoid filling up the work space. Otherwise,
2524     if "previous" is NULL, reset the current code pointer to the start. */
2525    
2526     if (previous != NULL)
2527     {
2528     if (previous > orig_code)
2529     {
2530     memmove(orig_code, previous, code - previous);
2531     code -= previous - orig_code;
2532     previous = orig_code;
2533     }
2534     }
2535     else code = orig_code;
2536    
2537     /* Remember where this code item starts so we can pick up the length
2538     next time round. */
2539    
2540     last_code = code;
2541     }
2542    
2543     /* In the real compile phase, just check the workspace used by the forward
2544     reference list. */
2545    
2546     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2547     {
2548     *errorcodeptr = ERR52;
2549     goto FAILED;
2550     }
2551    
2552 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2553    
2554     if (inescq && c != 0)
2555     {
2556     if (c == '\\' && ptr[1] == 'E')
2557     {
2558     inescq = FALSE;
2559     ptr++;
2560     continue;
2561     }
2562     else
2563     {
2564     if (previous_callout != NULL)
2565     {
2566 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2567     complete_callout(previous_callout, ptr, cd);
2568 nigel 77 previous_callout = NULL;
2569     }
2570     if ((options & PCRE_AUTO_CALLOUT) != 0)
2571     {
2572     previous_callout = code;
2573     code = auto_callout(code, ptr, cd);
2574     }
2575     goto NORMAL_CHAR;
2576     }
2577     }
2578    
2579     /* Fill in length of a previous callout, except when the next thing is
2580     a quantifier. */
2581    
2582     is_quantifier = c == '*' || c == '+' || c == '?' ||
2583     (c == '{' && is_counted_repeat(ptr+1));
2584    
2585     if (!is_quantifier && previous_callout != NULL &&
2586     after_manual_callout-- <= 0)
2587     {
2588 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2589     complete_callout(previous_callout, ptr, cd);
2590 nigel 77 previous_callout = NULL;
2591     }
2592    
2593     /* In extended mode, skip white space and comments */
2594    
2595     if ((options & PCRE_EXTENDED) != 0)
2596     {
2597     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2598     if (c == '#')
2599     {
2600 nigel 93 while (*(++ptr) != 0)
2601 nigel 91 {
2602 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2603 nigel 91 }
2604 nigel 93 if (*ptr != 0) continue;
2605    
2606 nigel 91 /* Else fall through to handle end of string */
2607     c = 0;
2608 nigel 77 }
2609     }
2610    
2611     /* No auto callout for quantifiers. */
2612    
2613     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2614     {
2615     previous_callout = code;
2616     code = auto_callout(code, ptr, cd);
2617     }
2618    
2619     switch(c)
2620     {
2621 nigel 93 /* ===================================================================*/
2622     case 0: /* The branch terminates at string end */
2623     case '|': /* or | or ) */
2624 nigel 77 case ')':
2625     *firstbyteptr = firstbyte;
2626     *reqbyteptr = reqbyte;
2627     *codeptr = code;
2628     *ptrptr = ptr;
2629 nigel 93 if (lengthptr != NULL)
2630     {
2631 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2632     {
2633     *errorcodeptr = ERR20;
2634     goto FAILED;
2635     }
2636 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2637     DPRINTF((">> end branch\n"));
2638     }
2639 nigel 77 return TRUE;
2640    
2641 nigel 93
2642     /* ===================================================================*/
2643 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2644     the setting of any following char as a first character. */
2645    
2646     case '^':
2647     if ((options & PCRE_MULTILINE) != 0)
2648     {
2649     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2650     }
2651     previous = NULL;
2652     *code++ = OP_CIRC;
2653     break;
2654    
2655     case '$':
2656     previous = NULL;
2657     *code++ = OP_DOLL;
2658     break;
2659    
2660     /* There can never be a first char if '.' is first, whatever happens about
2661     repeats. The value of reqbyte doesn't change either. */
2662    
2663     case '.':
2664     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2665     zerofirstbyte = firstbyte;
2666     zeroreqbyte = reqbyte;
2667     previous = code;
2668     *code++ = OP_ANY;
2669     break;
2670    
2671 nigel 93
2672     /* ===================================================================*/
2673 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2674     32-byte bitmap of the permitted characters, except in the special case
2675     where there is only one such character. For negated classes, we build the
2676     map as usual, then invert it at the end. However, we use a different opcode
2677     so that data characters > 255 can be handled correctly.
2678 nigel 77
2679     If the class contains characters outside the 0-255 range, a different
2680     opcode is compiled. It may optionally have a bit map for characters < 256,
2681     but those above are explicitly listed afterwards. A flag byte tells
2682     whether the bitmap is present, and whether this is a negated class or not.
2683 ph10 336
2684     In JavaScript compatibility mode, an isolated ']' causes an error. In
2685     default (Perl) mode, it is treated as a data character. */
2686    
2687     case ']':
2688     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2689     {
2690     *errorcodeptr = ERR64;
2691     goto FAILED;
2692     }
2693     goto NORMAL_CHAR;
2694 nigel 77
2695     case '[':
2696     previous = code;
2697    
2698     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2699     they are encountered at the top level, so we'll do that too. */
2700    
2701     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2702 ph10 295 check_posix_syntax(ptr, &tempptr))
2703 nigel 77 {
2704     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2705     goto FAILED;
2706     }
2707    
2708 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2709 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2710 ph10 205 skip them too. This makes for compatibility with Perl. */
2711 ph10 208
2712 ph10 205 negate_class = FALSE;
2713     for (;;)
2714 nigel 77 {
2715     c = *(++ptr);
2716 ph10 205 if (c == '\\')
2717     {
2718 ph10 208 if (ptr[1] == 'E') ptr++;
2719 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2720 ph10 208 else break;
2721 ph10 205 }
2722     else if (!negate_class && c == '^')
2723     negate_class = TRUE;
2724     else break;
2725 ph10 208 }
2726 ph10 341
2727     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2728     an initial ']' is taken as a data character -- the code below handles
2729     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2730     [^] must match any character, so generate OP_ALLANY. */
2731    
2732     if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2733     {
2734     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2735     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2736     zerofirstbyte = firstbyte;
2737     break;
2738     }
2739 nigel 77
2740 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2741     negation flag at the end, so that support for characters > 255 works
2742 ph10 264 correctly (they are all included in the class). */
2743    
2744     should_flip_negation = FALSE;
2745    
2746 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2747 nigel 93 of just a single character (as long as it's < 256). However, for higher
2748     valued UTF-8 characters, we don't yet do any optimization. */
2749 nigel 77
2750     class_charcount = 0;
2751     class_lastchar = -1;
2752    
2753 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2754     temporary bit of memory, in case the class contains only 1 character (less
2755     than 256), because in that case the compiled code doesn't use the bit map.
2756     */
2757    
2758     memset(classbits, 0, 32 * sizeof(uschar));
2759    
2760 nigel 77 #ifdef SUPPORT_UTF8
2761     class_utf8 = FALSE; /* No chars >= 256 */
2762 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2763 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2764 nigel 77 #endif
2765    
2766     /* Process characters until ] is reached. By writing this as a "do" it
2767 nigel 93 means that an initial ] is taken as a data character. At the start of the
2768     loop, c contains the first byte of the character. */
2769 nigel 77
2770 nigel 93 if (c != 0) do
2771 nigel 77 {
2772 nigel 93 const uschar *oldptr;
2773    
2774 nigel 77 #ifdef SUPPORT_UTF8
2775     if (utf8 && c > 127)
2776     { /* Braces are required because the */
2777     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2778     }
2779 ph10 309
2780 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2781 ph10 309 data and reset the pointer. This is so that very large classes that
2782 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2783 ph10 309 (which is on the stack). */
2784    
2785 ph10 300 if (lengthptr != NULL)
2786     {
2787     *lengthptr += class_utf8data - class_utf8data_base;
2788 ph10 309 class_utf8data = class_utf8data_base;
2789     }
2790    
2791 nigel 77 #endif
2792    
2793     /* Inside \Q...\E everything is literal except \E */
2794    
2795     if (inescq)
2796     {
2797 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2798 nigel 77 {
2799 nigel 93 inescq = FALSE; /* Reset literal state */
2800     ptr++; /* Skip the 'E' */
2801     continue; /* Carry on with next */
2802 nigel 77 }
2803 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2804 nigel 77 }
2805    
2806     /* Handle POSIX class names. Perl allows a negation extension of the
2807     form [:^name:]. A square bracket that doesn't match the syntax is
2808     treated as a literal. We also recognize the POSIX constructions
2809     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2810     5.6 and 5.8 do. */
2811    
2812     if (c == '[' &&
2813     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2814 ph10 295 check_posix_syntax(ptr, &tempptr))
2815 nigel 77 {
2816     BOOL local_negate = FALSE;
2817 nigel 87 int posix_class, taboffset, tabopt;
2818 nigel 77 register const uschar *cbits = cd->cbits;
2819 nigel 87 uschar pbits[32];
2820 nigel 77
2821     if (ptr[1] != ':')
2822     {
2823     *errorcodeptr = ERR31;
2824     goto FAILED;
2825     }
2826    
2827     ptr += 2;
2828     if (*ptr == '^')
2829     {
2830     local_negate = TRUE;
2831 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2832 nigel 77 ptr++;
2833     }
2834    
2835     posix_class = check_posix_name(ptr, tempptr - ptr);
2836     if (posix_class < 0)
2837     {
2838     *errorcodeptr = ERR30;
2839     goto FAILED;
2840     }
2841    
2842     /* If matching is caseless, upper and lower are converted to
2843     alpha. This relies on the fact that the class table starts with
2844     alpha, lower, upper as the first 3 entries. */
2845    
2846     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2847     posix_class = 0;
2848    
2849 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2850     because we may be adding and subtracting from it, and we don't want to
2851     subtract bits that may be in the main map already. At the end we or the
2852     result into the bit map that is being built. */
2853 nigel 77
2854     posix_class *= 3;
2855 nigel 87
2856     /* Copy in the first table (always present) */
2857    
2858     memcpy(pbits, cbits + posix_class_maps[posix_class],
2859     32 * sizeof(uschar));
2860    
2861     /* If there is a second table, add or remove it as required. */
2862    
2863     taboffset = posix_class_maps[posix_class + 1];
2864     tabopt = posix_class_maps[posix_class + 2];
2865    
2866     if (taboffset >= 0)
2867 nigel 77 {
2868 nigel 87 if (tabopt >= 0)
2869     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2870 nigel 77 else
2871 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2872 nigel 77 }
2873    
2874 nigel 87 /* Now see if we need to remove any special characters. An option
2875     value of 1 removes vertical space and 2 removes underscore. */
2876    
2877     if (tabopt < 0) tabopt = -tabopt;
2878     if (tabopt == 1) pbits[1] &= ~0x3c;
2879     else if (tabopt == 2) pbits[11] &= 0x7f;
2880    
2881     /* Add the POSIX table or its complement into the main table that is
2882     being built and we are done. */
2883    
2884     if (local_negate)
2885     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2886     else
2887     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2888    
2889 nigel 77 ptr = tempptr + 1;
2890     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2891     continue; /* End of POSIX syntax handling */
2892     }
2893    
2894     /* Backslash may introduce a single character, or it may introduce one
2895 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2896     case. Inside a class (and only there) it is treated as backspace.
2897     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2898 ph10 205 to 'or' into the one we are building. We assume they have more than one
2899 nigel 77 character in them, so set class_charcount bigger than one. */
2900    
2901     if (c == '\\')
2902     {
2903 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2904     if (*errorcodeptr != 0) goto FAILED;
2905 nigel 77
2906 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2907 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2908 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2909 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2910     {
2911     if (ptr[1] == '\\' && ptr[2] == 'E')
2912     {
2913     ptr += 2; /* avoid empty string */
2914     }
2915     else inescq = TRUE;
2916     continue;
2917     }
2918 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2919 nigel 77
2920     if (c < 0)
2921     {
2922     register const uschar *cbits = cd->cbits;
2923     class_charcount += 2; /* Greater than 1 is what matters */
2924 nigel 93
2925     /* Save time by not doing this in the pre-compile phase. */
2926    
2927     if (lengthptr == NULL) switch (-c)
2928 nigel 77 {
2929     case ESC_d:
2930     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2931     continue;
2932    
2933     case ESC_D:
2934 ph10 286 should_flip_negation = TRUE;
2935 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2936     continue;
2937    
2938     case ESC_w:
2939     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2940     continue;
2941    
2942     case ESC_W:
2943 ph10 286 should_flip_negation = TRUE;
2944 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2945     continue;
2946    
2947     case ESC_s:
2948     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2949     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2950     continue;
2951    
2952     case ESC_S:
2953 ph10 286 should_flip_negation = TRUE;
2954 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2955     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2956     continue;
2957    
2958 nigel 93 default: /* Not recognized; fall through */
2959     break; /* Need "default" setting to stop compiler warning. */
2960     }
2961    
2962     /* In the pre-compile phase, just do the recognition. */
2963    
2964     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2965     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2966 ph10 180
2967 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2968     they use extra memory. */
2969 ph10 180
2970 ph10 178 if (-c == ESC_h)
2971     {
2972     SETBIT(classbits, 0x09); /* VT */
2973     SETBIT(classbits, 0x20); /* SPACE */
2974 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2975 ph10 178 #ifdef SUPPORT_UTF8
2976     if (utf8)
2977 ph10 180 {
2978 ph10 178 class_utf8 = TRUE;
2979     *class_utf8data++ = XCL_SINGLE;
2980 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2981 ph10 178 *class_utf8data++ = XCL_SINGLE;
2982 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2983     *class_utf8data++ = XCL_RANGE;
2984     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2985     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2986 ph10 178 *class_utf8data++ = XCL_SINGLE;
2987 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2988 ph10 178 *class_utf8data++ = XCL_SINGLE;
2989 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2990 ph10 178 *class_utf8data++ = XCL_SINGLE;
2991 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2992     }
2993     #endif
2994     continue;
2995     }
2996 nigel 93
2997 ph10 178 if (-c == ESC_H)
2998     {
2999     for (c = 0; c < 32; c++)
3000     {
3001     int x = 0xff;
3002     switch (c)
3003 ph10 180 {
3004 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3005     case 0x20/8: x ^= 1 << (0x20%8); break;
3006     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3007     default: break;
3008     }
3009     classbits[c] |= x;
3010 ph10 180 }
3011    
3012 ph10 178 #ifdef SUPPORT_UTF8
3013     if (utf8)
3014 ph10 180 {
3015 ph10 178 class_utf8 = TRUE;
3016 ph10 180 *class_utf8data++ = XCL_RANGE;
3017     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3018     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3019     *class_utf8data++ = XCL_RANGE;
3020     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3021     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3022     *class_utf8data++ = XCL_RANGE;
3023     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3024     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3025     *class_utf8data++ = XCL_RANGE;
3026     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3027     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3028     *class_utf8data++ = XCL_RANGE;
3029     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3030     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3031     *class_utf8data++ = XCL_RANGE;
3032     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3033     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3034     *class_utf8data++ = XCL_RANGE;
3035     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3036     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3037     }
3038     #endif
3039     continue;
3040     }
3041 ph10 178
3042     if (-c == ESC_v)
3043     {
3044     SETBIT(classbits, 0x0a); /* LF */
3045     SETBIT(classbits, 0x0b); /* VT */
3046 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3047     SETBIT(classbits, 0x0d); /* CR */
3048     SETBIT(classbits, 0x85); /* NEL */
3049 ph10 178 #ifdef SUPPORT_UTF8
3050     if (utf8)
3051 ph10 180 {
3052 ph10 178 class_utf8 = TRUE;
3053 ph10 180 *class_utf8data++ = XCL_RANGE;
3054     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3055     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3056     }
3057     #endif
3058     continue;
3059     }
3060 ph10 178
3061     if (-c == ESC_V)
3062     {
3063     for (c = 0; c < 32; c++)
3064     {
3065     int x = 0xff;
3066     switch (c)
3067 ph10 180 {
3068 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3069     x ^= 1 << (0x0b%8);
3070     x ^= 1 << (0x0c%8);
3071 ph10 180 x ^= 1 << (0x0d%8);
3072 ph10 178 break;
3073     case 0x85/8: x ^= 1 << (0x85%8); break;
3074     default: break;
3075     }
3076     classbits[c] |= x;
3077 ph10 180 }
3078    
3079 ph10 178 #ifdef SUPPORT_UTF8
3080     if (utf8)
3081 ph10 180 {
3082 ph10 178 class_utf8 = TRUE;
3083 ph10 180 *class_utf8data++ = XCL_RANGE;
3084     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3085     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3086     *class_utf8data++ = XCL_RANGE;
3087     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3088     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3089     }
3090     #endif
3091     continue;
3092     }
3093 ph10 178
3094 nigel 93 /* We need to deal with \P and \p in both phases. */
3095    
3096 nigel 77 #ifdef SUPPORT_UCP
3097 nigel 93 if (-c == ESC_p || -c == ESC_P)
3098     {
3099     BOOL negated;
3100     int pdata;
3101     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3102     if (ptype < 0) goto FAILED;
3103     class_utf8 = TRUE;
3104     *class_utf8data++ = ((-c == ESC_p) != negated)?
3105     XCL_PROP : XCL_NOTPROP;
3106     *class_utf8data++ = ptype;
3107     *class_utf8data++ = pdata;
3108     class_charcount -= 2; /* Not a < 256 character */
3109 nigel 77 continue;
3110 nigel 93 }
3111 nigel 77 #endif
3112 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3113     strict mode. By default, for compatibility with Perl, they are
3114     treated as literals. */
3115 nigel 77
3116 nigel 93 if ((options & PCRE_EXTRA) != 0)
3117     {
3118     *errorcodeptr = ERR7;
3119     goto FAILED;
3120     }
3121 nigel 77
3122 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3123     c = *ptr; /* Get the final character and fall through */
3124 nigel 77 }
3125    
3126     /* Fall through if we have a single character (c >= 0). This may be
3127 nigel 93 greater than 256 in UTF-8 mode. */
3128 nigel 77
3129     } /* End of backslash handling */
3130    
3131     /* A single character may be followed by '-' to form a range. However,
3132     Perl does not permit ']' to be the end of the range. A '-' character
3133 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3134     entirely. The code for handling \Q and \E is messy. */
3135 nigel 77
3136 nigel 93 CHECK_RANGE:
3137     while (ptr[1] == '\\' && ptr[2] == 'E')
3138 nigel 77 {
3139 nigel 93 inescq = FALSE;
3140     ptr += 2;
3141     }
3142    
3143     oldptr = ptr;
3144 ph10 231
3145 ph10 230 /* Remember \r or \n */
3146 ph10 231
3147     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3148    
3149 ph10 230 /* Check for range */
3150 nigel 93
3151     if (!inescq && ptr[1] == '-')
3152     {
3153 nigel 77 int d;
3154     ptr += 2;
3155 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3156 nigel 77
3157 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3158     mode. */
3159    
3160     while (*ptr == '\\' && ptr[1] == 'Q')
3161     {
3162     ptr += 2;
3163     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3164     inescq = TRUE;
3165     break;
3166     }
3167    
3168     if (*ptr == 0 || (!inescq && *ptr == ']'))
3169     {
3170     ptr = oldptr;
3171     goto LONE_SINGLE_CHARACTER;
3172     }
3173    
3174 nigel 77 #ifdef SUPPORT_UTF8
3175     if (utf8)
3176     { /* Braces are required because the */
3177     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3178     }
3179     else
3180     #endif
3181     d = *ptr; /* Not UTF-8 mode */
3182    
3183     /* The second part of a range can be a single-character escape, but
3184     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3185     in such circumstances. */
3186    
3187 nigel 93 if (!inescq && d == '\\')
3188 nigel 77 {
3189 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3190     if (*errorcodeptr != 0) goto FAILED;
3191 nigel 77
3192 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3193 nigel 93 special means the '-' was literal */
3194 nigel 77
3195     if (d < 0)
3196     {
3197     if (d == -ESC_b) d = '\b';
3198 nigel 93 else if (d == -ESC_X) d = 'X';
3199     else if (d == -ESC_R) d = 'R'; else
3200 nigel 77 {
3201 nigel 93 ptr = oldptr;
3202 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3203     }
3204     }
3205     }
3206    
3207 nigel 93 /* Check that the two values are in the correct order. Optimize
3208     one-character ranges */
3209 nigel 77
3210 nigel 93 if (d < c)
3211     {
3212     *errorcodeptr = ERR8;
3213     goto FAILED;
3214     }
3215    
3216 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3217    
3218 ph10 230 /* Remember \r or \n */
3219 ph10 231
3220     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3221    
3222 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3223     matching, we have to use an XCLASS with extra data items. Caseless
3224     matching for characters > 127 is available only if UCP support is
3225     available. */
3226    
3227     #ifdef SUPPORT_UTF8
3228     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3229     {
3230     class_utf8 = TRUE;
3231    
3232     /* With UCP support, we can find the other case equivalents of
3233     the relevant characters. There may be several ranges. Optimize how
3234     they fit with the basic range. */
3235    
3236     #ifdef SUPPORT_UCP
3237     if ((options & PCRE_CASELESS) != 0)
3238     {
3239 nigel 93 unsigned int occ, ocd;
3240     unsigned int cc = c;
3241     unsigned int origd = d;
3242 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3243     {
3244 ph10 180 if (occ >= (unsigned int)c &&
3245     ocd <= (unsigned int)d)
3246 ph10 176 continue; /* Skip embedded ranges */
3247 nigel 77
3248 ph10 180 if (occ < (unsigned int)c &&
3249 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3250 nigel 77 { /* if there is overlap, */
3251     c = occ; /* noting that if occ < c */
3252     continue; /* we can't have ocd > d */
3253     } /* because a subrange is */
3254 ph10 180 if (ocd > (unsigned int)d &&
3255 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3256 nigel 77 { /* the basic range. */
3257     d = ocd;
3258     continue;
3259     }
3260    
3261     if (occ == ocd)
3262     {
3263     *class_utf8data++ = XCL_SINGLE;
3264     }
3265     else
3266     {
3267     *class_utf8data++ = XCL_RANGE;
3268     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3269     }
3270     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3271     }
3272     }
3273     #endif /* SUPPORT_UCP */
3274    
3275     /* Now record the original range, possibly modified for UCP caseless
3276     overlapping ranges. */
3277    
3278     *class_utf8data++ = XCL_RANGE;
3279     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3280     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3281    
3282     /* With UCP support, we are done. Without UCP support, there is no
3283     caseless matching for UTF-8 characters > 127; we can use the bit map
3284     for the smaller ones. */
3285    
3286     #ifdef SUPPORT_UCP
3287     continue; /* With next character in the class */
3288     #else
3289     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3290    
3291     /* Adjust upper limit and fall through to set up the map */
3292    
3293     d = 127;
3294    
3295     #endif /* SUPPORT_UCP */
3296     }
3297     #endif /* SUPPORT_UTF8 */
3298    
3299     /* We use the bit map for all cases when not in UTF-8 mode; else
3300     ranges that lie entirely within 0-127 when there is UCP support; else
3301     for partial ranges without UCP support. */
3302    
3303 nigel 93 class_charcount += d - c + 1;
3304     class_lastchar = d;
3305    
3306     /* We can save a bit of time by skipping this in the pre-compile. */
3307    
3308     if (lengthptr == NULL) for (; c <= d; c++)
3309 nigel 77 {
3310     classbits[c/8] |= (1 << (c&7));
3311     if ((options & PCRE_CASELESS) != 0)
3312     {
3313     int uc = cd->fcc[c]; /* flip case */
3314     classbits[uc/8] |= (1 << (uc&7));
3315     }
3316     }
3317    
3318     continue; /* Go get the next char in the class */
3319     }
3320    
3321     /* Handle a lone single character - we can get here for a normal
3322     non-escape char, or after \ that introduces a single character or for an
3323     apparent range that isn't. */
3324    
3325     LONE_SINGLE_CHARACTER:
3326 ph10 231
3327 nigel 77 /* Handle a character that cannot go in the bit map */
3328    
3329     #ifdef SUPPORT_UTF8
3330     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3331     {
3332     class_utf8 = TRUE;
3333     *class_utf8data++ = XCL_SINGLE;
3334     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3335    
3336     #ifdef SUPPORT_UCP
3337     if ((options & PCRE_CASELESS) != 0)
3338     {
3339 nigel 93 unsigned int othercase;
3340     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3341 nigel 77 {
3342     *class_utf8data++ = XCL_SINGLE;
3343     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3344     }
3345     }
3346     #endif /* SUPPORT_UCP */
3347    
3348     }
3349     else
3350     #endif /* SUPPORT_UTF8 */
3351    
3352     /* Handle a single-byte character */
3353     {
3354     classbits[c/8] |= (1 << (c&7));
3355     if ((options & PCRE_CASELESS) != 0)
3356     {
3357     c = cd->fcc[c]; /* flip case */
3358     classbits[c/8] |= (1 << (c&7));
3359     }
3360     class_charcount++;
3361     class_lastchar = c;
3362     }
3363     }
3364    
3365 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3366 nigel 77
3367 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3368 nigel 77
3369 nigel 93 if (c == 0) /* Missing terminating ']' */
3370     {
3371     *errorcodeptr = ERR6;
3372     goto FAILED;
3373     }
3374 ph10 231
3375    
3376 ph10 230 /* This code has been disabled because it would mean that \s counts as
3377     an explicit \r or \n reference, and that's not really what is wanted. Now
3378     we set the flag only if there is a literal "\r" or "\n" in the class. */
3379 ph10 227
3380 ph10 230 #if 0
3381 ph10 226 /* Remember whether \r or \n are in this class */
3382 ph10 227
3383 ph10 226 if (negate_class)
3384     {
3385 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3386 ph10 226 }
3387     else
3388     {
3389 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3390 ph10 227 }
3391 ph10 230 #endif
3392 ph10 227
3393 ph10 231
3394 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3395 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3396     use of \p or \P, in other words, no use of any XCLASS features, we can
3397     optimize.
3398    
3399 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3400     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3401     operate on single-bytes only. This is an historical hangover. Maybe one day
3402     we can tidy these opcodes to handle multi-byte characters.
3403 nigel 77
3404     The optimization throws away the bit map. We turn the item into a
3405     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3406     that OP_NOT does not support multibyte characters. In the positive case, it
3407     can cause firstbyte to be set. Otherwise, there can be no first char if
3408     this item is first, whatever repeat count may follow. In the case of
3409     reqbyte, save the previous value for reinstating. */
3410    
3411     #ifdef SUPPORT_UTF8
3412 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3413 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3414 nigel 77 #else
3415     if (class_charcount == 1)
3416     #endif
3417     {
3418     zeroreqbyte = reqbyte;
3419    
3420     /* The OP_NOT opcode works on one-byte characters only. */
3421    
3422     if (negate_class)
3423     {
3424     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3425     zerofirstbyte = firstbyte;
3426     *code++ = OP_NOT;
3427     *code++ = class_lastchar;
3428     break;
3429     }
3430    
3431     /* For a single, positive character, get the value into mcbuffer, and
3432     then we can handle this with the normal one-character code. */
3433    
3434     #ifdef SUPPORT_UTF8
3435     if (utf8 && class_lastchar > 127)
3436     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3437     else
3438     #endif
3439     {
3440     mcbuffer[0] = class_lastchar;
3441     mclength = 1;
3442     }
3443     goto ONE_CHAR;
3444     } /* End of 1-char optimization */
3445    
3446     /* The general case - not the one-char optimization. If this is the first
3447     thing in the branch, there can be no first char setting, whatever the
3448     repeat count. Any reqbyte setting must remain unchanged after any kind of
3449     repeat. */
3450    
3451     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3452     zerofirstbyte = firstbyte;
3453     zeroreqbyte = reqbyte;
3454    
3455     /* If there are characters with values > 255, we have to compile an
3456 ph10 286 extended class, with its own opcode, unless there was a negated special
3457     such as \S in the class, because in that case all characters > 255 are in
3458     the class, so any that were explicitly given as well can be ignored. If
3459 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3460     characters < 256, we can omit the bitmap in the actual compiled code. */
3461 nigel 77
3462     #ifdef SUPPORT_UTF8
3463 ph10 264 if (class_utf8 && !should_flip_negation)
3464 nigel 77 {
3465     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3466     *code++ = OP_XCLASS;
3467     code += LINK_SIZE;
3468     *code = negate_class? XCL_NOT : 0;
3469    
3470 nigel 93 /* If the map is required, move up the extra data to make room for it;
3471     otherwise just move the code pointer to the end of the extra data. */
3472 nigel 77
3473     if (class_charcount > 0)
3474     {
3475     *code++ |= XCL_MAP;
3476 nigel 93 memmove(code + 32, code, class_utf8data - code);
3477 nigel 77 memcpy(code, classbits, 32);
3478 nigel 93 code = class_utf8data + 32;
3479 nigel 77 }
3480 nigel 93 else code = class_utf8data;
3481 nigel 77
3482     /* Now fill in the complete length of the item */
3483    
3484     PUT(previous, 1, code - previous);
3485     break; /* End of class handling */
3486     }
3487     #endif
3488    
3489 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3490     OP_NCLASS, depending on whether the whole class was negated and whether
3491     there were negative specials such as \S in the class. Then copy the 32-byte
3492 ph10 264 map into the code vector, negating it if necessary. */
3493 ph10 286
3494 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3495 nigel 77 if (negate_class)
3496     {
3497 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3498     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3499 nigel 77 }
3500     else
3501     {
3502     memcpy(code, classbits, 32);
3503     }
3504     code += 32;
3505     break;
3506    
3507 nigel 93
3508     /* ===================================================================*/
3509 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3510     has been tested above. */
3511    
3512     case '{':
3513     if (!is_quantifier) goto NORMAL_CHAR;
3514     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3515     if (*errorcodeptr != 0) goto FAILED;
3516     goto REPEAT;
3517    
3518     case '*':
3519     repeat_min = 0;
3520     repeat_max = -1;
3521     goto REPEAT;
3522    
3523     case '+':
3524     repeat_min = 1;
3525     repeat_max = -1;
3526     goto REPEAT;
3527    
3528     case '?':
3529     repeat_min = 0;
3530     repeat_max = 1;
3531    
3532     REPEAT:
3533     if (previous == NULL)
3534     {
3535     *errorcodeptr = ERR9;
3536     goto FAILED;
3537     }
3538    
3539     if (repeat_min == 0)
3540     {
3541     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3542     reqbyte = zeroreqbyte; /* Ditto */
3543     }
3544    
3545     /* Remember whether this is a variable length repeat */
3546    
3547     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3548    
3549     op_type = 0; /* Default single-char op codes */
3550     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3551    
3552     /* Save start of previous item, in case we have to move it up to make space
3553     for an inserted OP_ONCE for the additional '+' extension. */
3554    
3555     tempcode = previous;
3556    
3557     /* If the next character is '+', we have a possessive quantifier. This
3558     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3559     If the next character is '?' this is a minimizing repeat, by default,
3560     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3561     repeat type to the non-default. */
3562    
3563     if (ptr[1] == '+')
3564     {
3565     repeat_type = 0; /* Force greedy */
3566     possessive_quantifier = TRUE;
3567     ptr++;
3568     }
3569     else if (ptr[1] == '?')
3570     {
3571     repeat_type = greedy_non_default;
3572     ptr++;
3573     }
3574     else repeat_type = greedy_default;
3575    
3576     /* If previous was a character match, abolish the item and generate a
3577     repeat item instead. If a char item has a minimum of more than one, ensure
3578     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3579     the first thing in a branch because the x will have gone into firstbyte
3580     instead. */
3581    
3582     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3583     {
3584     /* Deal with UTF-8 characters that take up more than one byte. It's
3585     easier to write this out separately than try to macrify it. Use c to
3586     hold the length of the character in bytes, plus 0x80 to flag that it's a
3587     length rather than a small character. */
3588    
3589     #ifdef SUPPORT_UTF8
3590     if (utf8 && (code[-1] & 0x80) != 0)
3591     {
3592     uschar *lastchar = code - 1;
3593     while((*lastchar & 0xc0) == 0x80) lastchar--;
3594     c = code - lastchar; /* Length of UTF-8 character */
3595     memcpy(utf8_char, lastchar, c); /* Save the char */
3596     c |= 0x80; /* Flag c as a length */
3597     }
3598     else
3599     #endif
3600    
3601     /* Handle the case of a single byte - either with no UTF8 support, or
3602     with UTF-8 disabled, or for a UTF-8 character < 128. */
3603    
3604     {
3605     c = code[-1];
3606     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3607     }
3608    
3609 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3610     the line is something that cannot possibly match this character. If so,
3611     automatically possessifying this item gains some performance in the case
3612     where the match fails. */
3613    
3614     if (!possessive_quantifier &&
3615     repeat_max < 0 &&
3616     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3617     options, cd))
3618     {
3619     repeat_type = 0; /* Force greedy */
3620     possessive_quantifier = TRUE;
3621     }
3622    
3623 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3624     }
3625    
3626     /* If previous was a single negated character ([^a] or similar), we use
3627     one of the special opcodes, replacing it. The code is shared with single-
3628     character repeats by setting op_type to add a suitable offset into
3629 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3630     currently used only for single-byte chars. */
3631 nigel 77
3632     else if (*previous == OP_NOT)
3633     {
3634     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3635     c = previous[1];
3636 nigel 93 if (!possessive_quantifier &&
3637     repeat_max < 0 &&
3638     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3639     {
3640     repeat_type = 0; /* Force greedy */
3641     possessive_quantifier = TRUE;
3642     }
3643 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3644     }
3645    
3646     /* If previous was a character type match (\d or similar), abolish it and
3647     create a suitable repeat item. The code is shared with single-character
3648     repeats by setting op_type to add a suitable offset into repeat_type. Note
3649     that the Unicode property types will be present only when SUPPORT_UCP is
3650     defined, but we don't wrap the little bits of code here because it just
3651     makes it horribly messy. */
3652    
3653     else if (*previous < OP_EODN)
3654     {
3655     uschar *oldcode;
3656 nigel 87 int prop_type, prop_value;
3657 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3658     c = *previous;
3659    
3660 nigel 93 if (!possessive_quantifier &&
3661     repeat_max < 0 &&
3662     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3663     {
3664     repeat_type = 0; /* Force greedy */
3665     possessive_quantifier = TRUE;
3666     }
3667    
3668 nigel 77 OUTPUT_SINGLE_REPEAT:
3669 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3670     {
3671     prop_type = previous[1];
3672     prop_value = previous[2];
3673     }
3674     else prop_type = prop_value = -1;
3675 nigel 77
3676     oldcode = code;
3677     code = previous; /* Usually overwrite previous item */
3678    
3679     /* If the maximum is zero then the minimum must also be zero; Perl allows
3680     this case, so we do too - by simply omitting the item altogether. */
3681    
3682     if (repeat_max == 0) goto END_REPEAT;
3683    
3684     /* All real repeats make it impossible to handle partial matching (maybe
3685     one day we will be able to remove this restriction). */
3686    
3687 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3688 nigel 77
3689     /* Combine the op_type with the repeat_type */
3690    
3691     repeat_type += op_type;
3692    
3693     /* A minimum of zero is handled either as the special case * or ?, or as
3694     an UPTO, with the maximum given. */
3695    
3696     if (repeat_min == 0)
3697     {
3698     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3699     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3700     else
3701     {
3702     *code++ = OP_UPTO + repeat_type;
3703     PUT2INC(code, 0, repeat_max);
3704     }
3705     }
3706    
3707     /* A repeat minimum of 1 is optimized into some special cases. If the
3708 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3709 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3710     one less than the maximum. */
3711    
3712     else if (repeat_min == 1)
3713     {
3714     if (repeat_max == -1)
3715     *code++ = OP_PLUS + repeat_type;
3716     else
3717     {
3718     code = oldcode; /* leave previous item in place */
3719     if (repeat_max == 1) goto END_REPEAT;
3720     *code++ = OP_UPTO + repeat_type;
3721     PUT2INC(code, 0, repeat_max - 1);
3722     }
3723     }
3724    
3725     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3726     handled as an EXACT followed by an UPTO. */
3727    
3728     else
3729     {
3730     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3731     PUT2INC(code, 0, repeat_min);
3732    
3733     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3734     we have to insert the character for the previous code. For a repeated
3735 nigel 87 Unicode property match, there are two extra bytes that define the
3736 nigel 77 required property. In UTF-8 mode, long characters have their length in
3737     c, with the 0x80 bit as a flag. */
3738    
3739     if (repeat_max < 0)
3740     {
3741     #ifdef SUPPORT_UTF8
3742     if (utf8 && c >= 128)
3743     {
3744     memcpy(code, utf8_char, c & 7);
3745     code += c & 7;
3746     }
3747     else
3748     #endif
3749     {
3750     *code++ = c;
3751 nigel 87 if (prop_type >= 0)
3752     {
3753     *code++ = prop_type;
3754     *code++ = prop_value;
3755     }
3756 nigel 77 }
3757     *code++ = OP_STAR + repeat_type;
3758     }
3759    
3760     /* Else insert an UPTO if the max is greater than the min, again
3761 nigel 93 preceded by the character, for the previously inserted code. If the
3762     UPTO is just for 1 instance, we can use QUERY instead. */
3763 nigel 77
3764     else if (repeat_max != repeat_min)
3765     {
3766     #ifdef SUPPORT_UTF8
3767     if (utf8 && c >= 128)
3768     {
3769     memcpy(code, utf8_char, c & 7);
3770     code += c & 7;
3771     }
3772     else
3773     #endif
3774     *code++ = c;
3775 nigel 87 if (prop_type >= 0)
3776     {
3777     *code++ = prop_type;
3778     *code++ = prop_value;
3779     }
3780 nigel 77 repeat_max -= repeat_min;
3781 nigel 93
3782     if (repeat_max == 1)
3783     {
3784     *code++ = OP_QUERY + repeat_type;
3785     }
3786     else
3787     {
3788     *code++ = OP_UPTO + repeat_type;
3789     PUT2INC(code, 0, repeat_max);
3790     }
3791 nigel 77 }
3792     }
3793    
3794     /* The character or character type itself comes last in all cases. */
3795    
3796     #ifdef SUPPORT_UTF8
3797     if (utf8 && c >= 128)
3798     {
3799     memcpy(code, utf8_char, c & 7);
3800     code += c & 7;
3801     }
3802     else
3803     #endif
3804     *code++ = c;
3805    
3806 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3807     define the required property. */
3808 nigel 77
3809     #ifdef SUPPORT_UCP
3810 nigel 87 if (prop_type >= 0)
3811     {
3812     *code++ = prop_type;
3813     *code++ = prop_value;
3814     }
3815 nigel 77 #endif
3816     }
3817    
3818     /* If previous was a character class or a back reference, we put the repeat
3819     stuff after it, but just skip the item if the repeat was {0,0}. */
3820    
3821     else if (*previous == OP_CLASS ||
3822     *previous == OP_NCLASS ||
3823     #ifdef SUPPORT_UTF8
3824     *previous == OP_XCLASS ||
3825     #endif
3826     *previous == OP_REF)
3827     {
3828     if (repeat_max == 0)
3829     {
3830     code = previous;
3831     goto END_REPEAT;
3832     }
3833    
3834     /* All real repeats make it impossible to handle partial matching (maybe
3835     one day we will be able to remove this restriction). */
3836    
3837 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3838 nigel 77
3839     if (repeat_min == 0 && repeat_max == -1)
3840     *code++ = OP_CRSTAR + repeat_type;
3841     else if (repeat_min == 1 && repeat_max == -1)
3842     *code++ = OP_CRPLUS + repeat_type;
3843     else if (repeat_min == 0 && repeat_max == 1)
3844     *code++ = OP_CRQUERY + repeat_type;
3845     else
3846     {
3847     *code++ = OP_CRRANGE + repeat_type;
3848     PUT2INC(code, 0, repeat_min);
3849     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3850     PUT2INC(code, 0, repeat_max);
3851     }
3852     }
3853    
3854     /* If previous was a bracket group, we may have to replicate it in certain
3855     cases. */
3856    
3857 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3858     *previous == OP_ONCE || *previous == OP_COND)
3859 nigel 77 {
3860     register int i;
3861     int ketoffset = 0;
3862     int len = code - previous;
3863     uschar *bralink = NULL;
3864    
3865 nigel 93 /* Repeating a DEFINE group is pointless */
3866    
3867     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3868     {
3869     *errorcodeptr = ERR55;
3870     goto FAILED;
3871     }
3872    
3873 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3874     by scanning through from the start, and compute the offset back to it
3875     from the current code pointer. There may be an OP_OPT setting following
3876     the final KET, so we can't find the end just by going back from the code
3877     pointer. */
3878    
3879     if (repeat_max == -1)
3880     {
3881     register uschar *ket = previous;
3882     do ket += GET(ket, 1); while (*ket != OP_KET);
3883     ketoffset = code - ket;
3884     }
3885    
3886     /* The case of a zero minimum is special because of the need to stick
3887     OP_BRAZERO in front of it, and because the group appears once in the
3888     data, whereas in other cases it appears the minimum number of times. For
3889     this reason, it is simplest to treat this case separately, as otherwise
3890     the code gets far too messy. There are several special subcases when the
3891     minimum is zero. */
3892    
3893     if (repeat_min == 0)
3894     {
3895 ph10 335 /* If the maximum is also zero, we used to just omit the group from the
3896     output altogether, like this:
3897 nigel 77
3898 ph10 335 ** if (repeat_max == 0)
3899     ** {
3900     ** code = previous;
3901     ** goto END_REPEAT;
3902     ** }
3903    
3904     However, that fails when a group is referenced as a subroutine from
3905     elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3906     so that it is skipped on execution. As we don't have a list of which
3907     groups are referenced, we cannot do this selectively.
3908 nigel 77
3909 ph10 335 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3910     and do no more at this point. However, we do need to adjust any
3911     OP_RECURSE calls inside the group that refer to the group itself or any
3912     internal or forward referenced group, because the offset is from the
3913     start of the whole regex. Temporarily terminate the pattern while doing
3914     this. */
3915 nigel 77
3916 ph10 335 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3917 nigel 77 {
3918     *code = OP_END;
3919 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3920 nigel 77 memmove(previous+1, previous, len);
3921     code++;
3922 ph10 335 if (repeat_max == 0)
3923     {
3924     *previous++ = OP_SKIPZERO;
3925     goto END_REPEAT;
3926     }
3927 nigel 77 *previous++ = OP_BRAZERO + repeat_type;
3928     }
3929    
3930     /* If the maximum is greater than 1 and limited, we have to replicate
3931     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3932     The first one has to be handled carefully because it's the original
3933     copy, which has to be moved up. The remainder can be handled by code
3934     that is common with the non-zero minimum case below. We have to
3935     adjust the value of repeat_max, since one less copy is required. Once
3936     again, we may have to adjust any OP_RECURSE calls inside the group. */
3937    
3938     else
3939     {
3940     int offset;
3941     *code = OP_END;
3942 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3943 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3944     code += 2 + LINK_SIZE;
3945     *previous++ = OP_BRAZERO + repeat_type;
3946     *previous++ = OP_BRA;
3947    
3948     /* We chain together the bracket offset fields that have to be
3949     filled in later when the ends of the brackets are reached. */
3950    
3951     offset = (bralink == NULL)? 0 : previous - bralink;
3952     bralink = previous;
3953     PUTINC(previous, 0, offset);
3954     }
3955    
3956     repeat_max--;
3957     }
3958    
3959     /* If the minimum is greater than zero, replicate the group as many
3960     times as necessary, and adjust the maximum to the number of subsequent
3961     copies that we need. If we set a first char from the group, and didn't
3962 nigel 93 set a required char, copy the latter from the former. If there are any
3963     forward reference subroutine calls in the group, there will be entries on
3964     the workspace list; replicate these with an appropriate increment. */
3965 nigel 77
3966     else
3967     {
3968     if (repeat_min > 1)
3969     {
3970 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3971 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3972     potential integer overflow. */
3973 nigel 93
3974     if (lengthptr != NULL)
3975 ph10 202 {
3976     int delta = (repeat_min - 1)*length_prevgroup;
3977     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3978     (double)INT_MAX ||
3979     OFLOW_MAX - *lengthptr < delta)
3980     {
3981     *errorcodeptr = ERR20;
3982     goto FAILED;
3983     }
3984     *lengthptr += delta;
3985     }
3986 nigel 93
3987     /* This is compiling for real */
3988    
3989     else
3990 nigel 77 {
3991 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3992     for (i = 1; i < repeat_min; i++)
3993     {
3994     uschar *hc;
3995     uschar *this_hwm = cd->hwm;
3996     memcpy(code, previous, len);
3997     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3998     {
3999     PUT(cd->hwm, 0, GET(hc, 0) + len);
4000     cd->hwm += LINK_SIZE;
4001     }
4002     save_hwm = this_hwm;
4003     code += len;
4004     }
4005 nigel 77 }
4006     }
4007 nigel 93
4008 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
4009     }
4010    
4011     /* This code is common to both the zero and non-zero minimum cases. If
4012     the maximum is limited, it replicates the group in a nested fashion,
4013     remembering the bracket starts on a stack. In the case of a zero minimum,
4014     the first one was set up above. In all cases the repeat_max now specifies
4015 nigel 93 the number of additional copies needed. Again, we must remember to
4016     replicate entries on the forward reference list. */
4017 nigel 77
4018     if (repeat_max >= 0)
4019     {
4020 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
4021     just adjust the length as if we had. For each repetition we must add 1
4022     to the length for BRAZERO and for all but the last repetition we must
4023 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4024     paranoid checks to avoid integer overflow. */
4025 nigel 93
4026     if (lengthptr != NULL && repeat_max > 0)
4027 ph10 202 {
4028     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4029     2 - 2*LINK_SIZE; /* Last one doesn't nest */
4030     if ((double)repeat_max *
4031     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4032     > (double)INT_MAX ||
4033     OFLOW_MAX - *lengthptr < delta)
4034     {
4035     *errorcodeptr = ERR20;
4036     goto FAILED;
4037     }
4038     *lengthptr += delta;
4039     }
4040 nigel 93
4041     /* This is compiling for real */
4042    
4043     else for (i = repeat_max - 1; i >= 0; i--)
4044 nigel 77 {
4045 nigel 93 uschar *hc;
4046     uschar *this_hwm = cd->hwm;
4047    
4048 nigel 77 *code++ = OP_BRAZERO + repeat_type;
4049    
4050     /* All but the final copy start a new nesting, maintaining the
4051     chain of brackets outstanding. */
4052    
4053     if (i != 0)
4054     {
4055     int offset;
4056     *code++ = OP_BRA;
4057     offset = (bralink == NULL)? 0 : code - bralink;
4058     bralink = code;
4059     PUTINC(code, 0, offset);
4060     }
4061    
4062     memcpy(code, previous, len);
4063 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4064     {
4065     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4066     cd->hwm += LINK_SIZE;
4067     }
4068     save_hwm = this_hwm;
4069 nigel 77 code += len;
4070     }
4071    
4072     /* Now chain through the pending brackets, and fill in their length
4073     fields (which are holding the chain links pro tem). */
4074    
4075     while (bralink != NULL)
4076     {
4077     int oldlinkoffset;