/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 340 - (hide annotations) (download)
Fri Apr 18 20:00:21 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 201856 byte(s)
Fix incorrect error for patterns like /(?2)[]a()b](abc)/

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number: bit b lives in byte b/8 at bit
position b%8. Both arguments and the whole expansion are parenthesized so that
expressions such as SETBIT(map, c + 1) behave as expected. Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
/* NOTE(review): INT_MAX is not defined by any header included directly in
this file; presumably <limits.h> arrives via pcre_internal.h - confirm. */
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
/* check_escape() indexes this table with (c - '0') on ASCII systems and with
(c - 0x48) on EBCDIC systems, after first rejecting the characters that it
treats as non-alphanumeric. The ASCII table therefore has one entry for each
code from '0' to 'z' inclusive, and the EBCDIC table starts at code 0x48. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
/* The leading comment on each row below is the EBCDIC code (in hex) of the
first entry of that row. */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
/* One entry per verb: "len" is the length of the verb's name (excluding the
terminating zero) and "op" is the opcode it maps to. */
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
/* The verb names, each terminated by a binary zero; the last one relies on the
string literal's implicit terminator. The order must match verbs[] below. */
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
/* Entries are in the same order as the names in verbnames[], and each "len"
is the length of the corresponding name. Both "F" and "FAIL" map to OP_FAIL. */
161 ph10 327 static const verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
/* Number of entries in verbs[]. */
171 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172 ph10 210
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
/* Each name is terminated by a binary zero; the last one relies on the string
literal's implicit terminator. */
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
/* Lengths of the names above, in the same order, followed by the terminating
zero entry. */
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
/* There is one triple per class name, in the same order as posix_names[]; the
trailing comment on each row names the class it belongs to. */
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification: XSTRING(s) lets the preprocessor expand the macro
"s" first, and STRING then turns the expanded value into a string literal. It
is used below to embed MAX_NAME_SIZE and MAX_NAME_COUNT in the messages. */
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
/* find_error_text(n) locates message n by skipping n binary-zero terminators
from the start of this string, so the position of a message in the list is its
error number. The numeric comments mark every fifth message. The final message
has no explicit "\0" and relies on the string literal's implicit terminator. */
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299     "a numbered reference must not be zero\0"
300 ph10 240 "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 336 "digit expected after (?+\0"
306     "] is an invalid data character in JavaScript compatibility mode";
307 nigel 77
308    
309     /* Table to identify digits and hex digits. This is used when compiling
310     patterns. Note that the tables in chartables are dependent on the locale, and
311     may mark arbitrary characters as digits - but the PCRE compiling code expects
312     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
313     a private table here. It costs 256 bytes, but it is a lot faster than doing
314     character value tests (at least in some simple cases I timed), and in some
315     applications one wants PCRE to compile efficiently as well as match
316     efficiently.
317    
318     For convenience, we use the same bit definitions as in chartables:
319    
320     0x04 decimal digit
321     0x08 hexadecimal digit
322    
323     Then we can use ctype_digit and ctype_xdigit in the code. */
/* The table is indexed by byte value. Entries for '0'-'9' are 0x0c (both bits
set); entries for 'a'-'f' and 'A'-'F' are 0x08; all other entries are zero.
check_escape() tests entries as (digitab[ch] & ctype_digit) and
(digitab[ch] & ctype_xdigit). */
324    
325 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
326 nigel 77 static const unsigned char digitab[] =
327     {
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
362 nigel 77 static const unsigned char digitab[] =
363     {
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396    
/* EBCDIC only: check_escape() treats a character c as non-alphanumeric when
(ebcdic_chartab[c] & 0x0E) is zero. NOTE(review): the individual bit meanings
are presumably those of the ctype_xxx flags used by chartables - confirm. */
397     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430     #endif
431    
432    
433     /* Forward declaration to allow mutual recursion: compile_regex() is declared
here so that code that appears before its definition can call it. */
434    
435     static BOOL
436 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
438 nigel 77
439    
440    
441     /*************************************************
442 ph10 240 * Find an error text *
443     *************************************************/
444    
445 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
446     some of the text is of unknown length, we can't use a table of offsets.
447     Instead, just count through the strings. This is not a performance issue
448 ph10 240 because it happens only when there has been a compilation error.
449    
450     Argument: the error number
451     Returns: pointer to the error string
452     */
453    
454     static const char *
455     find_error_text(int n)
456     {
457     const char *s = error_texts;
458 ph10 243 for (; n > 0; n--) while (*s++ != 0);
459 ph10 240 return s;
460     }
461    
462    
463     /*************************************************
464 nigel 77 * Handle escapes *
465     *************************************************/
466    
467     /* This function is called when a \ has been encountered. It either returns a
468     positive value for a simple escape such as \n, or a negative value which
469 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
470     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472     ptr is pointing at the \. On exit, it is on the final character of the escape
473     sequence.
474 nigel 77
475     Arguments:
476     ptrptr points to the pattern position pointer
477     errorcodeptr points to the errorcode variable
478     bracount number of previous extracting brackets
479     options the options bits
480     isclass TRUE if inside a character class
481    
482     Returns: zero or positive => a data character
483     negative => a special escape sequence
484 ph10 213 on error, errorcodeptr is set
485 nigel 77 */
486    
487     static int
488     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489     int options, BOOL isclass)
490     {
491 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
492     const uschar *ptr = *ptrptr + 1;
493 nigel 77 int c, i;
494    
495 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496     ptr--; /* Set pointer back to the last byte */
497    
498 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
499    
500     if (c == 0) *errorcodeptr = ERR1;
501    
502 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503     in a table. A non-zero result is something that can be returned immediately.
504 nigel 77 Otherwise further processing may be required. */
505    
506 ph10 97 #ifndef EBCDIC /* ASCII coding */
507 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
509    
510 ph10 97 #else /* EBCDIC coding */
511 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
513     #endif
514    
515     /* Escapes that need further processing, or are illegal. */
516    
517     else
518     {
519     const uschar *oldptr;
520 nigel 93 BOOL braced, negated;
521    
522 nigel 77 switch (c)
523     {
524     /* A number of Perl escapes are not handled by PCRE. We give an explicit
525     error. */
526    
527     case 'l':
528     case 'L':
529     case 'N':
530     case 'u':
531     case 'U':
532     *errorcodeptr = ERR37;
533     break;
534    
535 ph10 333 /* \g must be followed by one of a number of specific things:
536    
537     (1) A number, either plain or braced. If positive, it is an absolute
538     backreference. If negative, it is a relative backreference. This is a Perl
539     5.10 feature.
540    
541     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542     is part of Perl's movement towards a unified syntax for back references. As
543     this is synonymous with \k{name}, we fudge it up by pretending it really
544     was \k.
545    
546     (3) For Oniguruma compatibility we also support \g followed by a name or a
547     number either in angle brackets or in single quotes. However, these are
548     (possibly recursive) subroutine calls, _not_ backreferences. Just return
549     the -ESC_g code (cf \k). */
550 nigel 93
551     case 'g':
552 ph10 333 if (ptr[1] == '<' || ptr[1] == '\'')
553     {
554     c = -ESC_g;
555     break;
556     }
557    
558     /* Handle the Perl-compatible cases */
559    
560 nigel 93 if (ptr[1] == '{')
561     {
562 ph10 171 const uschar *p;
563     for (p = ptr+2; *p != 0 && *p != '}'; p++)
564     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 ph10 172 if (*p != 0 && *p != '}')
566 ph10 171 {
567     c = -ESC_k;
568     break;
569 ph10 172 }
570 nigel 93 braced = TRUE;
571     ptr++;
572     }
573     else braced = FALSE;
574    
575     if (ptr[1] == '-')
576     {
577     negated = TRUE;
578     ptr++;
579     }
580     else negated = FALSE;
581    
582     c = 0;
583     while ((digitab[ptr[1]] & ctype_digit) != 0)
584     c = c * 10 + *(++ptr) - '0';
585 ph10 220
586 ph10 333 if (c < 0) /* Integer overflow */
587 ph10 213 {
588     *errorcodeptr = ERR61;
589     break;
590 ph10 220 }
591 ph10 333
592     if (braced && *(++ptr) != '}')
593 nigel 93 {
594     *errorcodeptr = ERR57;
595 ph10 213 break;
596 nigel 93 }
597 ph10 333
598     if (c == 0)
599     {
600     *errorcodeptr = ERR58;
601     break;
602     }
603 nigel 93
604     if (negated)
605     {
606     if (c > bracount)
607     {
608     *errorcodeptr = ERR15;
609 ph10 213 break;
610 nigel 93 }
611     c = bracount - (c - 1);
612     }
613    
614     c = -(ESC_REF + c);
615     break;
616    
617 nigel 77 /* The handling of escape sequences consisting of a string of digits
618     starting with one that is not zero is not straightforward. By experiment,
619     the way Perl works seems to be as follows:
620    
621     Outside a character class, the digits are read as a decimal number. If the
622     number is less than 10, or if there are that many previous extracting
623     left brackets, then it is a back reference. Otherwise, up to three octal
624     digits are read to form an escaped byte. Thus \123 is likely to be octal
625     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626     value is greater than 377, the least significant 8 bits are taken. Inside a
627     character class, \ followed by a digit is always an octal number. */
628    
629     case '1': case '2': case '3': case '4': case '5':
630     case '6': case '7': case '8': case '9':
631    
632     if (!isclass)
633     {
634     oldptr = ptr;
635     c -= '0';
636     while ((digitab[ptr[1]] & ctype_digit) != 0)
637     c = c * 10 + *(++ptr) - '0';
638 ph10 333 if (c < 0) /* Integer overflow */
639 ph10 213 {
640     *errorcodeptr = ERR61;
641 ph10 220 break;
642     }
643 nigel 77 if (c < 10 || c <= bracount)
644     {
645     c = -(ESC_REF + c);
646     break;
647     }
648     ptr = oldptr; /* Put the pointer back and fall through */
649     }
650    
651     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652     generates a binary zero byte and treats the digit as a following literal.
653     Thus we have to pull back the pointer by one. */
654    
655     if ((c = *ptr) >= '8')
656     {
657     ptr--;
658     c = 0;
659     break;
660     }
661    
662     /* \0 always starts an octal number, but we may drop through to here with a
663 nigel 91 larger first octal digit. The original code used just to take the least
664     significant 8 bits of octal numbers (I think this is what early Perls used
665     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666     than 3 octal digits. */
667 nigel 77
668     case '0':
669     c -= '0';
670     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671     c = c * 8 + *(++ptr) - '0';
672 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 nigel 77 break;
674    
675 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
676     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677     treated as a data character. */
678 nigel 77
679     case 'x':
680 nigel 87 if (ptr[1] == '{')
681 nigel 77 {
682     const uschar *pt = ptr + 2;
683 nigel 87 int count = 0;
684    
685 nigel 77 c = 0;
686     while ((digitab[*pt] & ctype_xdigit) != 0)
687     {
688 nigel 87 register int cc = *pt++;
689     if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 nigel 77 count++;
691 nigel 87
692 ph10 97 #ifndef EBCDIC /* ASCII coding */
693 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 ph10 97 #else /* EBCDIC coding */
696 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 nigel 77 #endif
699     }
700 nigel 87
701 nigel 77 if (*pt == '}')
702     {
703 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 nigel 77 ptr = pt;
705     break;
706     }
707 nigel 87
708 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
709     recognize this construct; fall through to the normal \x handling. */
710     }
711    
712 nigel 87 /* Read just a single-byte hex-defined char */
713 nigel 77
714     c = 0;
715     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716     {
717     int cc; /* Some compilers don't like ++ */
718     cc = *(++ptr); /* in initializers */
719 ph10 97 #ifndef EBCDIC /* ASCII coding */
720 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 ph10 97 #else /* EBCDIC coding */
723 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
724     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725     #endif
726     }
727     break;
728    
729 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730     This coding is ASCII-specific, but then the whole concept of \cx is
731     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732 nigel 77
733     case 'c':
734     c = *(++ptr);
735     if (c == 0)
736     {
737     *errorcodeptr = ERR2;
738 ph10 213 break;
739 nigel 77 }
740    
741 ph10 97 #ifndef EBCDIC /* ASCII coding */
742 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
743     c ^= 0x40;
744 ph10 97 #else /* EBCDIC coding */
745 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
746     c ^= 0xC0;
747     #endif
748     break;
749    
750     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752     otherwise, for Perl compatibility, it is a literal. This code looks a bit
753     odd, but there used to be some cases other than the default, and there may
754     be again in future, so I haven't "optimized" it. */
755 nigel 77
756     default:
757     if ((options & PCRE_EXTRA) != 0) switch(c)
758     {
759     default:
760     *errorcodeptr = ERR3;
761     break;
762     }
763     break;
764     }
765     }
766    
767     *ptrptr = ptr;
768     return c;
769     }
770    
771    
772    
773     #ifdef SUPPORT_UCP
774     /*************************************************
775     * Handle \P and \p *
776     *************************************************/
777    
778     /* This function is called after \P or \p has been encountered, provided that
779     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
780     pointing at the P or p. On exit, it is pointing at the final character of the
781     escape sequence.
782    
783     Argument:
784     ptrptr points to the pattern position pointer
785     negptr points to a boolean that is set TRUE for negation else FALSE
786 nigel 87 dptr points to an int that is set to the detailed property value
787 nigel 77 errorcodeptr points to the error code variable
788    
789 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
790 nigel 77 */
791    
792     static int
793 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
794 nigel 77 {
795     int c, i, bot, top;
796     const uschar *ptr = *ptrptr;
797 nigel 87 char name[32];
798 nigel 77
799     c = *(++ptr);
800     if (c == 0) goto ERROR_RETURN;
801    
802     *negptr = FALSE;
803    
804 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
805     negation. */
806 nigel 77
807     if (c == '{')
808     {
809     if (ptr[1] == '^')
810     {
811     *negptr = TRUE;
812     ptr++;
813     }
814 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
815 nigel 77 {
816     c = *(++ptr);
817     if (c == 0) goto ERROR_RETURN;
818     if (c == '}') break;
819     name[i] = c;
820     }
821 nigel 87 if (c !='}') goto ERROR_RETURN;
822 nigel 77 name[i] = 0;
823     }
824    
825     /* Otherwise there is just one following character */
826    
827     else
828     {
829     name[0] = c;
830     name[1] = 0;
831     }
832    
833     *ptrptr = ptr;
834    
835     /* Search for a recognized property name using binary chop */
836    
837     bot = 0;
838     top = _pcre_utt_size;
839    
840     while (bot < top)
841     {
842 nigel 87 i = (bot + top) >> 1;
843 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
844 nigel 87 if (c == 0)
845     {
846     *dptr = _pcre_utt[i].value;
847     return _pcre_utt[i].type;
848     }
849 nigel 77 if (c > 0) bot = i + 1; else top = i;
850     }
851    
852     *errorcodeptr = ERR47;
853     *ptrptr = ptr;
854     return -1;
855    
856     ERROR_RETURN:
857     *errorcodeptr = ERR46;
858     *ptrptr = ptr;
859     return -1;
860     }
861     #endif
862    
863    
864    
865    
866     /*************************************************
867     * Check for counted repeat *
868     *************************************************/
869    
870     /* This function is called when a '{' is encountered in a place where it might
871     start a quantifier. It looks ahead to see if it really is a quantifier or not.
872     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873     where the ddds are digits.
874    
875     Arguments:
876     p pointer to the first char after '{'
877    
878     Returns: TRUE or FALSE
879     */
880    
881     static BOOL
882     is_counted_repeat(const uschar *p)
883     {
884     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885     while ((digitab[*p] & ctype_digit) != 0) p++;
886     if (*p == '}') return TRUE;
887    
888     if (*p++ != ',') return FALSE;
889     if (*p == '}') return TRUE;
890    
891     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892     while ((digitab[*p] & ctype_digit) != 0) p++;
893    
894     return (*p == '}');
895     }
896    
897    
898    
899     /*************************************************
900     * Read repeat counts *
901     *************************************************/
902    
903     /* Read an item of the form {n,m} and return the values. This is called only
904     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905     so the syntax is guaranteed to be correct, but we need to check the values.
906    
907     Arguments:
908     p pointer to first char after '{'
909     minp pointer to int for min
910     maxp pointer to int for max
911     returned as -1 if no max
912     errorcodeptr points to error code variable
913    
914     Returns: pointer to '}' on success;
915     current ptr on error, with errorcodeptr set non-zero
916     */
917    
918     static const uschar *
919     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920     {
921     int min = 0;
922     int max = -1;
923    
924 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
925     an integer overflow. */
926    
927 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 nigel 81 if (min < 0 || min > 65535)
929     {
930     *errorcodeptr = ERR5;
931     return p;
932     }
933 nigel 77
934 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
935     Also, max must not be less than min. */
936    
937 nigel 77 if (*p == '}') max = min; else
938     {
939     if (*(++p) != '}')
940     {
941     max = 0;
942     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 nigel 81 if (max < 0 || max > 65535)
944     {
945     *errorcodeptr = ERR5;
946     return p;
947     }
948 nigel 77 if (max < min)
949     {
950     *errorcodeptr = ERR4;
951     return p;
952     }
953     }
954     }
955    
956 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
957     '}'. */
958 nigel 77
959 nigel 81 *minp = min;
960     *maxp = max;
961 nigel 77 return p;
962     }
963    
964    
965    
966     /*************************************************
967 nigel 93 * Find forward referenced subpattern *
968 nigel 91 *************************************************/
969    
970 nigel 93 /* This function scans along a pattern's text looking for capturing
971     subpatterns, and counting them. If it finds a named pattern that matches the
972     name it is given, it returns its number. Alternatively, if the name is NULL, it
973     returns when it reaches a given numbered subpattern. This is used for forward
974     references to subpatterns. We know that if (?P< is encountered, the name will
975     be terminated by '>' because that is checked in the first pass.
976 nigel 91
977     Arguments:
978 nigel 93 ptr current position in the pattern
979     count current count of capturing parens so far encountered
980     name name to seek, or NULL if seeking a numbered subpattern
981     lorn name length, or subpattern number if name is NULL
982     xmode TRUE if we are in /x mode
983 nigel 91
984     Returns: the number of the named subpattern, or -1 if not found
985     */
986    
987     static int
988 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
989     BOOL xmode)
990 nigel 91 {
991     const uschar *thisname;
992 nigel 93
993 nigel 91 for (; *ptr != 0; ptr++)
994     {
995 nigel 93 int term;
996    
997     /* Skip over backslashed characters and also entire \Q...\E */
998    
999     if (*ptr == '\\')
1000     {
1001     if (*(++ptr) == 0) return -1;
1002     if (*ptr == 'Q') for (;;)
1003     {
1004     while (*(++ptr) != 0 && *ptr != '\\');
1005     if (*ptr == 0) return -1;
1006     if (*(++ptr) == 'E') break;
1007     }
1008     continue;
1009     }
1010    
1011 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1012     are handled for real. If the first character is '^', skip it. Also, if the
1013     first few characters (either before or after ^) are \Q\E or \E we skip them
1014     too. This makes for compatibility with Perl. */
1015 nigel 93
1016     if (*ptr == '[')
1017     {
1018 ph10 340 BOOL negate_class = FALSE;
1019     for (;;)
1020     {
1021     int c = *(++ptr);
1022     if (c == '\\')
1023     {
1024     if (ptr[1] == 'E') ptr++;
1025     else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
1026     else break;
1027     }
1028     else if (!negate_class && c == '^')
1029     negate_class = TRUE;
1030     else break;
1031     }
1032    
1033     /* If the next character is ']', it is a data character that must be
1034     skipped. */
1035    
1036     if (ptr[1] == ']') ptr++;
1037    
1038 nigel 93 while (*(++ptr) != ']')
1039     {
1040 ph10 220 if (*ptr == 0) return -1;
1041 nigel 93 if (*ptr == '\\')
1042     {
1043     if (*(++ptr) == 0) return -1;
1044     if (*ptr == 'Q') for (;;)
1045     {
1046     while (*(++ptr) != 0 && *ptr != '\\');
1047     if (*ptr == 0) return -1;
1048     if (*(++ptr) == 'E') break;
1049     }
1050     continue;
1051     }
1052     }
1053     continue;
1054     }
1055    
1056     /* Skip comments in /x mode */
1057    
1058     if (xmode && *ptr == '#')
1059     {
1060     while (*(++ptr) != 0 && *ptr != '\n');
1061     if (*ptr == 0) return -1;
1062     continue;
1063     }
1064    
1065     /* An opening parens must now be a real metacharacter */
1066    
1067 nigel 91 if (*ptr != '(') continue;
1068 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1069 nigel 93 {
1070     count++;
1071     if (name == NULL && count == lorn) return count;
1072     continue;
1073     }
1074    
1075     ptr += 2;
1076     if (*ptr == 'P') ptr++; /* Allow optional P */
1077    
1078     /* We have to disambiguate (?<! and (?<= from (?<name> */
1079    
1080     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1081     *ptr != '\'')
1082     continue;
1083    
1084 nigel 91 count++;
1085 nigel 93
1086     if (name == NULL && count == lorn) return count;
1087     term = *ptr++;
1088     if (term == '<') term = '>';
1089 nigel 91 thisname = ptr;
1090 nigel 93 while (*ptr != term) ptr++;
1091     if (name != NULL && lorn == ptr - thisname &&
1092     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1093 nigel 91 return count;
1094     }
1095 nigel 93
1096 nigel 91 return -1;
1097     }
1098    
1099    
1100    
1101     /*************************************************
1102 nigel 77 * Find first significant op code *
1103     *************************************************/
1104    
1105     /* This is called by several functions that scan a compiled expression looking
1106     for a fixed first character, or an anchoring op code etc. It skips over things
1107     that do not influence this. For some calls, a change of option is important.
1108     For some calls, it makes sense to skip negative forward and all backward
1109     assertions, and also the \b assertion; for others it does not.
1110    
1111     Arguments:
1112     code pointer to the start of the group
1113     options pointer to external options
1114     optbit the option bit whose changing is significant, or
1115     zero if none are
1116     skipassert TRUE if certain assertions are to be skipped
1117    
1118     Returns: pointer to the first significant opcode
1119     */
1120    
1121     static const uschar*
1122     first_significant_code(const uschar *code, int *options, int optbit,
1123     BOOL skipassert)
1124     {
1125     for (;;)
1126     {
1127     switch ((int)*code)
1128     {
1129     case OP_OPT:
1130     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1131     *options = (int)code[1];
1132     code += 2;
1133     break;
1134    
1135     case OP_ASSERT_NOT:
1136     case OP_ASSERTBACK:
1137     case OP_ASSERTBACK_NOT:
1138     if (!skipassert) return code;
1139     do code += GET(code, 1); while (*code == OP_ALT);
1140     code += _pcre_OP_lengths[*code];
1141     break;
1142    
1143     case OP_WORD_BOUNDARY:
1144     case OP_NOT_WORD_BOUNDARY:
1145     if (!skipassert) return code;
1146     /* Fall through */
1147    
1148     case OP_CALLOUT:
1149     case OP_CREF:
1150 nigel 93 case OP_RREF:
1151     case OP_DEF:
1152 nigel 77 code += _pcre_OP_lengths[*code];
1153     break;
1154    
1155     default:
1156     return code;
1157     }
1158     }
1159     /* Control never reaches here */
1160     }
1161    
1162    
1163    
1164    
1165     /*************************************************
1166     * Find the fixed length of a pattern *
1167     *************************************************/
1168    
1169     /* Scan a pattern and compute the fixed length of subject that will match it,
1170     if the length is fixed. This is needed for dealing with backward assertions.
1171     In UTF8 mode, the result is in characters rather than bytes.
1172    
1173     Arguments:
1174     code points to the start of the pattern (the bracket)
1175     options the compiling options
1176    
1177     Returns: the fixed length, or -1 if there is no fixed length,
1178     or -2 if \C was encountered
1179     */
1180    
1181     static int
1182     find_fixedlength(uschar *code, int options)
1183     {
1184     int length = -1;
1185    
1186     register int branchlength = 0;
1187     register uschar *cc = code + 1 + LINK_SIZE;
1188    
1189     /* Scan along the opcodes for this branch. If we get to the end of the
1190     branch, check the length against that of the other branches. */
1191    
1192     for (;;)
1193     {
1194     int d;
1195     register int op = *cc;
1196     switch (op)
1197     {
1198 nigel 93 case OP_CBRA:
1199 nigel 77 case OP_BRA:
1200     case OP_ONCE:
1201     case OP_COND:
1202 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1203 nigel 77 if (d < 0) return d;
1204     branchlength += d;
1205     do cc += GET(cc, 1); while (*cc == OP_ALT);
1206     cc += 1 + LINK_SIZE;
1207     break;
1208    
1209     /* Reached end of a branch; if it's a ket it is the end of a nested
1210     call. If it's ALT it is an alternation in a nested call. If it is
1211     END it's the end of the outer call. All can be handled by the same code. */
1212    
1213     case OP_ALT:
1214     case OP_KET:
1215     case OP_KETRMAX:
1216     case OP_KETRMIN:
1217     case OP_END:
1218     if (length < 0) length = branchlength;
1219     else if (length != branchlength) return -1;
1220     if (*cc != OP_ALT) return length;
1221     cc += 1 + LINK_SIZE;
1222     branchlength = 0;
1223     break;
1224    
1225     /* Skip over assertive subpatterns */
1226    
1227     case OP_ASSERT:
1228     case OP_ASSERT_NOT:
1229     case OP_ASSERTBACK:
1230     case OP_ASSERTBACK_NOT:
1231     do cc += GET(cc, 1); while (*cc == OP_ALT);
1232     /* Fall through */
1233    
1234     /* Skip over things that don't match chars */
1235    
1236     case OP_REVERSE:
1237     case OP_CREF:
1238 nigel 93 case OP_RREF:
1239     case OP_DEF:
1240 nigel 77 case OP_OPT:
1241     case OP_CALLOUT:
1242     case OP_SOD:
1243     case OP_SOM:
1244     case OP_EOD:
1245     case OP_EODN:
1246     case OP_CIRC:
1247     case OP_DOLL:
1248     case OP_NOT_WORD_BOUNDARY:
1249     case OP_WORD_BOUNDARY:
1250     cc += _pcre_OP_lengths[*cc];
1251     break;
1252    
1253     /* Handle literal characters */
1254    
1255     case OP_CHAR:
1256     case OP_CHARNC:
1257 nigel 91 case OP_NOT:
1258 nigel 77 branchlength++;
1259     cc += 2;
1260     #ifdef SUPPORT_UTF8
1261     if ((options & PCRE_UTF8) != 0)
1262     {
1263     while ((*cc & 0xc0) == 0x80) cc++;
1264     }
1265     #endif
1266     break;
1267    
1268     /* Handle exact repetitions. The count is already in characters, but we
1269     need to skip over a multibyte character in UTF8 mode. */
1270    
1271     case OP_EXACT:
1272     branchlength += GET2(cc,1);
1273     cc += 4;
1274     #ifdef SUPPORT_UTF8
1275     if ((options & PCRE_UTF8) != 0)
1276     {
1277     while((*cc & 0x80) == 0x80) cc++;
1278     }
1279     #endif
1280     break;
1281    
1282     case OP_TYPEEXACT:
1283     branchlength += GET2(cc,1);
1284 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1285 nigel 77 cc += 4;
1286     break;
1287    
1288     /* Handle single-char matchers */
1289    
1290     case OP_PROP:
1291     case OP_NOTPROP:
1292 nigel 87 cc += 2;
1293 nigel 77 /* Fall through */
1294    
1295     case OP_NOT_DIGIT:
1296     case OP_DIGIT:
1297     case OP_NOT_WHITESPACE:
1298     case OP_WHITESPACE:
1299     case OP_NOT_WORDCHAR:
1300     case OP_WORDCHAR:
1301     case OP_ANY:
1302     branchlength++;
1303     cc++;
1304     break;
1305    
1306     /* The single-byte matcher isn't allowed */
1307    
1308     case OP_ANYBYTE:
1309     return -2;
1310    
1311     /* Check a class for variable quantification */
1312    
1313     #ifdef SUPPORT_UTF8
1314     case OP_XCLASS:
1315     cc += GET(cc, 1) - 33;
1316     /* Fall through */
1317     #endif
1318    
1319     case OP_CLASS:
1320     case OP_NCLASS:
1321     cc += 33;
1322    
1323     switch (*cc)
1324     {
1325     case OP_CRSTAR:
1326     case OP_CRMINSTAR:
1327     case OP_CRQUERY:
1328     case OP_CRMINQUERY:
1329     return -1;
1330    
1331     case OP_CRRANGE:
1332     case OP_CRMINRANGE:
1333     if (GET2(cc,1) != GET2(cc,3)) return -1;
1334     branchlength += GET2(cc,1);
1335     cc += 5;
1336     break;
1337    
1338     default:
1339     branchlength++;
1340     }
1341     break;
1342    
1343     /* Anything else is variable length */
1344    
1345     default:
1346     return -1;
1347     }
1348     }
1349     /* Control never gets here */
1350     }
1351    
1352    
1353    
1354    
1355     /*************************************************
1356     * Scan compiled regex for numbered bracket *
1357     *************************************************/
1358    
1359     /* This little function scans through a compiled pattern until it finds a
1360     capturing bracket with the given number.
1361    
1362     Arguments:
1363     code points to start of expression
1364     utf8 TRUE in UTF-8 mode
1365     number the required bracket number
1366    
1367     Returns: pointer to the opcode for the bracket, or NULL if not found
1368     */
1369    
1370     static const uschar *
1371     find_bracket(const uschar *code, BOOL utf8, int number)
1372     {
1373     for (;;)
1374     {
1375     register int c = *code;
1376     if (c == OP_END) return NULL;
1377 nigel 91
1378     /* XCLASS is used for classes that cannot be represented just by a bit
1379     map. This includes negated single high-valued characters. The length in
1380     the table is zero; the actual length is stored in the compiled code. */
1381    
1382     if (c == OP_XCLASS) code += GET(code, 1);
1383    
1384 nigel 93 /* Handle capturing bracket */
1385 nigel 91
1386 nigel 93 else if (c == OP_CBRA)
1387 nigel 77 {
1388 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1389 nigel 77 if (n == number) return (uschar *)code;
1390 nigel 93 code += _pcre_OP_lengths[c];
1391 nigel 77 }
1392 nigel 91
1393 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1394     repeated character types, we have to test for \p and \P, which have an extra
1395 ph10 218 two bytes of parameters. */
1396 nigel 91
1397 nigel 77 else
1398     {
1399 ph10 218 switch(c)
1400     {
1401     case OP_TYPESTAR:
1402     case OP_TYPEMINSTAR:
1403     case OP_TYPEPLUS:
1404     case OP_TYPEMINPLUS:
1405     case OP_TYPEQUERY:
1406     case OP_TYPEMINQUERY:
1407     case OP_TYPEPOSSTAR:
1408     case OP_TYPEPOSPLUS:
1409     case OP_TYPEPOSQUERY:
1410     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1411 ph10 220 break;
1412 ph10 221
1413     case OP_TYPEUPTO:
1414     case OP_TYPEMINUPTO:
1415     case OP_TYPEEXACT:
1416     case OP_TYPEPOSUPTO:
1417     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1418     break;
1419 ph10 220 }
1420    
1421 ph10 218 /* Add in the fixed length from the table */
1422 ph10 220
1423 nigel 77 code += _pcre_OP_lengths[c];
1424 ph10 220
1425 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1426     a multi-byte character. The length in the table is a minimum, so we have to
1427     arrange to skip the extra bytes. */
1428 ph10 220
1429 ph10 107 #ifdef SUPPORT_UTF8
1430 nigel 77 if (utf8) switch(c)
1431     {
1432     case OP_CHAR:
1433     case OP_CHARNC:
1434     case OP_EXACT:
1435     case OP_UPTO:
1436     case OP_MINUPTO:
1437 nigel 93 case OP_POSUPTO:
1438 nigel 77 case OP_STAR:
1439     case OP_MINSTAR:
1440 nigel 93 case OP_POSSTAR:
1441 nigel 77 case OP_PLUS:
1442     case OP_MINPLUS:
1443 nigel 93 case OP_POSPLUS:
1444 nigel 77 case OP_QUERY:
1445     case OP_MINQUERY:
1446 nigel 93 case OP_POSQUERY:
1447     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1448 nigel 77 break;
1449     }
1450 ph10 111 #endif
1451 nigel 77 }
1452     }
1453     }
1454    
1455    
1456    
1457     /*************************************************
1458     * Scan compiled regex for recursion reference *
1459     *************************************************/
1460    
1461     /* This little function scans through a compiled pattern until it finds an
1462     instance of OP_RECURSE.
1463    
1464     Arguments:
1465     code points to start of expression
1466     utf8 TRUE in UTF-8 mode
1467    
1468     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1469     */
1470    
1471     static const uschar *
1472     find_recurse(const uschar *code, BOOL utf8)
1473     {
1474     for (;;)
1475     {
1476     register int c = *code;
1477     if (c == OP_END) return NULL;
1478 nigel 91 if (c == OP_RECURSE) return code;
1479 ph10 220
1480 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1481     map. This includes negated single high-valued characters. The length in
1482     the table is zero; the actual length is stored in the compiled code. */
1483    
1484     if (c == OP_XCLASS) code += GET(code, 1);
1485    
1486 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1487     repeated character types, we have to test for \p and \P, which have an extra
1488 ph10 218 two bytes of parameters. */
1489 nigel 91
1490 nigel 77 else
1491     {
1492 ph10 218 switch(c)
1493     {
1494     case OP_TYPESTAR:
1495     case OP_TYPEMINSTAR:
1496     case OP_TYPEPLUS:
1497     case OP_TYPEMINPLUS:
1498     case OP_TYPEQUERY:
1499     case OP_TYPEMINQUERY:
1500     case OP_TYPEPOSSTAR:
1501     case OP_TYPEPOSPLUS:
1502     case OP_TYPEPOSQUERY:
1503     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1504 ph10 220 break;
1505 ph10 221
1506     case OP_TYPEPOSUPTO:
1507     case OP_TYPEUPTO:
1508     case OP_TYPEMINUPTO:
1509     case OP_TYPEEXACT:
1510     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1511     break;
1512 ph10 220 }
1513    
1514 ph10 218 /* Add in the fixed length from the table */
1515    
1516 nigel 77 code += _pcre_OP_lengths[c];
1517 ph10 220
1518 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1519     by a multi-byte character. The length in the table is a minimum, so we have
1520     to arrange to skip the extra bytes. */
1521 ph10 220
1522 ph10 107 #ifdef SUPPORT_UTF8
1523 nigel 77 if (utf8) switch(c)
1524     {
1525     case OP_CHAR:
1526     case OP_CHARNC:
1527     case OP_EXACT:
1528     case OP_UPTO:
1529     case OP_MINUPTO:
1530 nigel 93 case OP_POSUPTO:
1531 nigel 77 case OP_STAR:
1532     case OP_MINSTAR:
1533 nigel 93 case OP_POSSTAR:
1534 nigel 77 case OP_PLUS:
1535     case OP_MINPLUS:
1536 nigel 93 case OP_POSPLUS:
1537 nigel 77 case OP_QUERY:
1538     case OP_MINQUERY:
1539 nigel 93 case OP_POSQUERY:
1540     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1541 nigel 77 break;
1542     }
1543 ph10 111 #endif
1544 nigel 77 }
1545     }
1546     }
1547    
1548    
1549    
1550     /*************************************************
1551     * Scan compiled branch for non-emptiness *
1552     *************************************************/
1553    
1554     /* This function scans through a branch of a compiled pattern to see whether it
1555 nigel 93 can match the empty string or not. It is called from could_be_empty()
1556     below and from compile_branch() when checking for an unlimited repeat of a
1557     group that can match nothing. Note that first_significant_code() skips over
1558 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1559     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1560     bracket whose current branch will already have been scanned.
1561 nigel 77
1562     Arguments:
1563     code points to start of search
1564     endcode points to where to stop
1565     utf8 TRUE if in UTF8 mode
1566    
1567     Returns: TRUE if what is matched could be empty
1568     */
1569    
1570     static BOOL
1571     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1572     {
1573     register int c;
1574 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1575 nigel 77 code < endcode;
1576     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1577     {
1578     const uschar *ccode;
1579    
1580     c = *code;
1581 ph10 286
1582     /* Skip over forward assertions; the other assertions are skipped by
1583 ph10 282 first_significant_code() with a TRUE final argument. */
1584 ph10 286
1585 ph10 282 if (c == OP_ASSERT)
1586 ph10 286 {
1587 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1588     c = *code;
1589     continue;
1590 ph10 286 }
1591 ph10 172
1592 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1593 nigel 77
1594 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1595 ph10 170 {
1596 ph10 172 code += _pcre_OP_lengths[c];
1597 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1598     c = *code;
1599     continue;
1600     }
1601    
1602     /* For other groups, scan the branches. */
1603 ph10 172
1604 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1605 nigel 77 {
1606     BOOL empty_branch;
1607     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1608    
1609     /* Scan a closed bracket */
1610    
1611     empty_branch = FALSE;
1612     do
1613     {
1614     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1615     empty_branch = TRUE;
1616     code += GET(code, 1);
1617     }
1618     while (*code == OP_ALT);
1619     if (!empty_branch) return FALSE; /* All branches are non-empty */
1620 ph10 172 c = *code;
1621 nigel 93 continue;
1622 nigel 77 }
1623    
1624 nigel 93 /* Handle the other opcodes */
1625    
1626     switch (c)
1627 nigel 77 {
1628 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1629     cannot be represented just by a bit map. This includes negated single
1630     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1631 ph10 220 actual length is stored in the compiled code, so we must update "code"
1632 ph10 216 here. */
1633 nigel 77
1634     #ifdef SUPPORT_UTF8
1635     case OP_XCLASS:
1636 ph10 216 ccode = code += GET(code, 1);
1637 nigel 77 goto CHECK_CLASS_REPEAT;
1638     #endif
1639    
1640     case OP_CLASS:
1641     case OP_NCLASS:
1642     ccode = code + 33;
1643    
1644     #ifdef SUPPORT_UTF8
1645     CHECK_CLASS_REPEAT:
1646     #endif
1647    
1648     switch (*ccode)
1649     {
1650     case OP_CRSTAR: /* These could be empty; continue */
1651     case OP_CRMINSTAR:
1652     case OP_CRQUERY:
1653     case OP_CRMINQUERY:
1654     break;
1655    
1656     default: /* Non-repeat => class must match */
1657     case OP_CRPLUS: /* These repeats aren't empty */
1658     case OP_CRMINPLUS:
1659     return FALSE;
1660    
1661     case OP_CRRANGE:
1662     case OP_CRMINRANGE:
1663     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1664     break;
1665     }
1666     break;
1667    
1668     /* Opcodes that must match a character */
1669    
1670     case OP_PROP:
1671     case OP_NOTPROP:
1672     case OP_EXTUNI:
1673     case OP_NOT_DIGIT:
1674     case OP_DIGIT:
1675     case OP_NOT_WHITESPACE:
1676     case OP_WHITESPACE:
1677     case OP_NOT_WORDCHAR:
1678     case OP_WORDCHAR:
1679     case OP_ANY:
1680     case OP_ANYBYTE:
1681     case OP_CHAR:
1682     case OP_CHARNC:
1683     case OP_NOT:
1684     case OP_PLUS:
1685     case OP_MINPLUS:
1686 nigel 93 case OP_POSPLUS:
1687 nigel 77 case OP_EXACT:
1688     case OP_NOTPLUS:
1689     case OP_NOTMINPLUS:
1690 nigel 93 case OP_NOTPOSPLUS:
1691 nigel 77 case OP_NOTEXACT:
1692     case OP_TYPEPLUS:
1693     case OP_TYPEMINPLUS:
1694 nigel 93 case OP_TYPEPOSPLUS:
1695 nigel 77 case OP_TYPEEXACT:
1696     return FALSE;
1697 ph10 227
1698     /* These are going to continue, as they may be empty, but we have to
1699     fudge the length for the \p and \P cases. */
1700    
1701 ph10 224 case OP_TYPESTAR:
1702     case OP_TYPEMINSTAR:
1703     case OP_TYPEPOSSTAR:
1704     case OP_TYPEQUERY:
1705     case OP_TYPEMINQUERY:
1706     case OP_TYPEPOSQUERY:
1707     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1708 ph10 227 break;
1709    
1710 ph10 224 /* Same for these */
1711 ph10 227
1712 ph10 224 case OP_TYPEUPTO:
1713     case OP_TYPEMINUPTO:
1714     case OP_TYPEPOSUPTO:
1715     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1716     break;
1717 nigel 77
1718     /* End of branch */
1719    
1720     case OP_KET:
1721     case OP_KETRMAX:
1722     case OP_KETRMIN:
1723     case OP_ALT:
1724     return TRUE;
1725    
1726 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1727     MINUPTO, and POSUPTO may be followed by a multibyte character */
1728 nigel 77
1729     #ifdef SUPPORT_UTF8
1730     case OP_STAR:
1731     case OP_MINSTAR:
1732 nigel 93 case OP_POSSTAR:
1733 nigel 77 case OP_QUERY:
1734     case OP_MINQUERY:
1735 nigel 93 case OP_POSQUERY:
1736 nigel 77 case OP_UPTO:
1737     case OP_MINUPTO:
1738 nigel 93 case OP_POSUPTO:
1739 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1740     break;
1741     #endif
1742     }
1743     }
1744    
1745     return TRUE;
1746     }
1747    
1748    
1749    
1750     /*************************************************
1751     * Scan compiled regex for non-emptiness *
1752     *************************************************/
1753    
1754     /* This function is called to check for left recursive calls. We want to check
1755     the current branch of the current pattern to see if it could match the empty
1756     string. If it could, we must look outwards for branches at other levels,
1757     stopping when we pass beyond the bracket which is the subject of the recursion.
1758    
1759     Arguments:
1760     code points to start of the recursion
1761     endcode points to where to stop (current RECURSE item)
1762     bcptr points to the chain of current (unclosed) branch starts
1763     utf8 TRUE if in UTF-8 mode
1764    
1765     Returns: TRUE if what is matched could be empty
1766     */
1767    
1768     static BOOL
1769     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1770     BOOL utf8)
1771     {
1772     while (bcptr != NULL && bcptr->current >= code)
1773     {
1774     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1775     bcptr = bcptr->outer;
1776     }
1777     return TRUE;
1778     }
1779    
1780    
1781    
1782     /*************************************************
1783     * Check for POSIX class syntax *
1784     *************************************************/
1785    
1786     /* This function is called when the sequence "[:" or "[." or "[=" is
1787 ph10 295 encountered in a character class. It checks whether this is followed by a
1788 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1789 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1790 nigel 77
1791 ph10 298 Originally, this function only recognized a sequence of letters between the
1792     terminators, but it seems that Perl recognizes any sequence of characters,
1793     though of course unknown POSIX names are subsequently rejected. Perl gives an
1794     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1795     didn't consider this to be a POSIX class. Likewise for [:1234:].
1796 ph10 295
1797 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1798     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1799     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1800     below handles the special case of \], but does not try to do any other escape
1801     processing. This makes it different from Perl for cases such as [:l\ower:]
1802 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1803 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1804 ph10 295 I think.
1805    
1806     Arguments:
1807 nigel 77 ptr pointer to the initial [
1808     endptr where to return the end pointer
1809    
1810     Returns: TRUE or FALSE
1811     */
1812    
1813     static BOOL
1814 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1815 nigel 77 {
1816     int terminator; /* Don't combine these lines; the Solaris cc */
1817     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1818 ph10 295 for (++ptr; *ptr != 0; ptr++)
1819 nigel 77 {
1820 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1821 ph10 298 {
1822     if (*ptr == ']') return FALSE;
1823 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1824     {
1825     *endptr = ptr;
1826     return TRUE;
1827 ph10 298 }
1828     }
1829     }
1830 nigel 77 return FALSE;
1831     }
1832    
1833    
1834    
1835    
1836     /*************************************************
1837     * Check POSIX class name *
1838     *************************************************/
1839    
1840     /* This function is called to check the name given in a POSIX-style class entry
1841     such as [:alnum:].
1842    
1843     Arguments:
1844     ptr points to the first letter
1845     len the length of the name
1846    
1847     Returns: a value representing the name, or -1 if unknown
1848     */
1849    
1850     static int
1851     check_posix_name(const uschar *ptr, int len)
1852     {
1853 ph10 240 const char *pn = posix_names;
1854 nigel 77 register int yield = 0;
1855     while (posix_name_lengths[yield] != 0)
1856     {
1857     if (len == posix_name_lengths[yield] &&
1858 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1859 ph10 243 pn += posix_name_lengths[yield] + 1;
1860 nigel 77 yield++;
1861     }
1862     return -1;
1863     }
1864    
1865    
1866     /*************************************************
1867     * Adjust OP_RECURSE items in repeated group *
1868     *************************************************/
1869    
1870     /* OP_RECURSE items contain an offset from the start of the regex to the group
1871     that is referenced. This means that groups can be replicated for fixed
1872     repetition simply by copying (because the recursion is allowed to refer to
1873     earlier groups that are outside the current group). However, when a group is
1874 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1875     inserted before it, after it has been compiled. This means that any OP_RECURSE
1876     items within it that refer to the group itself or any contained groups have to
1877     have their offsets adjusted. That is one of the jobs of this function. Before it
1878     is called, the partially compiled regex must be temporarily terminated with
1879     OP_END.
1880 nigel 77
1881 nigel 93 This function has been extended with the possibility of forward references for
1882     recursions and subroutine calls. It must also check the list of such references
1883     for the group we are dealing with. If it finds that one of the recursions in
1884     the current group is on this list, it adjusts the offset in the list, not the
1885     value in the reference (which is a group number).
1886    
1887 nigel 77 Arguments:
1888     group points to the start of the group
1889     adjust the amount by which the group is to be moved
1890     utf8 TRUE in UTF-8 mode
1891     cd contains pointers to tables etc.
1892 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1893 nigel 77
1894     Returns: nothing
1895     */
1896    
1897     static void
1898 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1899     uschar *save_hwm)
1900 nigel 77 {
1901     uschar *ptr = group;
1902 ph10 224
1903 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1904     {
1905 nigel 93 int offset;
1906     uschar *hc;
1907    
1908     /* See if this recursion is on the forward reference list. If so, adjust the
1909     reference. */
1910 ph10 334
1911 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1912     {
1913     offset = GET(hc, 0);
1914     if (cd->start_code + offset == ptr + 1)
1915     {
1916     PUT(hc, 0, offset + adjust);
1917     break;
1918     }
1919     }
1920    
1921     /* Otherwise, adjust the recursion offset if it's after the start of this
1922     group. */
1923    
1924     if (hc >= cd->hwm)
1925     {
1926     offset = GET(ptr, 1);
1927     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1928     }
1929    
1930 nigel 77 ptr += 1 + LINK_SIZE;
1931     }
1932     }
1933    
1934    
1935    
1936     /*************************************************
1937     * Insert an automatic callout point *
1938     *************************************************/
1939    
1940     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1941     callout points before each pattern item.
1942    
1943     Arguments:
1944     code current code pointer
1945     ptr current pattern pointer
1946     cd pointers to tables etc
1947    
1948     Returns: new code pointer
1949     */
1950    
1951     static uschar *
1952     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1953     {
1954     *code++ = OP_CALLOUT;
1955     *code++ = 255;
1956     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1957     PUT(code, LINK_SIZE, 0); /* Default length */
1958     return code + 2*LINK_SIZE;
1959     }
1960    
1961    
1962    
1963     /*************************************************
1964     * Complete a callout item *
1965     *************************************************/
1966    
1967     /* A callout item contains the length of the next item in the pattern, which
1968     we can't fill in till after we have reached the relevant point. This is used
1969     for both automatic and manual callouts.
1970    
1971     Arguments:
1972     previous_callout points to previous callout item
1973     ptr current pattern pointer
1974     cd pointers to tables etc
1975    
1976     Returns: nothing
1977     */
1978    
1979     static void
1980     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1981     {
1982     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1983     PUT(previous_callout, 2 + LINK_SIZE, length);
1984     }
1985    
1986    
1987    
1988     #ifdef SUPPORT_UCP
1989     /*************************************************
1990     * Get othercase range *
1991     *************************************************/
1992    
1993     /* This function is passed the start and end of a class range, in UTF-8 mode
1994     with UCP support. It searches up the characters, looking for internal ranges of
1995     characters in the "other" case. Each call returns the next one, updating the
1996     start address.
1997    
1998     Arguments:
1999     cptr points to starting character value; updated
2000     d end value
2001     ocptr where to put start of othercase range
2002     odptr where to put end of othercase range
2003    
2004     Yield: TRUE when range returned; FALSE when no more
2005     */
2006    
2007     static BOOL
2008 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2009     unsigned int *odptr)
2010 nigel 77 {
2011 nigel 93 unsigned int c, othercase, next;
2012 nigel 77
2013     for (c = *cptr; c <= d; c++)
2014 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
2015 nigel 77
2016     if (c > d) return FALSE;
2017    
2018     *ocptr = othercase;
2019     next = othercase + 1;
2020    
2021     for (++c; c <= d; c++)
2022     {
2023 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
2024 nigel 77 next++;
2025     }
2026    
2027     *odptr = next - 1;
2028     *cptr = c;
2029    
2030     return TRUE;
2031     }
2032     #endif /* SUPPORT_UCP */
2033    
2034    
2035 nigel 93
2036 nigel 77 /*************************************************
2037 nigel 93 * Check if auto-possessifying is possible *
2038     *************************************************/
2039    
2040     /* This function is called for unlimited repeats of certain items, to see
2041     whether the next thing could possibly match the repeated item. If not, it makes
2042     sense to automatically possessify the repeated item.
2043    
2044     Arguments:
2045     op_code the repeated op code
2046     item data for this item, depends on the opcode
2047     utf8 TRUE in UTF-8 mode
2048     utf8_char used for utf8 character bytes, NULL if not relevant
2049     ptr next character in pattern
2050     options options bits
2051     cd contains pointers to tables etc.
2052    
2053     Returns: TRUE if possessifying is wanted
2054     */
2055    
2056     static BOOL
2057     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2058     const uschar *ptr, int options, compile_data *cd)
2059     {
2060     int next;
2061    
2062     /* Skip whitespace and comments in extended mode */
2063    
2064     if ((options & PCRE_EXTENDED) != 0)
2065     {
2066     for (;;)
2067     {
2068     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2069     if (*ptr == '#')
2070     {
2071     while (*(++ptr) != 0)
2072     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2073     }
2074     else break;
2075     }
2076     }
2077    
2078     /* If the next item is one that we can handle, get its value. A non-negative
2079     value is a character, a negative value is an escape value. */
2080    
2081     if (*ptr == '\\')
2082     {
2083     int temperrorcode = 0;
2084     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2085     if (temperrorcode != 0) return FALSE;
2086     ptr++; /* Point after the escape sequence */
2087     }
2088    
2089     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2090     {
2091     #ifdef SUPPORT_UTF8
2092     if (utf8) { GETCHARINC(next, ptr); } else
2093     #endif
2094     next = *ptr++;
2095     }
2096    
2097     else return FALSE;
2098    
2099     /* Skip whitespace and comments in extended mode */
2100    
2101     if ((options & PCRE_EXTENDED) != 0)
2102     {
2103     for (;;)
2104     {
2105     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2106     if (*ptr == '#')
2107     {
2108     while (*(++ptr) != 0)
2109     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2110     }
2111     else break;
2112     }
2113     }
2114    
2115     /* If the next thing is itself optional, we have to give up. */
2116    
2117     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2118     return FALSE;
2119    
2120     /* Now compare the next item with the previous opcode. If the previous is a
2121     positive single character match, "item" either contains the character or, if
2122     "item" is greater than 127 in utf8 mode, the character's bytes are in
2123     utf8_char. */
2124    
2125    
2126     /* Handle cases when the next item is a character. */
2127    
2128     if (next >= 0) switch(op_code)
2129     {
2130     case OP_CHAR:
2131     #ifdef SUPPORT_UTF8
2132     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2133     #endif
2134     return item != next;
2135    
2136     /* For CHARNC (caseless character) we must check the other case. If we have
2137     Unicode property support, we can use it to test the other case of
2138     high-valued characters. */
2139    
2140     case OP_CHARNC:
2141     #ifdef SUPPORT_UTF8
2142     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2143     #endif
2144     if (item == next) return FALSE;
2145     #ifdef SUPPORT_UTF8
2146     if (utf8)
2147     {
2148     unsigned int othercase;
2149     if (next < 128) othercase = cd->fcc[next]; else
2150     #ifdef SUPPORT_UCP
2151     othercase = _pcre_ucp_othercase((unsigned int)next);
2152     #else
2153     othercase = NOTACHAR;
2154     #endif
2155     return (unsigned int)item != othercase;
2156     }
2157     else
2158     #endif /* SUPPORT_UTF8 */
2159     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2160    
2161     /* For OP_NOT, "item" must be a single-byte character. */
2162    
2163     case OP_NOT:
2164     if (item == next) return TRUE;
2165     if ((options & PCRE_CASELESS) == 0) return FALSE;
2166     #ifdef SUPPORT_UTF8
2167     if (utf8)
2168     {
2169     unsigned int othercase;
2170     if (next < 128) othercase = cd->fcc[next]; else
2171     #ifdef SUPPORT_UCP
2172     othercase = _pcre_ucp_othercase(next);
2173     #else
2174     othercase = NOTACHAR;
2175     #endif
2176     return (unsigned int)item == othercase;
2177     }
2178     else
2179     #endif /* SUPPORT_UTF8 */
2180     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2181    
2182     case OP_DIGIT:
2183     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2184    
2185     case OP_NOT_DIGIT:
2186     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2187    
2188     case OP_WHITESPACE:
2189     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2190    
2191     case OP_NOT_WHITESPACE:
2192     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2193    
2194     case OP_WORDCHAR:
2195     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2196    
2197     case OP_NOT_WORDCHAR:
2198     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2199    
2200 ph10 180 case OP_HSPACE:
2201     case OP_NOT_HSPACE:
2202     switch(next)
2203     {
2204     case 0x09:
2205     case 0x20:
2206     case 0xa0:
2207     case 0x1680:
2208     case 0x180e:
2209     case 0x2000:
2210     case 0x2001:
2211     case 0x2002:
2212     case 0x2003:
2213     case 0x2004:
2214     case 0x2005:
2215     case 0x2006:
2216     case 0x2007:
2217     case 0x2008:
2218     case 0x2009:
2219     case 0x200A:
2220     case 0x202f:
2221     case 0x205f:
2222     case 0x3000:
2223     return op_code != OP_HSPACE;
2224     default:
2225     return op_code == OP_HSPACE;
2226     }
2227    
2228     case OP_VSPACE:
2229     case OP_NOT_VSPACE:
2230     switch(next)
2231     {
2232     case 0x0a:
2233     case 0x0b:
2234     case 0x0c:
2235     case 0x0d:
2236     case 0x85:
2237     case 0x2028:
2238     case 0x2029:
2239     return op_code != OP_VSPACE;
2240     default:
2241     return op_code == OP_VSPACE;
2242     }
2243    
2244 nigel 93 default:
2245     return FALSE;
2246     }
2247    
2248    
2249     /* Handle the case when the next item is \d, \s, etc. */
2250    
2251     switch(op_code)
2252     {
2253     case OP_CHAR:
2254     case OP_CHARNC:
2255     #ifdef SUPPORT_UTF8
2256     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2257     #endif
2258     switch(-next)
2259     {
2260     case ESC_d:
2261     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2262    
2263     case ESC_D:
2264     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2265    
2266     case ESC_s:
2267     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2268    
2269     case ESC_S:
2270     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2271    
2272     case ESC_w:
2273     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2274    
2275     case ESC_W:
2276     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2277 ph10 182
2278 ph10 180 case ESC_h:
2279     case ESC_H:
2280     switch(item)
2281     {
2282     case 0x09:
2283     case 0x20:
2284     case 0xa0:
2285     case 0x1680:
2286     case 0x180e:
2287     case 0x2000:
2288     case 0x2001:
2289     case 0x2002:
2290     case 0x2003:
2291     case 0x2004:
2292     case 0x2005:
2293     case 0x2006:
2294     case 0x2007:
2295     case 0x2008:
2296     case 0x2009:
2297     case 0x200A:
2298     case 0x202f:
2299     case 0x205f:
2300     case 0x3000:
2301     return -next != ESC_h;
2302     default:
2303     return -next == ESC_h;
2304 ph10 182 }
2305    
2306 ph10 180 case ESC_v:
2307     case ESC_V:
2308     switch(item)
2309     {
2310     case 0x0a:
2311     case 0x0b:
2312     case 0x0c:
2313     case 0x0d:
2314     case 0x85:
2315     case 0x2028:
2316     case 0x2029:
2317     return -next != ESC_v;
2318     default:
2319     return -next == ESC_v;
2320 ph10 182 }
2321 nigel 93
2322     default:
2323     return FALSE;
2324     }
2325    
2326     case OP_DIGIT:
2327 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2328     next == -ESC_h || next == -ESC_v;
2329 nigel 93
2330     case OP_NOT_DIGIT:
2331     return next == -ESC_d;
2332    
2333     case OP_WHITESPACE:
2334     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2335    
2336     case OP_NOT_WHITESPACE:
2337 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2338 nigel 93
2339 ph10 180 case OP_HSPACE:
2340     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2341    
2342     case OP_NOT_HSPACE:
2343     return next == -ESC_h;
2344 ph10 182
2345 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2346 ph10 182 case OP_VSPACE:
2347 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2348    
2349     case OP_NOT_VSPACE:
2350 ph10 182 return next == -ESC_v;
2351 ph10 180
2352 nigel 93 case OP_WORDCHAR:
2353 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2354 nigel 93
2355     case OP_NOT_WORDCHAR:
2356     return next == -ESC_w || next == -ESC_d;
2357 ph10 182
2358 nigel 93 default:
2359     return FALSE;
2360     }
2361    
2362     /* Control does not reach here */
2363     }
2364    
2365    
2366    
2367     /*************************************************
2368 nigel 77 * Compile one branch *
2369     *************************************************/
2370    
2371 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2372 nigel 77 changed during the branch, the pointer is used to change the external options
2373 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2374     to find out the amount of memory needed, as well as during the real compile
2375     phase. The value of lengthptr distinguishes the two phases.
2376 nigel 77
2377     Arguments:
2378     optionsptr pointer to the option bits
2379     codeptr points to the pointer to the current code point
2380     ptrptr points to the current pattern pointer
2381     errorcodeptr points to error code variable
2382     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2383     reqbyteptr set to the last literal character required, else < 0
2384     bcptr points to current branch chain
2385     cd contains pointers to tables etc.
2386 nigel 93 lengthptr NULL during the real compile phase
2387     points to length accumulator during pre-compile phase
2388 nigel 77
2389     Returns: TRUE on success
2390     FALSE, with *errorcodeptr set non-zero on error
2391     */
2392    
2393     static BOOL
2394 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2395     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2396     compile_data *cd, int *lengthptr)
2397 nigel 77 {
2398     int repeat_type, op_type;
2399     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2400     int bravalue = 0;
2401     int greedy_default, greedy_non_default;
2402     int firstbyte, reqbyte;
2403     int zeroreqbyte, zerofirstbyte;
2404     int req_caseopt, reqvary, tempreqvary;
2405     int options = *optionsptr;
2406     int after_manual_callout = 0;
2407 nigel 93 int length_prevgroup = 0;
2408 nigel 77 register int c;
2409     register uschar *code = *codeptr;
2410 nigel 93 uschar *last_code = code;
2411     uschar *orig_code = code;
2412 nigel 77 uschar *tempcode;
2413     BOOL inescq = FALSE;
2414     BOOL groupsetfirstbyte = FALSE;
2415     const uschar *ptr = *ptrptr;
2416     const uschar *tempptr;
2417     uschar *previous = NULL;
2418     uschar *previous_callout = NULL;
2419 nigel 93 uschar *save_hwm = NULL;
2420 nigel 77 uschar classbits[32];
2421    
2422     #ifdef SUPPORT_UTF8
2423     BOOL class_utf8;
2424     BOOL utf8 = (options & PCRE_UTF8) != 0;
2425     uschar *class_utf8data;
2426 ph10 300 uschar *class_utf8data_base;
2427 nigel 77 uschar utf8_char[6];
2428     #else
2429     BOOL utf8 = FALSE;
2430 nigel 93 uschar *utf8_char = NULL;
2431 nigel 77 #endif
2432    
2433 nigel 93 #ifdef DEBUG
2434     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2435     #endif
2436    
2437 nigel 77 /* Set up the default and non-default settings for greediness */
2438    
2439     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2440     greedy_non_default = greedy_default ^ 1;
2441    
2442     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2443     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2444     matches a non-fixed char first char; reqbyte just remains unset if we never
2445     find one.
2446    
2447     When we hit a repeat whose minimum is zero, we may have to adjust these values
2448     to take the zero repeat into account. This is implemented by setting them to
2449     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2450     item types that can be repeated set these backoff variables appropriately. */
2451    
2452     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2453    
2454     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2455     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2456     value > 255. It is added into the firstbyte or reqbyte variables to record the
2457     case status of the value. This is used only for ASCII characters. */
2458    
2459     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2460    
2461     /* Switch on next character until the end of the branch */
2462    
2463     for (;; ptr++)
2464     {
2465     BOOL negate_class;
2466 ph10 286 BOOL should_flip_negation;
2467 nigel 77 BOOL possessive_quantifier;
2468     BOOL is_quantifier;
2469 nigel 93 BOOL is_recurse;
2470 ph10 180 BOOL reset_bracount;
2471 nigel 77 int class_charcount;
2472     int class_lastchar;
2473     int newoptions;
2474     int recno;
2475 ph10 172 int refsign;
2476 nigel 77 int skipbytes;
2477     int subreqbyte;
2478     int subfirstbyte;
2479 nigel 93 int terminator;
2480 nigel 77 int mclength;
2481     uschar mcbuffer[8];
2482    
2483 nigel 93 /* Get next byte in the pattern */
2484 nigel 77
2485     c = *ptr;
2486 ph10 334
2487 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2488     previous cycle of this loop. */
2489    
2490     if (lengthptr != NULL)
2491     {
2492     #ifdef DEBUG
2493     if (code > cd->hwm) cd->hwm = code; /* High water info */
2494     #endif
2495     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2496     {
2497     *errorcodeptr = ERR52;
2498     goto FAILED;
2499     }
2500    
2501     /* There is at least one situation where code goes backwards: this is the
2502     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2503     the class is simply eliminated. However, it is created first, so we have to
2504     allow memory for it. Therefore, don't ever reduce the length at this point.
2505     */
2506    
2507     if (code < last_code) code = last_code;
2508 ph10 202
2509     /* Paranoid check for integer overflow */
2510    
2511     if (OFLOW_MAX - *lengthptr < code - last_code)
2512     {
2513     *errorcodeptr = ERR20;
2514     goto FAILED;
2515     }
2516    
2517 nigel 93 *lengthptr += code - last_code;
2518     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2519    
2520     /* If "previous" is set and it is not at the start of the work space, move
2521     it back to there, in order to avoid filling up the work space. Otherwise,
2522     if "previous" is NULL, reset the current code pointer to the start. */
2523    
2524     if (previous != NULL)
2525     {
2526     if (previous > orig_code)
2527     {
2528     memmove(orig_code, previous, code - previous);
2529     code -= previous - orig_code;
2530     previous = orig_code;
2531     }
2532     }
2533     else code = orig_code;
2534    
2535     /* Remember where this code item starts so we can pick up the length
2536     next time round. */
2537    
2538     last_code = code;
2539     }
2540    
2541     /* In the real compile phase, just check the workspace used by the forward
2542     reference list. */
2543    
2544     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2545     {
2546     *errorcodeptr = ERR52;
2547     goto FAILED;
2548     }
2549    
2550 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2551    
2552     if (inescq && c != 0)
2553     {
2554     if (c == '\\' && ptr[1] == 'E')
2555     {
2556     inescq = FALSE;
2557     ptr++;
2558     continue;
2559     }
2560     else
2561     {
2562     if (previous_callout != NULL)
2563     {
2564 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2565     complete_callout(previous_callout, ptr, cd);
2566 nigel 77 previous_callout = NULL;
2567     }
2568     if ((options & PCRE_AUTO_CALLOUT) != 0)
2569     {
2570     previous_callout = code;
2571     code = auto_callout(code, ptr, cd);
2572     }
2573     goto NORMAL_CHAR;
2574     }
2575     }
2576    
2577     /* Fill in length of a previous callout, except when the next thing is
2578     a quantifier. */
2579    
2580     is_quantifier = c == '*' || c == '+' || c == '?' ||
2581     (c == '{' && is_counted_repeat(ptr+1));
2582    
2583     if (!is_quantifier && previous_callout != NULL &&
2584     after_manual_callout-- <= 0)
2585     {
2586 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2587     complete_callout(previous_callout, ptr, cd);
2588 nigel 77 previous_callout = NULL;
2589     }
2590    
2591     /* In extended mode, skip white space and comments */
2592    
2593     if ((options & PCRE_EXTENDED) != 0)
2594     {
2595     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2596     if (c == '#')
2597     {
2598 nigel 93 while (*(++ptr) != 0)
2599 nigel 91 {
2600 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2601 nigel 91 }
2602 nigel 93 if (*ptr != 0) continue;
2603    
2604 nigel 91 /* Else fall through to handle end of string */
2605     c = 0;
2606 nigel 77 }
2607     }
2608    
2609     /* No auto callout for quantifiers. */
2610    
2611     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2612     {
2613     previous_callout = code;
2614     code = auto_callout(code, ptr, cd);
2615     }
2616    
2617     switch(c)
2618     {
2619 nigel 93 /* ===================================================================*/
2620     case 0: /* The branch terminates at string end */
2621     case '|': /* or | or ) */
2622 nigel 77 case ')':
2623     *firstbyteptr = firstbyte;
2624     *reqbyteptr = reqbyte;
2625     *codeptr = code;
2626     *ptrptr = ptr;
2627 nigel 93 if (lengthptr != NULL)
2628     {
2629 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2630     {
2631     *errorcodeptr = ERR20;
2632     goto FAILED;
2633     }
2634 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2635     DPRINTF((">> end branch\n"));
2636     }
2637 nigel 77 return TRUE;
2638    
2639 nigel 93
2640     /* ===================================================================*/
2641 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2642     the setting of any following char as a first character. */
2643    
2644     case '^':
2645     if ((options & PCRE_MULTILINE) != 0)
2646     {
2647     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2648     }
2649     previous = NULL;
2650     *code++ = OP_CIRC;
2651     break;
2652    
2653     case '$':
2654     previous = NULL;
2655     *code++ = OP_DOLL;
2656     break;
2657    
2658     /* There can never be a first char if '.' is first, whatever happens about
2659     repeats. The value of reqbyte doesn't change either. */
2660    
2661     case '.':
2662     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2663     zerofirstbyte = firstbyte;
2664     zeroreqbyte = reqbyte;
2665     previous = code;
2666     *code++ = OP_ANY;
2667     break;
2668    
2669 nigel 93
2670     /* ===================================================================*/
2671 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2672     32-byte bitmap of the permitted characters, except in the special case
2673     where there is only one such character. For negated classes, we build the
2674     map as usual, then invert it at the end. However, we use a different opcode
2675     so that data characters > 255 can be handled correctly.
2676 nigel 77
2677     If the class contains characters outside the 0-255 range, a different
2678     opcode is compiled. It may optionally have a bit map for characters < 256,
2679     but those above are explicitly listed afterwards. A flag byte tells
2680     whether the bitmap is present, and whether this is a negated class or not.
2681 ph10 336
2682     In JavaScript compatibility mode, an isolated ']' causes an error. In
2683     default (Perl) mode, it is treated as a data character. */
2684    
2685     case ']':
2686     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2687     {
2688     *errorcodeptr = ERR64;
2689     goto FAILED;
2690     }
2691     goto NORMAL_CHAR;
2692 nigel 77
2693     case '[':
2694     previous = code;
2695    
2696     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2697     they are encountered at the top level, so we'll do that too. */
2698    
2699     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2700 ph10 295 check_posix_syntax(ptr, &tempptr))
2701 nigel 77 {
2702     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2703     goto FAILED;
2704     }
2705    
2706 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2707 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2708 ph10 205 skip them too. This makes for compatibility with Perl. */
2709 ph10 208
2710 ph10 205 negate_class = FALSE;
2711     for (;;)
2712 nigel 77 {
2713     c = *(++ptr);
2714 ph10 205 if (c == '\\')
2715     {
2716 ph10 208 if (ptr[1] == 'E') ptr++;
2717 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2718 ph10 208 else break;
2719 ph10 205 }
2720     else if (!negate_class && c == '^')
2721     negate_class = TRUE;
2722     else break;
2723 ph10 208 }
2724 nigel 77
2725 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2726     negation flag at the end, so that support for characters > 255 works
2727 ph10 264 correctly (they are all included in the class). */
2728    
2729     should_flip_negation = FALSE;
2730    
2731 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2732 nigel 93 of just a single character (as long as it's < 256). However, for higher
2733     valued UTF-8 characters, we don't yet do any optimization. */
2734 nigel 77
2735     class_charcount = 0;
2736     class_lastchar = -1;
2737    
2738 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2739     temporary bit of memory, in case the class contains only 1 character (less
2740     than 256), because in that case the compiled code doesn't use the bit map.
2741     */
2742    
2743     memset(classbits, 0, 32 * sizeof(uschar));
2744    
2745 nigel 77 #ifdef SUPPORT_UTF8
2746     class_utf8 = FALSE; /* No chars >= 256 */
2747 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2748 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2749 nigel 77 #endif
2750    
2751     /* Process characters until ] is reached. By writing this as a "do" it
2752 nigel 93 means that an initial ] is taken as a data character. At the start of the
2753     loop, c contains the first byte of the character. */
2754 nigel 77
2755 nigel 93 if (c != 0) do
2756 nigel 77 {
2757 nigel 93 const uschar *oldptr;
2758    
2759 nigel 77 #ifdef SUPPORT_UTF8
2760     if (utf8 && c > 127)
2761     { /* Braces are required because the */
2762     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2763     }
2764 ph10 309
2765 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2766 ph10 309 data and reset the pointer. This is so that very large classes that
2767 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2768 ph10 309 (which is on the stack). */
2769    
2770 ph10 300 if (lengthptr != NULL)
2771     {
2772     *lengthptr += class_utf8data - class_utf8data_base;
2773 ph10 309 class_utf8data = class_utf8data_base;
2774     }
2775    
2776 nigel 77 #endif
2777    
2778     /* Inside \Q...\E everything is literal except \E */
2779    
2780     if (inescq)
2781     {
2782 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2783 nigel 77 {
2784 nigel 93 inescq = FALSE; /* Reset literal state */
2785     ptr++; /* Skip the 'E' */
2786     continue; /* Carry on with next */
2787 nigel 77 }
2788 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2789 nigel 77 }
2790    
2791     /* Handle POSIX class names. Perl allows a negation extension of the
2792     form [:^name:]. A square bracket that doesn't match the syntax is
2793     treated as a literal. We also recognize the POSIX constructions
2794     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2795     5.6 and 5.8 do. */
2796    
2797     if (c == '[' &&
2798     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2799 ph10 295 check_posix_syntax(ptr, &tempptr))
2800 nigel 77 {
2801     BOOL local_negate = FALSE;
2802 nigel 87 int posix_class, taboffset, tabopt;
2803 nigel 77 register const uschar *cbits = cd->cbits;
2804 nigel 87 uschar pbits[32];
2805 nigel 77
2806     if (ptr[1] != ':')
2807     {
2808     *errorcodeptr = ERR31;
2809     goto FAILED;
2810     }
2811    
2812     ptr += 2;
2813     if (*ptr == '^')
2814     {
2815     local_negate = TRUE;
2816 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2817 nigel 77 ptr++;
2818     }
2819    
2820     posix_class = check_posix_name(ptr, tempptr - ptr);
2821     if (posix_class < 0)
2822     {
2823     *errorcodeptr = ERR30;
2824     goto FAILED;
2825     }
2826    
2827     /* If matching is caseless, upper and lower are converted to
2828     alpha. This relies on the fact that the class table starts with
2829     alpha, lower, upper as the first 3 entries. */
2830    
2831     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2832     posix_class = 0;
2833    
2834 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2835     because we may be adding and subtracting from it, and we don't want to
2836     subtract bits that may be in the main map already. At the end we or the
2837     result into the bit map that is being built. */
2838 nigel 77
2839     posix_class *= 3;
2840 nigel 87
2841     /* Copy in the first table (always present) */
2842    
2843     memcpy(pbits, cbits + posix_class_maps[posix_class],
2844     32 * sizeof(uschar));
2845    
2846     /* If there is a second table, add or remove it as required. */
2847    
2848     taboffset = posix_class_maps[posix_class + 1];
2849     tabopt = posix_class_maps[posix_class + 2];
2850    
2851     if (taboffset >= 0)
2852 nigel 77 {
2853 nigel 87 if (tabopt >= 0)
2854     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2855 nigel 77 else
2856 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2857 nigel 77 }
2858    
2859 nigel 87 /* Now see if we need to remove any special characters. An option
2860     value of 1 removes vertical space and 2 removes underscore. */
2861    
2862     if (tabopt < 0) tabopt = -tabopt;
2863     if (tabopt == 1) pbits[1] &= ~0x3c;
2864     else if (tabopt == 2) pbits[11] &= 0x7f;
2865    
2866     /* Add the POSIX table or its complement into the main table that is
2867     being built and we are done. */
2868    
2869     if (local_negate)
2870     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2871     else
2872     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2873    
2874 nigel 77 ptr = tempptr + 1;
2875     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2876     continue; /* End of POSIX syntax handling */
2877     }
2878    
2879     /* Backslash may introduce a single character, or it may introduce one
2880 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2881     case. Inside a class (and only there) it is treated as backspace.
2882     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2883 ph10 205 to 'or' into the one we are building. We assume they have more than one
2884 nigel 77 character in them, so set class_charcount bigger than one. */
2885    
2886     if (c == '\\')
2887     {
2888 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2889     if (*errorcodeptr != 0) goto FAILED;
2890 nigel 77
2891 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2892 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2893 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2894 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2895     {
2896     if (ptr[1] == '\\' && ptr[2] == 'E')
2897     {
2898     ptr += 2; /* avoid empty string */
2899     }
2900     else inescq = TRUE;
2901     continue;
2902     }
2903 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2904 nigel 77
2905     if (c < 0)
2906     {
2907     register const uschar *cbits = cd->cbits;
2908     class_charcount += 2; /* Greater than 1 is what matters */
2909 nigel 93
2910     /* Save time by not doing this in the pre-compile phase. */
2911    
2912     if (lengthptr == NULL) switch (-c)
2913 nigel 77 {
2914     case ESC_d:
2915     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2916     continue;
2917    
2918     case ESC_D:
2919 ph10 286 should_flip_negation = TRUE;
2920 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2921     continue;
2922    
2923     case ESC_w:
2924     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2925     continue;
2926    
2927     case ESC_W:
2928 ph10 286 should_flip_negation = TRUE;
2929 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2930     continue;
2931    
2932     case ESC_s:
2933     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2934     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2935     continue;
2936    
2937     case ESC_S:
2938 ph10 286 should_flip_negation = TRUE;
2939 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2940     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2941     continue;
2942    
2943 nigel 93 default: /* Not recognized; fall through */
2944     break; /* Need "default" setting to stop compiler warning. */
2945     }
2946    
2947     /* In the pre-compile phase, just do the recognition. */
2948    
2949     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2950     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2951 ph10 180
2952 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2953     they use extra memory. */
2954 ph10 180
2955 ph10 178 if (-c == ESC_h)
2956     {
2957     SETBIT(classbits, 0x09); /* HT */
2958     SETBIT(classbits, 0x20); /* SPACE */
2959 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2960 ph10 178 #ifdef SUPPORT_UTF8
2961     if (utf8)
2962 ph10 180 {
2963 ph10 178 class_utf8 = TRUE;
2964     *class_utf8data++ = XCL_SINGLE;
2965 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2966 ph10 178 *class_utf8data++ = XCL_SINGLE;
2967 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2968     *class_utf8data++ = XCL_RANGE;
2969     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2970     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2971 ph10 178 *class_utf8data++ = XCL_SINGLE;
2972 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2973 ph10 178 *class_utf8data++ = XCL_SINGLE;
2974 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2975 ph10 178 *class_utf8data++ = XCL_SINGLE;
2976 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2977     }
2978     #endif
2979     continue;
2980     }
2981 nigel 93
2982 ph10 178 if (-c == ESC_H)
2983     {
2984     for (c = 0; c < 32; c++)
2985     {
2986     int x = 0xff;
2987     switch (c)
2988 ph10 180 {
2989 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2990     case 0x20/8: x ^= 1 << (0x20%8); break;
2991     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2992     default: break;
2993     }
2994     classbits[c] |= x;
2995 ph10 180 }
2996    
2997 ph10 178 #ifdef SUPPORT_UTF8
2998     if (utf8)
2999 ph10 180 {
3000 ph10 178 class_utf8 = TRUE;
3001 ph10 180 *class_utf8data++ = XCL_RANGE;
3002     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3003     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3004     *class_utf8data++ = XCL_RANGE;
3005     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3006     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3007     *class_utf8data++ = XCL_RANGE;
3008     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3009     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3010     *class_utf8data++ = XCL_RANGE;
3011     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3012     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3013     *class_utf8data++ = XCL_RANGE;
3014     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3015     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3016     *class_utf8data++ = XCL_RANGE;
3017     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3018     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3019     *class_utf8data++ = XCL_RANGE;
3020     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3021     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3022     }
3023     #endif
3024     continue;
3025     }
3026 ph10 178
3027     if (-c == ESC_v)
3028     {
3029     SETBIT(classbits, 0x0a); /* LF */
3030     SETBIT(classbits, 0x0b); /* VT */
3031 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3032     SETBIT(classbits, 0x0d); /* CR */
3033     SETBIT(classbits, 0x85); /* NEL */
3034 ph10 178 #ifdef SUPPORT_UTF8
3035     if (utf8)
3036 ph10 180 {
3037 ph10 178 class_utf8 = TRUE;
3038 ph10 180 *class_utf8data++ = XCL_RANGE;
3039     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3040     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3041     }
3042     #endif
3043     continue;
3044     }
3045 ph10 178
3046     if (-c == ESC_V)
3047     {
3048     for (c = 0; c < 32; c++)
3049     {
3050     int x = 0xff;
3051     switch (c)
3052 ph10 180 {
3053 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3054     x ^= 1 << (0x0b%8);
3055     x ^= 1 << (0x0c%8);
3056 ph10 180 x ^= 1 << (0x0d%8);
3057 ph10 178 break;
3058     case 0x85/8: x ^= 1 << (0x85%8); break;
3059     default: break;
3060     }
3061     classbits[c] |= x;
3062 ph10 180 }
3063    
3064 ph10 178 #ifdef SUPPORT_UTF8
3065     if (utf8)
3066 ph10 180 {
3067 ph10 178 class_utf8 = TRUE;
3068 ph10 180 *class_utf8data++ = XCL_RANGE;
3069     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3070     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3071     *class_utf8data++ = XCL_RANGE;
3072     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3073     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3074     }
3075     #endif
3076     continue;
3077     }
3078 ph10 178
3079 nigel 93 /* We need to deal with \P and \p in both phases. */
3080    
3081 nigel 77 #ifdef SUPPORT_UCP
3082 nigel 93 if (-c == ESC_p || -c == ESC_P)
3083     {
3084     BOOL negated;
3085     int pdata;
3086     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3087     if (ptype < 0) goto FAILED;
3088     class_utf8 = TRUE;
3089     *class_utf8data++ = ((-c == ESC_p) != negated)?
3090     XCL_PROP : XCL_NOTPROP;
3091     *class_utf8data++ = ptype;
3092     *class_utf8data++ = pdata;
3093     class_charcount -= 2; /* Not a < 256 character */
3094 nigel 77 continue;
3095 nigel 93 }
3096 nigel 77 #endif
3097 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3098     strict mode. By default, for compatibility with Perl, they are
3099     treated as literals. */
3100 nigel 77
3101 nigel 93 if ((options & PCRE_EXTRA) != 0)
3102     {
3103     *errorcodeptr = ERR7;
3104     goto FAILED;
3105     }
3106 nigel 77
3107 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3108     c = *ptr; /* Get the final character and fall through */
3109 nigel 77 }
3110    
3111     /* Fall through if we have a single character (c >= 0). This may be
3112 nigel 93 greater than 256 in UTF-8 mode. */
3113 nigel 77
3114     } /* End of backslash handling */
3115    
3116     /* A single character may be followed by '-' to form a range. However,
3117     Perl does not permit ']' to be the end of the range. A '-' character
3118 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3119     entirely. The code for handling \Q and \E is messy. */
3120 nigel 77
3121 nigel 93 CHECK_RANGE:
3122     while (ptr[1] == '\\' && ptr[2] == 'E')
3123 nigel 77 {
3124 nigel 93 inescq = FALSE;
3125     ptr += 2;
3126     }
3127    
3128     oldptr = ptr;
3129 ph10 231
3130 ph10 230 /* Remember \r or \n */
3131 ph10 231
3132     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3133    
3134 ph10 230 /* Check for range */
3135 nigel 93
3136     if (!inescq && ptr[1] == '-')
3137     {
3138 nigel 77 int d;
3139     ptr += 2;
3140 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3141 nigel 77
3142 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3143     mode. */
3144    
3145     while (*ptr == '\\' && ptr[1] == 'Q')
3146     {
3147     ptr += 2;
3148     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3149     inescq = TRUE;
3150     break;
3151     }
3152    
3153     if (*ptr == 0 || (!inescq && *ptr == ']'))
3154     {
3155     ptr = oldptr;
3156     goto LONE_SINGLE_CHARACTER;
3157     }
3158    
3159 nigel 77 #ifdef SUPPORT_UTF8
3160     if (utf8)
3161     { /* Braces are required because the */
3162     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3163     }
3164     else
3165     #endif
3166     d = *ptr; /* Not UTF-8 mode */
3167    
3168     /* The second part of a range can be a single-character escape, but
3169     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3170     in such circumstances. */
3171    
3172 nigel 93 if (!inescq && d == '\\')
3173 nigel 77 {
3174 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3175     if (*errorcodeptr != 0) goto FAILED;
3176 nigel 77
3177 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3178 nigel 93 special means the '-' was literal */
3179 nigel 77
3180     if (d < 0)
3181     {
3182     if (d == -ESC_b) d = '\b';
3183 nigel 93 else if (d == -ESC_X) d = 'X';
3184     else if (d == -ESC_R) d = 'R'; else
3185 nigel 77 {
3186 nigel 93 ptr = oldptr;
3187 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3188     }
3189     }
3190     }
3191    
3192 nigel 93 /* Check that the two values are in the correct order. Optimize
3193     one-character ranges */
3194 nigel 77
3195 nigel 93 if (d < c)
3196     {
3197     *errorcodeptr = ERR8;
3198     goto FAILED;
3199     }
3200    
3201 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3202    
3203 ph10 230 /* Remember \r or \n */
3204 ph10 231
3205     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3206    
3207 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3208     matching, we have to use an XCLASS with extra data items. Caseless
3209     matching for characters > 127 is available only if UCP support is
3210     available. */
3211    
3212     #ifdef SUPPORT_UTF8
3213     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3214     {
3215     class_utf8 = TRUE;
3216    
3217     /* With UCP support, we can find the other case equivalents of
3218     the relevant characters. There may be several ranges. Optimize how
3219     they fit with the basic range. */
3220    
3221     #ifdef SUPPORT_UCP
3222     if ((options & PCRE_CASELESS) != 0)
3223     {
3224 nigel 93 unsigned int occ, ocd;
3225     unsigned int cc = c;
3226     unsigned int origd = d;
3227 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3228     {
3229 ph10 180 if (occ >= (unsigned int)c &&
3230     ocd <= (unsigned int)d)
3231 ph10 176 continue; /* Skip embedded ranges */
3232 nigel 77
3233 ph10 180 if (occ < (unsigned int)c &&
3234 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3235 nigel 77 { /* if there is overlap, */
3236     c = occ; /* noting that if occ < c */
3237     continue; /* we can't have ocd > d */
3238     } /* because a subrange is */
3239 ph10 180 if (ocd > (unsigned int)d &&
3240 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3241 nigel 77 { /* the basic range. */
3242     d = ocd;
3243     continue;
3244     }
3245    
3246     if (occ == ocd)
3247     {
3248     *class_utf8data++ = XCL_SINGLE;
3249     }
3250     else
3251     {
3252     *class_utf8data++ = XCL_RANGE;
3253     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3254     }
3255     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3256     }
3257     }
3258     #endif /* SUPPORT_UCP */
3259    
3260     /* Now record the original range, possibly modified for UCP caseless
3261     overlapping ranges. */
3262    
3263     *class_utf8data++ = XCL_RANGE;
3264     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3265     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3266    
3267     /* With UCP support, we are done. Without UCP support, there is no
3268     caseless matching for UTF-8 characters > 127; we can use the bit map
3269     for the smaller ones. */
3270    
3271     #ifdef SUPPORT_UCP
3272     continue; /* With next character in the class */
3273     #else
3274     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3275    
3276     /* Adjust upper limit and fall through to set up the map */
3277    
3278     d = 127;
3279    
3280     #endif /* SUPPORT_UCP */
3281     }
3282     #endif /* SUPPORT_UTF8 */
3283    
3284     /* We use the bit map for all cases when not in UTF-8 mode; else
3285     ranges that lie entirely within 0-127 when there is UCP support; else
3286     for partial ranges without UCP support. */
3287    
3288 nigel 93 class_charcount += d - c + 1;
3289     class_lastchar = d;
3290    
3291     /* We can save a bit of time by skipping this in the pre-compile. */
3292    
3293     if (lengthptr == NULL) for (; c <= d; c++)
3294 nigel 77 {
3295     classbits[c/8] |= (1 << (c&7));
3296     if ((options & PCRE_CASELESS) != 0)
3297     {
3298     int uc = cd->fcc[c]; /* flip case */
3299     classbits[uc/8] |= (1 << (uc&7));
3300     }
3301     }
3302    
3303     continue; /* Go get the next char in the class */
3304     }
3305    
3306     /* Handle a lone single character - we can get here for a normal
3307     non-escape char, or after \ that introduces a single character or for an
3308     apparent range that isn't. */
3309    
3310     LONE_SINGLE_CHARACTER:
3311 ph10 231
3312 nigel 77 /* Handle a character that cannot go in the bit map */
3313    
3314     #ifdef SUPPORT_UTF8
3315     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3316     {
3317     class_utf8 = TRUE;
3318     *class_utf8data++ = XCL_SINGLE;
3319     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3320    
3321     #ifdef SUPPORT_UCP
3322     if ((options & PCRE_CASELESS) != 0)
3323     {
3324 nigel 93 unsigned int othercase;
3325     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3326 nigel 77 {
3327     *class_utf8data++ = XCL_SINGLE;
3328     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3329     }
3330     }
3331     #endif /* SUPPORT_UCP */
3332    
3333     }
3334     else
3335     #endif /* SUPPORT_UTF8 */
3336    
3337     /* Handle a single-byte character */
3338     {
3339     classbits[c/8] |= (1 << (c&7));
3340     if ((options & PCRE_CASELESS) != 0)
3341     {
3342     c = cd->fcc[c]; /* flip case */
3343     classbits[c/8] |= (1 << (c&7));
3344     }
3345     class_charcount++;
3346     class_lastchar = c;
3347     }
3348     }
3349    
3350 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3351 nigel 77
3352 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3353 nigel 77
3354 nigel 93 if (c == 0) /* Missing terminating ']' */
3355     {
3356     *errorcodeptr = ERR6;
3357     goto FAILED;
3358     }
3359 ph10 231
3360    
3361 ph10 230 /* This code has been disabled because it would mean that \s counts as
3362     an explicit \r or \n reference, and that's not really what is wanted. Now
3363     we set the flag only if there is a literal "\r" or "\n" in the class. */
3364 ph10 227
3365 ph10 230 #if 0
3366 ph10 226 /* Remember whether \r or \n are in this class */
3367 ph10 227
3368 ph10 226 if (negate_class)
3369     {
3370 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3371 ph10 226 }
3372     else
3373     {
3374 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3375 ph10 227 }
3376 ph10 230 #endif
3377 ph10 227
3378 ph10 231
3379 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3380 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3381     use of \p or \P, in other words, no use of any XCLASS features, we can
3382     optimize.
3383    
3384 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3385     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3386     operate on single-bytes only. This is an historical hangover. Maybe one day
3387     we can tidy these opcodes to handle multi-byte characters.
3388 nigel 77
3389     The optimization throws away the bit map. We turn the item into a
3390     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3391     that OP_NOT does not support multibyte characters. In the positive case, it
3392     can cause firstbyte to be set. Otherwise, there can be no first char if
3393     this item is first, whatever repeat count may follow. In the case of
3394     reqbyte, save the previous value for reinstating. */
3395    
3396     #ifdef SUPPORT_UTF8
3397 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3398 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3399 nigel 77 #else
3400     if (class_charcount == 1)
3401     #endif
3402     {
3403     zeroreqbyte = reqbyte;
3404    
3405     /* The OP_NOT opcode works on one-byte characters only. */
3406    
3407     if (negate_class)
3408     {
3409     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3410     zerofirstbyte = firstbyte;
3411     *code++ = OP_NOT;
3412     *code++ = class_lastchar;
3413     break;
3414     }
3415    
3416     /* For a single, positive character, get the value into mcbuffer, and
3417     then we can handle this with the normal one-character code. */
3418    
3419     #ifdef SUPPORT_UTF8
3420     if (utf8 && class_lastchar > 127)
3421     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3422     else
3423     #endif
3424     {
3425     mcbuffer[0] = class_lastchar;
3426     mclength = 1;
3427     }
3428     goto ONE_CHAR;
3429     } /* End of 1-char optimization */
3430    
3431     /* The general case - not the one-char optimization. If this is the first
3432     thing in the branch, there can be no first char setting, whatever the
3433     repeat count. Any reqbyte setting must remain unchanged after any kind of
3434     repeat. */
3435    
3436     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3437     zerofirstbyte = firstbyte;
3438     zeroreqbyte = reqbyte;
3439    
3440     /* If there are characters with values > 255, we have to compile an
3441 ph10 286 extended class, with its own opcode, unless there was a negated special
3442     such as \S in the class, because in that case all characters > 255 are in
3443     the class, so any that were explicitly given as well can be ignored. If
3444 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3445     characters < 256, we can omit the bitmap in the actual compiled code. */
3446 nigel 77
3447     #ifdef SUPPORT_UTF8
3448 ph10 264 if (class_utf8 && !should_flip_negation)
3449 nigel 77 {
3450     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3451     *code++ = OP_XCLASS;
3452     code += LINK_SIZE;
3453     *code = negate_class? XCL_NOT : 0;
3454    
3455 nigel 93 /* If the map is required, move up the extra data to make room for it;
3456     otherwise just move the code pointer to the end of the extra data. */
3457 nigel 77
3458     if (class_charcount > 0)
3459     {
3460     *code++ |= XCL_MAP;
3461 nigel 93 memmove(code + 32, code, class_utf8data - code);
3462 nigel 77 memcpy(code, classbits, 32);
3463 nigel 93 code = class_utf8data + 32;
3464 nigel 77 }
3465 nigel 93 else code = class_utf8data;
3466 nigel 77
3467     /* Now fill in the complete length of the item */
3468    
3469     PUT(previous, 1, code - previous);
3470     break; /* End of class handling */
3471     }
3472     #endif
3473    
3474 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3475     OP_NCLASS, depending on whether the whole class was negated and whether
3476     there were negative specials such as \S in the class. Then copy the 32-byte
3477 ph10 264 map into the code vector, negating it if necessary. */
3478 ph10 286
3479 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3480 nigel 77 if (negate_class)
3481     {
3482 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3483     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3484 nigel 77 }
3485     else
3486     {
3487     memcpy(code, classbits, 32);
3488     }
3489     code += 32;
3490     break;
3491    
3492 nigel 93
3493     /* ===================================================================*/
3494 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3495     has been tested above. */
3496    
3497     case '{':
3498     if (!is_quantifier) goto NORMAL_CHAR;
3499     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3500     if (*errorcodeptr != 0) goto FAILED;
3501     goto REPEAT;
3502    
3503     case '*':
3504     repeat_min = 0;
3505     repeat_max = -1;
3506     goto REPEAT;
3507    
3508     case '+':
3509     repeat_min = 1;
3510     repeat_max = -1;
3511     goto REPEAT;
3512    
3513     case '?':
3514     repeat_min = 0;
3515     repeat_max = 1;
3516    
3517     REPEAT:
3518     if (previous == NULL)
3519     {
3520     *errorcodeptr = ERR9;
3521     goto FAILED;
3522     }
3523    
3524     if (repeat_min == 0)
3525     {
3526     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3527     reqbyte = zeroreqbyte; /* Ditto */
3528     }
3529    
3530     /* Remember whether this is a variable length repeat */
3531    
3532     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3533    
3534     op_type = 0; /* Default single-char op codes */
3535     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3536    
3537     /* Save start of previous item, in case we have to move it up to make space
3538     for an inserted OP_ONCE for the additional '+' extension. */
3539    
3540     tempcode = previous;
3541    
3542     /* If the next character is '+', we have a possessive quantifier. This
3543     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3544     If the next character is '?' this is a minimizing repeat, by default,
3545     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3546     repeat type to the non-default. */
3547    
3548     if (ptr[1] == '+')
3549     {
3550     repeat_type = 0; /* Force greedy */
3551     possessive_quantifier = TRUE;
3552     ptr++;
3553     }
3554     else if (ptr[1] == '?')
3555     {
3556     repeat_type = greedy_non_default;
3557     ptr++;
3558     }
3559     else repeat_type = greedy_default;
3560    
3561     /* If previous was a character match, abolish the item and generate a
3562     repeat item instead. If a char item has a minimum of more than one, ensure
3563     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3564     the first thing in a branch because the x will have gone into firstbyte
3565     instead. */
3566    
3567     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3568     {
3569     /* Deal with UTF-8 characters that take up more than one byte. It's
3570     easier to write this out separately than try to macrify it. Use c to
3571     hold the length of the character in bytes, plus 0x80 to flag that it's a
3572     length rather than a small character. */
3573    
3574     #ifdef SUPPORT_UTF8
3575     if (utf8 && (code[-1] & 0x80) != 0)
3576     {
3577     uschar *lastchar = code - 1;
3578     while((*lastchar & 0xc0) == 0x80) lastchar--;
3579     c = code - lastchar; /* Length of UTF-8 character */
3580     memcpy(utf8_char, lastchar, c); /* Save the char */
3581     c |= 0x80; /* Flag c as a length */
3582     }
3583     else
3584     #endif
3585    
3586     /* Handle the case of a single byte - either with no UTF8 support, or
3587     with UTF-8 disabled, or for a UTF-8 character < 128. */
3588    
3589     {
3590     c = code[-1];
3591     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3592     }
3593    
3594 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3595     the line is something that cannot possibly match this character. If so,
3596     automatically possessifying this item gains some performance in the case
3597     where the match fails. */
3598    
3599     if (!possessive_quantifier &&
3600     repeat_max < 0 &&
3601     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3602     options, cd))
3603     {
3604     repeat_type = 0; /* Force greedy */
3605     possessive_quantifier = TRUE;
3606     }
3607    
3608 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3609     }
3610    
3611     /* If previous was a single negated character ([^a] or similar), we use
3612     one of the special opcodes, replacing it. The code is shared with single-
3613     character repeats by setting op_type to add a suitable offset into
3614 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3615     currently used only for single-byte chars. */
3616 nigel 77
3617     else if (*previous == OP_NOT)
3618     {
3619     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3620     c = previous[1];
3621 nigel 93 if (!possessive_quantifier &&
3622     repeat_max < 0 &&
3623     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3624     {
3625     repeat_type = 0; /* Force greedy */
3626     possessive_quantifier = TRUE;
3627     }
3628 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3629     }
3630    
3631     /* If previous was a character type match (\d or similar), abolish it and
3632     create a suitable repeat item. The code is shared with single-character
3633     repeats by setting op_type to add a suitable offset into repeat_type. Note
3634     that the Unicode property types will be present only when SUPPORT_UCP is
3635     defined, but we don't wrap the little bits of code here because it just
3636     makes it horribly messy. */
3637    
3638     else if (*previous < OP_EODN)
3639     {
3640     uschar *oldcode;
3641 nigel 87 int prop_type, prop_value;
3642 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3643     c = *previous;
3644    
3645 nigel 93 if (!possessive_quantifier &&
3646     repeat_max < 0 &&
3647     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3648     {
3649     repeat_type = 0; /* Force greedy */
3650     possessive_quantifier = TRUE;
3651     }
3652    
3653 nigel 77 OUTPUT_SINGLE_REPEAT:
3654 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3655     {
3656     prop_type = previous[1];
3657     prop_value = previous[2];
3658     }
3659     else prop_type = prop_value = -1;
3660 nigel 77
3661     oldcode = code;
3662     code = previous; /* Usually overwrite previous item */
3663    
3664     /* If the maximum is zero then the minimum must also be zero; Perl allows
3665     this case, so we do too - by simply omitting the item altogether. */
3666    
3667     if (repeat_max == 0) goto END_REPEAT;
3668    
3669     /* All real repeats make it impossible to handle partial matching (maybe
3670     one day we will be able to remove this restriction). */
3671    
3672 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3673 nigel 77
3674     /* Combine the op_type with the repeat_type */
3675    
3676     repeat_type += op_type;
3677    
3678     /* A minimum of zero is handled either as the special case * or ?, or as
3679     an UPTO, with the maximum given. */
3680    
3681     if (repeat_min == 0)
3682     {
3683     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3684     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3685     else
3686     {
3687     *code++ = OP_UPTO + repeat_type;
3688     PUT2INC(code, 0, repeat_max);
3689     }
3690     }
3691    
3692     /* A repeat minimum of 1 is optimized into some special cases. If the
3693 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3694 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3695     one less than the maximum. */
3696    
3697     else if (repeat_min == 1)
3698     {
3699     if (repeat_max == -1)
3700     *code++ = OP_PLUS + repeat_type;
3701     else
3702     {
3703     code = oldcode; /* leave previous item in place */
3704     if (repeat_max == 1) goto END_REPEAT;
3705     *code++ = OP_UPTO + repeat_type;
3706     PUT2INC(code, 0, repeat_max - 1);
3707     }
3708     }
3709    
3710     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3711     handled as an EXACT followed by an UPTO. */
3712    
3713     else
3714     {
3715     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3716     PUT2INC(code, 0, repeat_min);
3717    
3718     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3719     we have to insert the character for the previous code. For a repeated
3720 nigel 87 Unicode property match, there are two extra bytes that define the
3721 nigel 77 required property. In UTF-8 mode, long characters have their length in
3722     c, with the 0x80 bit as a flag. */
3723    
3724     if (repeat_max < 0)
3725     {
3726     #ifdef SUPPORT_UTF8
3727     if (utf8 && c >= 128)
3728     {
3729     memcpy(code, utf8_char, c & 7);
3730     code += c & 7;
3731     }
3732     else
3733     #endif
3734     {
3735     *code++ = c;
3736 nigel 87 if (prop_type >= 0)
3737     {
3738     *code++ = prop_type;
3739     *code++ = prop_value;
3740     }
3741 nigel 77 }
3742     *code++ = OP_STAR + repeat_type;
3743     }
3744    
3745     /* Else insert an UPTO if the max is greater than the min, again
3746 nigel 93 preceded by the character, for the previously inserted code. If the
3747     UPTO is just for 1 instance, we can use QUERY instead. */
3748 nigel 77
3749     else if (repeat_max != repeat_min)
3750     {
3751     #ifdef SUPPORT_UTF8
3752     if (utf8 && c >= 128)
3753     {
3754     memcpy(code, utf8_char, c & 7);
3755     code += c & 7;
3756     }
3757     else
3758     #endif
3759     *code++ = c;
3760 nigel 87 if (prop_type >= 0)
3761     {
3762     *code++ = prop_type;
3763     *code++ = prop_value;
3764     }
3765 nigel 77 repeat_max -= repeat_min;
3766 nigel 93
3767     if (repeat_max == 1)
3768     {
3769     *code++ = OP_QUERY + repeat_type;
3770     }
3771     else
3772     {
3773     *code++ = OP_UPTO + repeat_type;
3774     PUT2INC(code, 0, repeat_max);
3775     }
3776 nigel 77 }
3777     }
3778    
3779     /* The character or character type itself comes last in all cases. */
3780    
3781     #ifdef SUPPORT_UTF8
3782     if (utf8 && c >= 128)
3783     {
3784     memcpy(code, utf8_char, c & 7);
3785     code += c & 7;
3786     }
3787     else
3788     #endif
3789     *code++ = c;
3790    
3791 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3792     define the required property. */
3793 nigel 77
3794     #ifdef SUPPORT_UCP
3795 nigel 87 if (prop_type >= 0)
3796     {
3797     *code++ = prop_type;
3798     *code++ = prop_value;
3799     }
3800 nigel 77 #endif
3801     }
3802    
3803     /* If previous was a character class or a back reference, we put the repeat
3804     stuff after it, but just skip the item if the repeat was {0,0}. */
3805    
3806     else if (*previous == OP_CLASS ||
3807     *previous == OP_NCLASS ||
3808     #ifdef SUPPORT_UTF8
3809     *previous == OP_XCLASS ||
3810     #endif
3811     *previous == OP_REF)
3812     {
3813     if (repeat_max == 0)
3814     {
3815     code = previous;
3816     goto END_REPEAT;
3817     }
3818    
3819     /* All real repeats make it impossible to handle partial matching (maybe
3820     one day we will be able to remove this restriction). */
3821    
3822 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3823 nigel 77
3824     if (repeat_min == 0 && repeat_max == -1)
3825     *code++ = OP_CRSTAR + repeat_type;
3826     else if (repeat_min == 1 && repeat_max == -1)
3827     *code++ = OP_CRPLUS + repeat_type;
3828     else if (repeat_min == 0 && repeat_max == 1)
3829     *code++ = OP_CRQUERY + repeat_type;
3830     else
3831     {
3832     *code++ = OP_CRRANGE + repeat_type;
3833     PUT2INC(code, 0, repeat_min);
3834     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3835     PUT2INC(code, 0, repeat_max);
3836     }
3837     }
3838    
3839     /* If previous was a bracket group, we may have to replicate it in certain
3840     cases. */
3841    
3842 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3843     *previous == OP_ONCE || *previous == OP_COND)
3844 nigel 77 {
3845     register int i;
3846     int ketoffset = 0;
3847     int len = code - previous;
3848     uschar *bralink = NULL;
3849    
3850 nigel 93 /* Repeating a DEFINE group is pointless */
3851    
3852     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3853     {
3854     *errorcodeptr = ERR55;
3855     goto FAILED;
3856     }
3857    
3858 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3859     by scanning through from the start, and compute the offset back to it
3860     from the current code pointer. There may be an OP_OPT setting following
3861     the final KET, so we can't find the end just by going back from the code
3862     pointer. */
3863    
3864     if (repeat_max == -1)
3865     {
3866     register uschar *ket = previous;
3867     do ket += GET(ket, 1); while (*ket != OP_KET);
3868     ketoffset = code - ket;
3869     }
3870    
3871     /* The case of a zero minimum is special because of the need to stick
3872     OP_BRAZERO in front of it, and because the group appears once in the
3873     data, whereas in other cases it appears the minimum number of times. For
3874     this reason, it is simplest to treat this case separately, as otherwise
3875     the code gets far too messy. There are several special subcases when the
3876     minimum is zero. */
3877    
3878     if (repeat_min == 0)
3879     {
3880 ph10 335 /* If the maximum is also zero, we used to just omit the group from the
3881     output altogether, like this:
3882 nigel 77
3883 ph10 335 ** if (repeat_max == 0)
3884     ** {
3885     ** code = previous;
3886     ** goto END_REPEAT;
3887     ** }
3888    
3889     However, that fails when a group is referenced as a subroutine from
3890     elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3891     so that it is skipped on execution. As we don't have a list of which
3892     groups are referenced, we cannot do this selectively.
3893 nigel 77
3894 ph10 335 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3895     and do no more at this point. However, we do need to adjust any
3896     OP_RECURSE calls inside the group that refer to the group itself or any
3897     internal or forward referenced group, because the offset is from the
3898     start of the whole regex. Temporarily terminate the pattern while doing
3899     this. */
3900 nigel 77
3901 ph10 335 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3902 nigel 77 {
3903     *code = OP_END;
3904 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3905 nigel 77 memmove(previous+1, previous, len);
3906     code++;
3907 ph10 335 if (repeat_max == 0)
3908     {
3909     *previous++ = OP_SKIPZERO;
3910     goto END_REPEAT;
3911     }
3912 nigel 77 *previous++ = OP_BRAZERO + repeat_type;
3913     }
3914    
3915     /* If the maximum is greater than 1 and limited, we have to replicate
3916     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3917     The first one has to be handled carefully because it's the original
3918     copy, which has to be moved up. The remainder can be handled by code
3919     that is common with the non-zero minimum case below. We have to
3920     adjust the value of repeat_max, since one less copy is required. Once
3921     again, we may have to adjust any OP_RECURSE calls inside the group. */
3922    
3923     else
3924     {
3925     int offset;
3926     *code = OP_END;
3927 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3928 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3929     code += 2 + LINK_SIZE;
3930     *previous++ = OP_BRAZERO + repeat_type;
3931     *previous++ = OP_BRA;
3932    
3933     /* We chain together the bracket offset fields that have to be
3934     filled in later when the ends of the brackets are reached. */
3935    
3936     offset = (bralink == NULL)? 0 : previous - bralink;
3937     bralink = previous;
3938     PUTINC(previous, 0, offset);
3939     }
3940    
3941     repeat_max--;
3942     }
3943    
3944     /* If the minimum is greater than zero, replicate the group as many
3945     times as necessary, and adjust the maximum to the number of subsequent
3946     copies that we need. If we set a first char from the group, and didn't
3947 nigel 93 set a required char, copy the latter from the former. If there are any
3948     forward reference subroutine calls in the group, there will be entries on
3949     the workspace list; replicate these with an appropriate increment. */
3950 nigel 77
3951     else
3952     {
3953     if (repeat_min > 1)
3954     {
3955 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3956 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3957     potential integer overflow. */
3958 nigel 93
3959     if (lengthptr != NULL)
3960 ph10 202 {
3961     int delta = (repeat_min - 1)*length_prevgroup;
3962     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3963     (double)INT_MAX ||
3964     OFLOW_MAX - *lengthptr < delta)
3965     {
3966     *errorcodeptr = ERR20;
3967     goto FAILED;
3968     }
3969     *lengthptr += delta;
3970     }
3971 nigel 93
3972     /* This is compiling for real */
3973    
3974     else
3975 nigel 77 {
3976 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3977     for (i = 1; i < repeat_min; i++)
3978     {
3979     uschar *hc;
3980     uschar *this_hwm = cd->hwm;
3981     memcpy(code, previous, len);
3982     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3983     {
3984     PUT(cd->hwm, 0, GET(hc, 0) + len);
3985     cd->hwm += LINK_SIZE;
3986     }
3987     save_hwm = this_hwm;
3988     code += len;
3989     }
3990 nigel 77 }
3991     }
3992 nigel 93
3993 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3994     }
3995    
3996     /* This code is common to both the zero and non-zero minimum cases. If
3997     the maximum is limited, it replicates the group in a nested fashion,
3998     remembering the bracket starts on a stack. In the case of a zero minimum,
3999     the first one was set up above. In all cases the repeat_max now specifies
4000 nigel 93 the number of additional copies needed. Again, we must remember to
4001     replicate entries on the forward reference list. */
4002 nigel 77
4003     if (repeat_max >= 0)
4004     {
4005 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
4006     just adjust the length as if we had. For each repetition we must add 1
4007     to the length for BRAZERO and for all but the last repetition we must
4008 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4009     paranoid checks to avoid integer overflow. */
4010 nigel 93
4011     if (lengthptr != NULL && repeat_max > 0)
4012 ph10 202 {
4013     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4014     2 - 2*LINK_SIZE; /* Last one doesn't nest */
4015     if ((double)repeat_max *
4016     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4017     > (double)INT_MAX ||
4018     OFLOW_MAX - *lengthptr < delta)
4019     {
4020     *errorcodeptr = ERR20;
4021     goto FAILED;
4022     }
4023     *lengthptr += delta;
4024     }
4025 nigel 93
4026     /* This is compiling for real */
4027    
4028     else for (i = repeat_max - 1; i >= 0; i--)
4029 nigel 77 {
4030 nigel 93 uschar *hc;
4031     uschar *this_hwm = cd->hwm;
4032    
4033 nigel 77 *code++ = OP_BRAZERO + repeat_type;
4034    
4035     /* All but the final copy start a new nesting, maintaining the
4036     chain of brackets outstanding. */
4037    
4038     if (i != 0)
4039     {
4040     int offset;
4041     *code++ = OP_BRA;
4042     offset = (bralink == NULL)? 0 : code - bralink;
4043     bralink = code;
4044     PUTINC(code, 0, offset);
4045     }
4046    
4047     memcpy(code, previous, len);
4048 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4049     {
4050     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4051     cd->hwm += LINK_SIZE;
4052     }
4053     save_hwm = this_hwm;
4054 nigel 77 code += len;
4055     }
4056    
4057     /* Now chain through the pending brackets, and fill in their length
4058     fields (which are holding the chain links pro tem). */
4059    
4060     while (bralink != NULL)
4061     {
4062     int oldlinkoffset;
4063     int offset = code - bralink + 1;
4064     uschar *bra = code - offset;
4065     oldlinkoffset = GET(bra, 1);
4066     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4067     *code++ = OP_KET;
4068     PUTINC(code, 0, offset);
4069     PUT(bra, 1, offset);
4070     }
4071     }
4072    
4073     /* If the maximum is unlimited, set a repeater in the final copy. We
4074     can't just offset backwards from the current code point, because we
4075     don't know if there's been an options resetting after the ket. The
4076 nigel 93 correct offset was computed above.
4077 nigel 77
4078 nigel 93 Then, when we are doing the actual compile phase, check to see whether
4079     this group is a non-atomic one that could match an empty string. If so,
4080     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4081     that runtime checking can be done. [This check is also applied to
4082     atomic groups at runtime, but in a different way.] */
4083    
4084     else
4085     {
4086 &