/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 336 - (hide annotations) (download)
Sat Apr 12 15:59:03 2008 UTC (5 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 201134 byte(s)
Added PCRE_JAVASCRIPT_COMPAT option.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
/* These three names are defined before pcre_internal.h is included below.
NOTE(review): presumably generic macros in that header expand in terms of
NLBLOCK, PSSTART and PSEND - confirm against pcre_internal.h. */
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. Bit number b of the map
a lives in byte b/8, at bit position b%8 within that byte. Both arguments and
the whole expansion are parenthesized so that expression arguments (for example
SETBIT(map, c + 1)) select the intended byte and bit. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
/* Upper bound for the compiled-length check: INT_MAX less 20 of headroom. */
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
/* Size of the stack workspace used by both compile phases. NOTE(review): the
unit is presumably bytes (the text above counts forward-reference entries in
LINK_SIZE bytes) - confirm where the workspace array is declared. */
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
/* ASCII variant. check_escape() indexes this table with (c - '0') after
checking that c lies in the range '0' to 'z', so the first entry belongs to the
character '0' and the last one to 'z'. Each row below covers eight consecutive
characters, as shown in the trailing comment on the row. */
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
/* EBCDIC variant. check_escape() indexes this table with (c - 0x48); the
comment at the start of each row gives the (hexadecimal) character code of the
row's first entry. */
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
/* One entry of the verb table: the length of a verb name and the opcode that
goes with it. The name itself is not stored here; it is held in verbnames. */
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
/* The verb names, each terminated by a NUL byte (the last one by the string
literal's own terminator). The order must match the order of the entries in
verbs below. */
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
/* Parallel to verbnames: entry i gives the length of the i-th name and its
opcode. Note that both F and FAIL map to OP_FAIL. */
161 ph10 327 static const verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
/* Number of entries in verbs, derived from the size of the array. */
171 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172 ph10 210
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
/* The class names, each terminated by a NUL byte (the last one by the string
literal's own terminator). */
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
/* Length of each name above, in the same order, followed by the terminating
zero entry. */
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
/* Three ints per class; the rows are in the same order as the names in
posix_names (see the trailing comment on each row). */
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() allows the argument to be macro-expanded first
and then stringifies the result, which is what is needed to embed the value of
a numeric macro in a message. */

#define STRING(text) #text
#define XSTRING(macro) STRING(macro)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
/* All the messages, concatenated. Every message except the last ends with an
explicit NUL byte; the last is ended by the string literal's own terminator.
The position of a message in this list is its error number (find_error_text()
counts NUL bytes to locate it), so messages must never be reordered or removed.
The numbered marker comments give the error number of the message that follows
them. */
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299     "a numbered reference must not be zero\0"
300 ph10 240 "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 336 "digit expected after (?+\0"
306     "] is an invalid data character in JavaScript compatibility mode";
307 nigel 77
308    
309     /* Table to identify digits and hex digits. This is used when compiling
310     patterns. Note that the tables in chartables are dependent on the locale, and
311     may mark arbitrary characters as digits - but the PCRE compiling code expects
312     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
313     a private table here. It costs 256 bytes, but it is a lot faster than doing
314     character value tests (at least in some simple cases I timed), and in some
315     applications one wants PCRE to compile efficiently as well as match
316     efficiently.
317    
318     For convenience, we use the same bit definitions as in chartables:
319    
320     0x04 decimal digit
321     0x08 hexadecimal digit
322    
323     Then we can use ctype_digit and ctype_xdigit in the code. */
324    
325 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
/* ASCII variant: 256 entries, indexed by character value. An entry is 0x0c for
the decimal digits (both the 0x04 and 0x08 bits), 0x08 for the letters a-f and
A-F, and zero for everything else. */
326 nigel 77 static const unsigned char digitab[] =
327     {
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
333     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
334     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
335     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
336     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
340     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
359     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360    
361 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC variant: 256 entries, indexed by character value, using the same bit
values as the ASCII table (0x0c for decimal digits, 0x08 for the letters a-f
and A-F). The second column of the row comments gives the hexadecimal code of
the first entry of every other row. */
362 nigel 77 static const unsigned char digitab[] =
363     {
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
375 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
376 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
380     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
388     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
394     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
395     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396    
/* EBCDIC-only table of 256 entries, indexed by character value. check_escape()
uses it to decide whether the character after a backslash is alphanumeric: an
entry with none of the bits in 0x0E set is treated as non-alphanumeric.
NOTE(review): the remaining bit values presumably follow the ctype_xxx
definitions used for chartables - confirm against pcre_internal.h. */
397     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
398     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
399     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
400     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
402     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
406     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
407     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
408     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
409 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
410 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
411     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
414     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
415     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
416     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
417     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
418     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
419     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
420     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
421     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
422     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
423     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
424     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
425     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
426     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
427     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
428     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
429     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
430     #endif
431    
432    
433     /* Definition to allow mutual recursion */
434    
/* Forward declaration only; parameter names are omitted here.
NOTE(review): the meaning of each argument is presumably documented at the
definition of compile_regex() later in this file - confirm there. */
435     static BOOL
436 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
437 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
438 nigel 77
439    
440    
441     /*************************************************
442 ph10 240 * Find an error text *
443     *************************************************/
444    
445 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
446     some of the text is of unknown length, we can't use a table of offsets.
447     Instead, just count through the strings. This is not a performance issue
448 ph10 240 because it happens only when there has been a compilation error.
449    
450     Argument: the error number
451     Returns: pointer to the error string
452     */
453    
454     static const char *
455     find_error_text(int n)
456     {
457     const char *s = error_texts;
458 ph10 243 for (; n > 0; n--) while (*s++ != 0);
459 ph10 240 return s;
460     }
461    
462    
463     /*************************************************
464 nigel 77 * Handle escapes *
465     *************************************************/
466    
467     /* This function is called when a \ has been encountered. It either returns a
468     positive value for a simple escape such as \n, or a negative value which
469 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
470     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
471     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
472     ptr is pointing at the \. On exit, it is on the final character of the escape
473     sequence.
474 nigel 77
475     Arguments:
476     ptrptr points to the pattern position pointer
477     errorcodeptr points to the errorcode variable
478     bracount number of previous extracting brackets
479     options the options bits
480     isclass TRUE if inside a character class
481    
482     Returns: zero or positive => a data character
483     negative => a special escape sequence
484 ph10 213 on error, errorcodeptr is set
485 nigel 77 */
486    
487     static int
488     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
489     int options, BOOL isclass)
490     {
491 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
492     const uschar *ptr = *ptrptr + 1;
493 nigel 77 int c, i;
494    
495 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
496     ptr--; /* Set pointer back to the last byte */
497    
498 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
499    
500     if (c == 0) *errorcodeptr = ERR1;
501    
502 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
503     in a table. A non-zero result is something that can be returned immediately.
504 nigel 77 Otherwise further processing may be required. */
505    
506 ph10 97 #ifndef EBCDIC /* ASCII coding */
507 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
508 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
509    
510 ph10 97 #else /* EBCDIC coding */
511 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
512 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
513     #endif
514    
515     /* Escapes that need further processing, or are illegal. */
516    
517     else
518     {
519     const uschar *oldptr;
520 nigel 93 BOOL braced, negated;
521    
522 nigel 77 switch (c)
523     {
524     /* A number of Perl escapes are not handled by PCRE. We give an explicit
525     error. */
526    
527     case 'l':
528     case 'L':
529     case 'N':
530     case 'u':
531     case 'U':
532     *errorcodeptr = ERR37;
533     break;
534    
535 ph10 333 /* \g must be followed by one of a number of specific things:
536    
537     (1) A number, either plain or braced. If positive, it is an absolute
538     backreference. If negative, it is a relative backreference. This is a Perl
539     5.10 feature.
540    
541     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
542     is part of Perl's movement towards a unified syntax for back references. As
543     this is synonymous with \k{name}, we fudge it up by pretending it really
544     was \k.
545    
546     (3) For Oniguruma compatibility we also support \g followed by a name or a
547     number either in angle brackets or in single quotes. However, these are
548     (possibly recursive) subroutine calls, _not_ backreferences. Just return
549     the -ESC_g code (cf \k). */
550 nigel 93
551     case 'g':
552 ph10 333 if (ptr[1] == '<' || ptr[1] == '\'')
553     {
554     c = -ESC_g;
555     break;
556     }
557    
558     /* Handle the Perl-compatible cases */
559    
560 nigel 93 if (ptr[1] == '{')
561     {
562 ph10 171 const uschar *p;
563     for (p = ptr+2; *p != 0 && *p != '}'; p++)
564     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
565 ph10 172 if (*p != 0 && *p != '}')
566 ph10 171 {
567     c = -ESC_k;
568     break;
569 ph10 172 }
570 nigel 93 braced = TRUE;
571     ptr++;
572     }
573     else braced = FALSE;
574    
575     if (ptr[1] == '-')
576     {
577     negated = TRUE;
578     ptr++;
579     }
580     else negated = FALSE;
581    
582     c = 0;
583     while ((digitab[ptr[1]] & ctype_digit) != 0)
584     c = c * 10 + *(++ptr) - '0';
585 ph10 220
586 ph10 333 if (c < 0) /* Integer overflow */
587 ph10 213 {
588     *errorcodeptr = ERR61;
589     break;
590 ph10 220 }
591 ph10 333
592     if (braced && *(++ptr) != '}')
593 nigel 93 {
594     *errorcodeptr = ERR57;
595 ph10 213 break;
596 nigel 93 }
597 ph10 333
598     if (c == 0)
599     {
600     *errorcodeptr = ERR58;
601     break;
602     }
603 nigel 93
604     if (negated)
605     {
606     if (c > bracount)
607     {
608     *errorcodeptr = ERR15;
609 ph10 213 break;
610 nigel 93 }
611     c = bracount - (c - 1);
612     }
613    
614     c = -(ESC_REF + c);
615     break;
616    
617 nigel 77 /* The handling of escape sequences consisting of a string of digits
618     starting with one that is not zero is not straightforward. By experiment,
619     the way Perl works seems to be as follows:
620    
621     Outside a character class, the digits are read as a decimal number. If the
622     number is less than 10, or if there are that many previous extracting
623     left brackets, then it is a back reference. Otherwise, up to three octal
624     digits are read to form an escaped byte. Thus \123 is likely to be octal
625     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
626     value is greater than 377, the least significant 8 bits are taken. Inside a
627     character class, \ followed by a digit is always an octal number. */
628    
629     case '1': case '2': case '3': case '4': case '5':
630     case '6': case '7': case '8': case '9':
631    
632     if (!isclass)
633     {
634     oldptr = ptr;
635     c -= '0';
636     while ((digitab[ptr[1]] & ctype_digit) != 0)
637     c = c * 10 + *(++ptr) - '0';
638 ph10 333 if (c < 0) /* Integer overflow */
639 ph10 213 {
640     *errorcodeptr = ERR61;
641 ph10 220 break;
642     }
643 nigel 77 if (c < 10 || c <= bracount)
644     {
645     c = -(ESC_REF + c);
646     break;
647     }
648     ptr = oldptr; /* Put the pointer back and fall through */
649     }
650    
651     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
652     generates a binary zero byte and treats the digit as a following literal.
653     Thus we have to pull back the pointer by one. */
654    
655     if ((c = *ptr) >= '8')
656     {
657     ptr--;
658     c = 0;
659     break;
660     }
661    
662     /* \0 always starts an octal number, but we may drop through to here with a
663 nigel 91 larger first octal digit. The original code used just to take the least
664     significant 8 bits of octal numbers (I think this is what early Perls used
665     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
666     than 3 octal digits. */
667 nigel 77
668     case '0':
669     c -= '0';
670     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
671     c = c * 8 + *(++ptr) - '0';
672 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
673 nigel 77 break;
674    
675 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
676     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
677     treated as a data character. */
678 nigel 77
679     case 'x':
680 nigel 87 if (ptr[1] == '{')
681 nigel 77 {
682     const uschar *pt = ptr + 2;
683 nigel 87 int count = 0;
684    
685 nigel 77 c = 0;
686     while ((digitab[*pt] & ctype_xdigit) != 0)
687     {
688 nigel 87 register int cc = *pt++;
689     if (c == 0 && cc == '0') continue; /* Leading zeroes */
690 nigel 77 count++;
691 nigel 87
692 ph10 97 #ifndef EBCDIC /* ASCII coding */
693 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
694 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
695 ph10 97 #else /* EBCDIC coding */
696 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
697 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
698 nigel 77 #endif
699     }
700 nigel 87
701 nigel 77 if (*pt == '}')
702     {
703 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
704 nigel 77 ptr = pt;
705     break;
706     }
707 nigel 87
708 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
709     recognize this construct; fall through to the normal \x handling. */
710     }
711    
712 nigel 87 /* Read just a single-byte hex-defined char */
713 nigel 77
714     c = 0;
715     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
716     {
717     int cc; /* Some compilers don't like ++ */
718     cc = *(++ptr); /* in initializers */
719 ph10 97 #ifndef EBCDIC /* ASCII coding */
720 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
721     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
722 ph10 97 #else /* EBCDIC coding */
723 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
724     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
725     #endif
726     }
727     break;
728    
729 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
730     This coding is ASCII-specific, but then the whole concept of \cx is
731     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
732 nigel 77
733     case 'c':
734     c = *(++ptr);
735     if (c == 0)
736     {
737     *errorcodeptr = ERR2;
738 ph10 213 break;
739 nigel 77 }
740    
741 ph10 97 #ifndef EBCDIC /* ASCII coding */
742 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
743     c ^= 0x40;
744 ph10 97 #else /* EBCDIC coding */
745 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
746     c ^= 0xC0;
747     #endif
748     break;
749    
750     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
751 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
752     otherwise, for Perl compatibility, it is a literal. This code looks a bit
753     odd, but there used to be some cases other than the default, and there may
754     be again in future, so I haven't "optimized" it. */
755 nigel 77
756     default:
757     if ((options & PCRE_EXTRA) != 0) switch(c)
758     {
759     default:
760     *errorcodeptr = ERR3;
761     break;
762     }
763     break;
764     }
765     }
766    
767     *ptrptr = ptr;
768     return c;
769     }
770    
771    
772    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \p or \P and look it up in the table
of known Unicode property names. The name is either a single character, or a
sequence of characters enclosed in braces, optionally preceded (inside the
braces) by a circumflex that requests negation. On entry *ptrptr points at the
p or P; on exit it points at the last character that was consumed.

Arguments:
  ptrptr        points to the pattern position pointer
  negptr        points to a boolean that is set TRUE for negation else FALSE
  dptr          points to an int that receives the table entry's value
  errorcodeptr  points to the error code variable

Returns:        the table entry's type on success, or -1 on failure with
                *errorcodeptr set (ERR46 for a malformed sequence, ERR47 for
                a name that is not in the table)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

if (ch != '{')
  {
  /* Short form: the name is the single character after \p or \P. */

  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* Braced form: an optional ^ followed by the name and a closing brace. */

  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy at most sizeof(name)-1 characters. If the buffer fills up before a
  closing brace is seen, ch is not '}' and the sequence is rejected below. */

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }
  if (ch != '}') goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the property name table. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* The name was well formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself was malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
862    
863    
864    
865    
866     /*************************************************
867     * Check for counted repeat *
868     *************************************************/
869    
870     /* This function is called when a '{' is encountered in a place where it might
871     start a quantifier. It looks ahead to see if it really is a quantifier or not.
872     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
873     where the ddds are digits.
874    
875     Arguments:
876     p pointer to the first char after '{'
877    
878     Returns: TRUE or FALSE
879     */
880    
881     static BOOL
882     is_counted_repeat(const uschar *p)
883     {
884     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
885     while ((digitab[*p] & ctype_digit) != 0) p++;
886     if (*p == '}') return TRUE;
887    
888     if (*p++ != ',') return FALSE;
889     if (*p == '}') return TRUE;
890    
891     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
892     while ((digitab[*p] & ctype_digit) != 0) p++;
893    
894     return (*p == '}');
895     }
896    
897    
898    
899     /*************************************************
900     * Read repeat counts *
901     *************************************************/
902    
903     /* Read an item of the form {n,m} and return the values. This is called only
904     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
905     so the syntax is guaranteed to be correct, but we need to check the values.
906    
907     Arguments:
908     p pointer to first char after '{'
909     minp pointer to int for min
910     maxp pointer to int for max
911     returned as -1 if no max
912     errorcodeptr points to error code variable
913    
914     Returns: pointer to '}' on success;
915     current ptr on error, with errorcodeptr set non-zero
916     */
917    
918     static const uschar *
919     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
920     {
921     int min = 0;
922     int max = -1;
923    
924 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
925     an integer overflow. */
926    
927 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
928 nigel 81 if (min < 0 || min > 65535)
929     {
930     *errorcodeptr = ERR5;
931     return p;
932     }
933 nigel 77
934 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
935     Also, max must not be less than min. */
936    
937 nigel 77 if (*p == '}') max = min; else
938     {
939     if (*(++p) != '}')
940     {
941     max = 0;
942     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
943 nigel 81 if (max < 0 || max > 65535)
944     {
945     *errorcodeptr = ERR5;
946     return p;
947     }
948 nigel 77 if (max < min)
949     {
950     *errorcodeptr = ERR4;
951     return p;
952     }
953     }
954     }
955    
956 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
957     '}'. */
958 nigel 77
959 nigel 81 *minp = min;
960     *maxp = max;
961 nigel 77 return p;
962     }
963    
964    
965    
966     /*************************************************
967 nigel 93 * Find forward referenced subpattern *
968 nigel 91 *************************************************/
969    
970 nigel 93 /* This function scans along a pattern's text looking for capturing
971     subpatterns, and counting them. If it finds a named pattern that matches the
972     name it is given, it returns its number. Alternatively, if the name is NULL, it
973     returns when it reaches a given numbered subpattern. This is used for forward
974     references to subpatterns. We know that if (?P< is encountered, the name will
975     be terminated by '>' because that is checked in the first pass.
976 nigel 91
977     Arguments:
978 nigel 93 ptr current position in the pattern
979     count current count of capturing parens so far encountered
980     name name to seek, or NULL if seeking a numbered subpattern
981     lorn name length, or subpattern number if name is NULL
982     xmode TRUE if we are in /x mode
983 nigel 91
984     Returns: the number of the named subpattern, or -1 if not found
985     */
986    
987     static int
988 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
989     BOOL xmode)
990 nigel 91 {
991     const uschar *thisname;
992 nigel 93
993 nigel 91 for (; *ptr != 0; ptr++)
994     {
995 nigel 93 int term;
996    
997     /* Skip over backslashed characters and also entire \Q...\E */
998    
999     if (*ptr == '\\')
1000     {
1001     if (*(++ptr) == 0) return -1;
1002     if (*ptr == 'Q') for (;;)
1003     {
1004     while (*(++ptr) != 0 && *ptr != '\\');
1005     if (*ptr == 0) return -1;
1006     if (*(++ptr) == 'E') break;
1007     }
1008     continue;
1009     }
1010    
1011     /* Skip over character classes */
1012    
1013     if (*ptr == '[')
1014     {
1015     while (*(++ptr) != ']')
1016     {
1017 ph10 220 if (*ptr == 0) return -1;
1018 nigel 93 if (*ptr == '\\')
1019     {
1020     if (*(++ptr) == 0) return -1;
1021     if (*ptr == 'Q') for (;;)
1022     {
1023     while (*(++ptr) != 0 && *ptr != '\\');
1024     if (*ptr == 0) return -1;
1025     if (*(++ptr) == 'E') break;
1026     }
1027     continue;
1028     }
1029     }
1030     continue;
1031     }
1032    
1033     /* Skip comments in /x mode */
1034    
1035     if (xmode && *ptr == '#')
1036     {
1037     while (*(++ptr) != 0 && *ptr != '\n');
1038     if (*ptr == 0) return -1;
1039     continue;
1040     }
1041    
1042     /* An opening parens must now be a real metacharacter */
1043    
1044 nigel 91 if (*ptr != '(') continue;
1045 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1046 nigel 93 {
1047     count++;
1048     if (name == NULL && count == lorn) return count;
1049     continue;
1050     }
1051    
1052     ptr += 2;
1053     if (*ptr == 'P') ptr++; /* Allow optional P */
1054    
1055     /* We have to disambiguate (?<! and (?<= from (?<name> */
1056    
1057     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1058     *ptr != '\'')
1059     continue;
1060    
1061 nigel 91 count++;
1062 nigel 93
1063     if (name == NULL && count == lorn) return count;
1064     term = *ptr++;
1065     if (term == '<') term = '>';
1066 nigel 91 thisname = ptr;
1067 nigel 93 while (*ptr != term) ptr++;
1068     if (name != NULL && lorn == ptr - thisname &&
1069     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1070 nigel 91 return count;
1071     }
1072 nigel 93
1073 nigel 91 return -1;
1074     }
1075    
1076    
1077    
1078     /*************************************************
1079 nigel 77 * Find first significant op code *
1080     *************************************************/
1081    
1082     /* This is called by several functions that scan a compiled expression looking
1083     for a fixed first character, or an anchoring op code etc. It skips over things
1084     that do not influence this. For some calls, a change of option is important.
1085     For some calls, it makes sense to skip negative forward and all backward
1086     assertions, and also the \b assertion; for others it does not.
1087    
1088     Arguments:
1089     code pointer to the start of the group
1090     options pointer to external options
1091     optbit the option bit whose changing is significant, or
1092     zero if none are
1093     skipassert TRUE if certain assertions are to be skipped
1094    
1095     Returns: pointer to the first significant opcode
1096     */
1097    
1098     static const uschar*
1099     first_significant_code(const uschar *code, int *options, int optbit,
1100     BOOL skipassert)
1101     {
1102     for (;;)
1103     {
1104     switch ((int)*code)
1105     {
1106     case OP_OPT:
1107     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1108     *options = (int)code[1];
1109     code += 2;
1110     break;
1111    
1112     case OP_ASSERT_NOT:
1113     case OP_ASSERTBACK:
1114     case OP_ASSERTBACK_NOT:
1115     if (!skipassert) return code;
1116     do code += GET(code, 1); while (*code == OP_ALT);
1117     code += _pcre_OP_lengths[*code];
1118     break;
1119    
1120     case OP_WORD_BOUNDARY:
1121     case OP_NOT_WORD_BOUNDARY:
1122     if (!skipassert) return code;
1123     /* Fall through */
1124    
1125     case OP_CALLOUT:
1126     case OP_CREF:
1127 nigel 93 case OP_RREF:
1128     case OP_DEF:
1129 nigel 77 code += _pcre_OP_lengths[*code];
1130     break;
1131    
1132     default:
1133     return code;
1134     }
1135     }
1136     /* Control never reaches here */
1137     }
1138    
1139    
1140    
1141    
1142     /*************************************************
1143     * Find the fixed length of a pattern *
1144     *************************************************/
1145    
1146     /* Scan a pattern and compute the fixed length of subject that will match it,
1147     if the length is fixed. This is needed for dealing with backward assertions.
1148     In UTF8 mode, the result is in characters rather than bytes.
1149    
1150     Arguments:
1151     code points to the start of the pattern (the bracket)
1152     options the compiling options
1153    
1154     Returns: the fixed length, or -1 if there is no fixed length,
1155     or -2 if \C was encountered
1156     */
1157    
1158     static int
1159     find_fixedlength(uschar *code, int options)
1160     {
1161     int length = -1;
1162    
1163     register int branchlength = 0;
1164     register uschar *cc = code + 1 + LINK_SIZE;
1165    
1166     /* Scan along the opcodes for this branch. If we get to the end of the
1167     branch, check the length against that of the other branches. */
1168    
1169     for (;;)
1170     {
1171     int d;
1172     register int op = *cc;
1173     switch (op)
1174     {
1175 nigel 93 case OP_CBRA:
1176 nigel 77 case OP_BRA:
1177     case OP_ONCE:
1178     case OP_COND:
1179 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1180 nigel 77 if (d < 0) return d;
1181     branchlength += d;
1182     do cc += GET(cc, 1); while (*cc == OP_ALT);
1183     cc += 1 + LINK_SIZE;
1184     break;
1185    
1186     /* Reached end of a branch; if it's a ket it is the end of a nested
1187     call. If it's ALT it is an alternation in a nested call. If it is
1188     END it's the end of the outer call. All can be handled by the same code. */
1189    
1190     case OP_ALT:
1191     case OP_KET:
1192     case OP_KETRMAX:
1193     case OP_KETRMIN:
1194     case OP_END:
1195     if (length < 0) length = branchlength;
1196     else if (length != branchlength) return -1;
1197     if (*cc != OP_ALT) return length;
1198     cc += 1 + LINK_SIZE;
1199     branchlength = 0;
1200     break;
1201    
1202     /* Skip over assertive subpatterns */
1203    
1204     case OP_ASSERT:
1205     case OP_ASSERT_NOT:
1206     case OP_ASSERTBACK:
1207     case OP_ASSERTBACK_NOT:
1208     do cc += GET(cc, 1); while (*cc == OP_ALT);
1209     /* Fall through */
1210    
1211     /* Skip over things that don't match chars */
1212    
1213     case OP_REVERSE:
1214     case OP_CREF:
1215 nigel 93 case OP_RREF:
1216     case OP_DEF:
1217 nigel 77 case OP_OPT:
1218     case OP_CALLOUT:
1219     case OP_SOD:
1220     case OP_SOM:
1221     case OP_EOD:
1222     case OP_EODN:
1223     case OP_CIRC:
1224     case OP_DOLL:
1225     case OP_NOT_WORD_BOUNDARY:
1226     case OP_WORD_BOUNDARY:
1227     cc += _pcre_OP_lengths[*cc];
1228     break;
1229    
1230     /* Handle literal characters */
1231    
1232     case OP_CHAR:
1233     case OP_CHARNC:
1234 nigel 91 case OP_NOT:
1235 nigel 77 branchlength++;
1236     cc += 2;
1237     #ifdef SUPPORT_UTF8
1238     if ((options & PCRE_UTF8) != 0)
1239     {
1240     while ((*cc & 0xc0) == 0x80) cc++;
1241     }
1242     #endif
1243     break;
1244    
1245     /* Handle exact repetitions. The count is already in characters, but we
1246     need to skip over a multibyte character in UTF8 mode. */
1247    
1248     case OP_EXACT:
1249     branchlength += GET2(cc,1);
1250     cc += 4;
1251     #ifdef SUPPORT_UTF8
1252     if ((options & PCRE_UTF8) != 0)
1253     {
1254     while((*cc & 0x80) == 0x80) cc++;
1255     }
1256     #endif
1257     break;
1258    
1259     case OP_TYPEEXACT:
1260     branchlength += GET2(cc,1);
1261 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1262 nigel 77 cc += 4;
1263     break;
1264    
1265     /* Handle single-char matchers */
1266    
1267     case OP_PROP:
1268     case OP_NOTPROP:
1269 nigel 87 cc += 2;
1270 nigel 77 /* Fall through */
1271    
1272     case OP_NOT_DIGIT:
1273     case OP_DIGIT:
1274     case OP_NOT_WHITESPACE:
1275     case OP_WHITESPACE:
1276     case OP_NOT_WORDCHAR:
1277     case OP_WORDCHAR:
1278     case OP_ANY:
1279     branchlength++;
1280     cc++;
1281     break;
1282    
1283     /* The single-byte matcher isn't allowed */
1284    
1285     case OP_ANYBYTE:
1286     return -2;
1287    
1288     /* Check a class for variable quantification */
1289    
1290     #ifdef SUPPORT_UTF8
1291     case OP_XCLASS:
1292     cc += GET(cc, 1) - 33;
1293     /* Fall through */
1294     #endif
1295    
1296     case OP_CLASS:
1297     case OP_NCLASS:
1298     cc += 33;
1299    
1300     switch (*cc)
1301     {
1302     case OP_CRSTAR:
1303     case OP_CRMINSTAR:
1304     case OP_CRQUERY:
1305     case OP_CRMINQUERY:
1306     return -1;
1307    
1308     case OP_CRRANGE:
1309     case OP_CRMINRANGE:
1310     if (GET2(cc,1) != GET2(cc,3)) return -1;
1311     branchlength += GET2(cc,1);
1312     cc += 5;
1313     break;
1314    
1315     default:
1316     branchlength++;
1317     }
1318     break;
1319    
1320     /* Anything else is variable length */
1321    
1322     default:
1323     return -1;
1324     }
1325     }
1326     /* Control never gets here */
1327     }
1328    
1329    
1330    
1331    
1332     /*************************************************
1333     * Scan compiled regex for numbered bracket *
1334     *************************************************/
1335    
1336     /* This little function scans through a compiled pattern until it finds a
1337     capturing bracket with the given number.
1338    
1339     Arguments:
1340     code points to start of expression
1341     utf8 TRUE in UTF-8 mode
1342     number the required bracket number
1343    
1344     Returns: pointer to the opcode for the bracket, or NULL if not found
1345     */
1346    
1347     static const uschar *
1348     find_bracket(const uschar *code, BOOL utf8, int number)
1349     {
1350     for (;;)
1351     {
1352     register int c = *code;
1353     if (c == OP_END) return NULL;
1354 nigel 91
1355     /* XCLASS is used for classes that cannot be represented just by a bit
1356     map. This includes negated single high-valued characters. The length in
1357     the table is zero; the actual length is stored in the compiled code. */
1358    
1359     if (c == OP_XCLASS) code += GET(code, 1);
1360    
1361 nigel 93 /* Handle capturing bracket */
1362 nigel 91
1363 nigel 93 else if (c == OP_CBRA)
1364 nigel 77 {
1365 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1366 nigel 77 if (n == number) return (uschar *)code;
1367 nigel 93 code += _pcre_OP_lengths[c];
1368 nigel 77 }
1369 nigel 91
1370 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1371     repeated character types, we have to test for \p and \P, which have an extra
1372 ph10 218 two bytes of parameters. */
1373 nigel 91
1374 nigel 77 else
1375     {
1376 ph10 218 switch(c)
1377     {
1378     case OP_TYPESTAR:
1379     case OP_TYPEMINSTAR:
1380     case OP_TYPEPLUS:
1381     case OP_TYPEMINPLUS:
1382     case OP_TYPEQUERY:
1383     case OP_TYPEMINQUERY:
1384     case OP_TYPEPOSSTAR:
1385     case OP_TYPEPOSPLUS:
1386     case OP_TYPEPOSQUERY:
1387     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1388 ph10 220 break;
1389 ph10 221
1390     case OP_TYPEUPTO:
1391     case OP_TYPEMINUPTO:
1392     case OP_TYPEEXACT:
1393     case OP_TYPEPOSUPTO:
1394     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1395     break;
1396 ph10 220 }
1397    
1398 ph10 218 /* Add in the fixed length from the table */
1399 ph10 220
1400 nigel 77 code += _pcre_OP_lengths[c];
1401 ph10 220
1402 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1403     a multi-byte character. The length in the table is a minimum, so we have to
1404     arrange to skip the extra bytes. */
1405 ph10 220
1406 ph10 107 #ifdef SUPPORT_UTF8
1407 nigel 77 if (utf8) switch(c)
1408     {
1409     case OP_CHAR:
1410     case OP_CHARNC:
1411     case OP_EXACT:
1412     case OP_UPTO:
1413     case OP_MINUPTO:
1414 nigel 93 case OP_POSUPTO:
1415 nigel 77 case OP_STAR:
1416     case OP_MINSTAR:
1417 nigel 93 case OP_POSSTAR:
1418 nigel 77 case OP_PLUS:
1419     case OP_MINPLUS:
1420 nigel 93 case OP_POSPLUS:
1421 nigel 77 case OP_QUERY:
1422     case OP_MINQUERY:
1423 nigel 93 case OP_POSQUERY:
1424     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1425 nigel 77 break;
1426     }
1427 ph10 111 #endif
1428 nigel 77 }
1429     }
1430     }
1431    
1432    
1433    
1434     /*************************************************
1435     * Scan compiled regex for recursion reference *
1436     *************************************************/
1437    
1438     /* This little function scans through a compiled pattern until it finds an
1439     instance of OP_RECURSE.
1440    
1441     Arguments:
1442     code points to start of expression
1443     utf8 TRUE in UTF-8 mode
1444    
1445     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1446     */
1447    
1448     static const uschar *
1449     find_recurse(const uschar *code, BOOL utf8)
1450     {
1451     for (;;)
1452     {
1453     register int c = *code;
1454     if (c == OP_END) return NULL;
1455 nigel 91 if (c == OP_RECURSE) return code;
1456 ph10 220
1457 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1458     map. This includes negated single high-valued characters. The length in
1459     the table is zero; the actual length is stored in the compiled code. */
1460    
1461     if (c == OP_XCLASS) code += GET(code, 1);
1462    
1463 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1464     repeated character types, we have to test for \p and \P, which have an extra
1465 ph10 218 two bytes of parameters. */
1466 nigel 91
1467 nigel 77 else
1468     {
1469 ph10 218 switch(c)
1470     {
1471     case OP_TYPESTAR:
1472     case OP_TYPEMINSTAR:
1473     case OP_TYPEPLUS:
1474     case OP_TYPEMINPLUS:
1475     case OP_TYPEQUERY:
1476     case OP_TYPEMINQUERY:
1477     case OP_TYPEPOSSTAR:
1478     case OP_TYPEPOSPLUS:
1479     case OP_TYPEPOSQUERY:
1480     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1481 ph10 220 break;
1482 ph10 221
1483     case OP_TYPEPOSUPTO:
1484     case OP_TYPEUPTO:
1485     case OP_TYPEMINUPTO:
1486     case OP_TYPEEXACT:
1487     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1488     break;
1489 ph10 220 }
1490    
1491 ph10 218 /* Add in the fixed length from the table */
1492    
1493 nigel 77 code += _pcre_OP_lengths[c];
1494 ph10 220
1495 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1496     by a multi-byte character. The length in the table is a minimum, so we have
1497     to arrange to skip the extra bytes. */
1498 ph10 220
1499 ph10 107 #ifdef SUPPORT_UTF8
1500 nigel 77 if (utf8) switch(c)
1501     {
1502     case OP_CHAR:
1503     case OP_CHARNC:
1504     case OP_EXACT:
1505     case OP_UPTO:
1506     case OP_MINUPTO:
1507 nigel 93 case OP_POSUPTO:
1508 nigel 77 case OP_STAR:
1509     case OP_MINSTAR:
1510 nigel 93 case OP_POSSTAR:
1511 nigel 77 case OP_PLUS:
1512     case OP_MINPLUS:
1513 nigel 93 case OP_POSPLUS:
1514 nigel 77 case OP_QUERY:
1515     case OP_MINQUERY:
1516 nigel 93 case OP_POSQUERY:
1517     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1518 nigel 77 break;
1519     }
1520 ph10 111 #endif
1521 nigel 77 }
1522     }
1523     }
1524    
1525    
1526    
1527     /*************************************************
1528     * Scan compiled branch for non-emptiness *
1529     *************************************************/
1530    
1531     /* This function scans through a branch of a compiled pattern to see whether it
1532 nigel 93 can match the empty string or not. It is called from could_be_empty()
1533     below and from compile_branch() when checking for an unlimited repeat of a
1534     group that can match nothing. Note that first_significant_code() skips over
1535 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1536     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1537     bracket whose current branch will already have been scanned.
1538 nigel 77
1539     Arguments:
1540     code points to start of search
1541     endcode points to where to stop
1542     utf8 TRUE if in UTF8 mode
1543    
1544     Returns: TRUE if what is matched could be empty
1545     */
1546    
1547     static BOOL
1548     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1549     {
1550     register int c;
1551 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1552 nigel 77 code < endcode;
1553     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1554     {
1555     const uschar *ccode;
1556    
1557     c = *code;
1558 ph10 286
1559     /* Skip over forward assertions; the other assertions are skipped by
1560 ph10 282 first_significant_code() with a TRUE final argument. */
1561 ph10 286
1562 ph10 282 if (c == OP_ASSERT)
1563 ph10 286 {
1564 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1565     c = *code;
1566     continue;
1567 ph10 286 }
1568 ph10 172
1569 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1570 nigel 77
1571 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1572 ph10 170 {
1573 ph10 172 code += _pcre_OP_lengths[c];
1574 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1575     c = *code;
1576     continue;
1577     }
1578    
1579     /* For other groups, scan the branches. */
1580 ph10 172
1581 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1582 nigel 77 {
1583     BOOL empty_branch;
1584     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1585    
1586     /* Scan a closed bracket */
1587    
1588     empty_branch = FALSE;
1589     do
1590     {
1591     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1592     empty_branch = TRUE;
1593     code += GET(code, 1);
1594     }
1595     while (*code == OP_ALT);
1596     if (!empty_branch) return FALSE; /* All branches are non-empty */
1597 ph10 172 c = *code;
1598 nigel 93 continue;
1599 nigel 77 }
1600    
1601 nigel 93 /* Handle the other opcodes */
1602    
1603     switch (c)
1604 nigel 77 {
1605 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1606     cannot be represented just by a bit map. This includes negated single
1607     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1608 ph10 220 actual length is stored in the compiled code, so we must update "code"
1609 ph10 216 here. */
1610 nigel 77
1611     #ifdef SUPPORT_UTF8
1612     case OP_XCLASS:
1613 ph10 216 ccode = code += GET(code, 1);
1614 nigel 77 goto CHECK_CLASS_REPEAT;
1615     #endif
1616    
1617     case OP_CLASS:
1618     case OP_NCLASS:
1619     ccode = code + 33;
1620    
1621     #ifdef SUPPORT_UTF8
1622     CHECK_CLASS_REPEAT:
1623     #endif
1624    
1625     switch (*ccode)
1626     {
1627     case OP_CRSTAR: /* These could be empty; continue */
1628     case OP_CRMINSTAR:
1629     case OP_CRQUERY:
1630     case OP_CRMINQUERY:
1631     break;
1632    
1633     default: /* Non-repeat => class must match */
1634     case OP_CRPLUS: /* These repeats aren't empty */
1635     case OP_CRMINPLUS:
1636     return FALSE;
1637    
1638     case OP_CRRANGE:
1639     case OP_CRMINRANGE:
1640     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1641     break;
1642     }
1643     break;
1644    
1645     /* Opcodes that must match a character */
1646    
1647     case OP_PROP:
1648     case OP_NOTPROP:
1649     case OP_EXTUNI:
1650     case OP_NOT_DIGIT:
1651     case OP_DIGIT:
1652     case OP_NOT_WHITESPACE:
1653     case OP_WHITESPACE:
1654     case OP_NOT_WORDCHAR:
1655     case OP_WORDCHAR:
1656     case OP_ANY:
1657     case OP_ANYBYTE:
1658     case OP_CHAR:
1659     case OP_CHARNC:
1660     case OP_NOT:
1661     case OP_PLUS:
1662     case OP_MINPLUS:
1663 nigel 93 case OP_POSPLUS:
1664 nigel 77 case OP_EXACT:
1665     case OP_NOTPLUS:
1666     case OP_NOTMINPLUS:
1667 nigel 93 case OP_NOTPOSPLUS:
1668 nigel 77 case OP_NOTEXACT:
1669     case OP_TYPEPLUS:
1670     case OP_TYPEMINPLUS:
1671 nigel 93 case OP_TYPEPOSPLUS:
1672 nigel 77 case OP_TYPEEXACT:
1673     return FALSE;
1674 ph10 227
1675     /* These are going to continue, as they may be empty, but we have to
1676     fudge the length for the \p and \P cases. */
1677    
1678 ph10 224 case OP_TYPESTAR:
1679     case OP_TYPEMINSTAR:
1680     case OP_TYPEPOSSTAR:
1681     case OP_TYPEQUERY:
1682     case OP_TYPEMINQUERY:
1683     case OP_TYPEPOSQUERY:
1684     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1685 ph10 227 break;
1686    
1687 ph10 224 /* Same for these */
1688 ph10 227
1689 ph10 224 case OP_TYPEUPTO:
1690     case OP_TYPEMINUPTO:
1691     case OP_TYPEPOSUPTO:
1692     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1693     break;
1694 nigel 77
1695     /* End of branch */
1696    
1697     case OP_KET:
1698     case OP_KETRMAX:
1699     case OP_KETRMIN:
1700     case OP_ALT:
1701     return TRUE;
1702    
1703 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1704     MINUPTO, and POSUPTO may be followed by a multibyte character */
1705 nigel 77
1706     #ifdef SUPPORT_UTF8
1707     case OP_STAR:
1708     case OP_MINSTAR:
1709 nigel 93 case OP_POSSTAR:
1710 nigel 77 case OP_QUERY:
1711     case OP_MINQUERY:
1712 nigel 93 case OP_POSQUERY:
1713 nigel 77 case OP_UPTO:
1714     case OP_MINUPTO:
1715 nigel 93 case OP_POSUPTO:
1716 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1717     break;
1718     #endif
1719     }
1720     }
1721    
1722     return TRUE;
1723     }
1724    
1725    
1726    
1727     /*************************************************
1728     * Scan compiled regex for non-emptiness *
1729     *************************************************/
1730    
1731     /* This function is called to check for left recursive calls. We want to check
1732     the current branch of the current pattern to see if it could match the empty
1733     string. If it could, we must look outwards for branches at other levels,
1734     stopping when we pass beyond the bracket which is the subject of the recursion.
1735    
1736     Arguments:
1737     code points to start of the recursion
1738     endcode points to where to stop (current RECURSE item)
1739     bcptr points to the chain of current (unclosed) branch starts
1740     utf8 TRUE if in UTF-8 mode
1741    
1742     Returns: TRUE if what is matched could be empty
1743     */
1744    
1745     static BOOL
1746     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1747     BOOL utf8)
1748     {
1749     while (bcptr != NULL && bcptr->current >= code)
1750     {
1751     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1752     bcptr = bcptr->outer;
1753     }
1754     return TRUE;
1755     }
1756    
1757    
1758    
1759     /*************************************************
1760     * Check for POSIX class syntax *
1761     *************************************************/
1762    
1763     /* This function is called when the sequence "[:" or "[." or "[=" is
1764 ph10 295 encountered in a character class. It checks whether this is followed by a
1765 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1766 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1767 nigel 77
1768 ph10 298 Originally, this function only recognized a sequence of letters between the
1769     terminators, but it seems that Perl recognizes any sequence of characters,
1770     though of course unknown POSIX names are subsequently rejected. Perl gives an
1771     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1772     didn't consider this to be a POSIX class. Likewise for [:1234:].
1773 ph10 295
1774 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1775     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1776     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1777     below handles the special case of \], but does not try to do any other escape
1778     processing. This makes it different from Perl for cases such as [:l\ower:]
1779 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1780 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1781 ph10 295 I think.
1782    
1783     Arguments:
1784 nigel 77 ptr pointer to the initial [
1785     endptr where to return the end pointer
1786    
1787     Returns: TRUE or FALSE
1788     */
1789    
1790     static BOOL
1791 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1792 nigel 77 {
1793     int terminator; /* Don't combine these lines; the Solaris cc */
1794     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1795 ph10 295 for (++ptr; *ptr != 0; ptr++)
1796 nigel 77 {
1797 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1798 ph10 298 {
1799     if (*ptr == ']') return FALSE;
1800 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1801     {
1802     *endptr = ptr;
1803     return TRUE;
1804 ph10 298 }
1805     }
1806     }
1807 nigel 77 return FALSE;
1808     }
1809    
1810    
1811    
1812    
1813     /*************************************************
1814     * Check POSIX class name *
1815     *************************************************/
1816    
1817     /* This function is called to check the name given in a POSIX-style class entry
1818     such as [:alnum:].
1819    
1820     Arguments:
1821     ptr points to the first letter
1822     len the length of the name
1823    
1824     Returns: a value representing the name, or -1 if unknown
1825     */
1826    
1827     static int
1828     check_posix_name(const uschar *ptr, int len)
1829     {
1830 ph10 240 const char *pn = posix_names;
1831 nigel 77 register int yield = 0;
1832     while (posix_name_lengths[yield] != 0)
1833     {
1834     if (len == posix_name_lengths[yield] &&
1835 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1836 ph10 243 pn += posix_name_lengths[yield] + 1;
1837 nigel 77 yield++;
1838     }
1839     return -1;
1840     }
1841    
1842    
1843     /*************************************************
1844     * Adjust OP_RECURSE items in repeated group *
1845     *************************************************/
1846    
1847     /* OP_RECURSE items contain an offset from the start of the regex to the group
1848     that is referenced. This means that groups can be replicated for fixed
1849     repetition simply by copying (because the recursion is allowed to refer to
1850     earlier groups that are outside the current group). However, when a group is
1851 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1852     inserted before it, after it has been compiled. This means that any OP_RECURSE
1853     items within it that refer to the group itself or any contained groups have to
1854     have their offsets adjusted. That is one of the jobs of this function. Before it
1855     is called, the partially compiled regex must be temporarily terminated with
1856     OP_END.
1857 nigel 77
1858 nigel 93 This function has been extended with the possibility of forward references for
1859     recursions and subroutine calls. It must also check the list of such references
1860     for the group we are dealing with. If it finds that one of the recursions in
1861     the current group is on this list, it adjusts the offset in the list, not the
1862     value in the reference (which is a group number).
1863    
1864 nigel 77 Arguments:
1865     group points to the start of the group
1866     adjust the amount by which the group is to be moved
1867     utf8 TRUE in UTF-8 mode
1868     cd contains pointers to tables etc.
1869 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1870 nigel 77
1871     Returns: nothing
1872     */
1873    
1874     static void
1875 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1876     uschar *save_hwm)
1877 nigel 77 {
1878     uschar *ptr = group;
1879 ph10 224
1880 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1881     {
1882 nigel 93 int offset;
1883     uschar *hc;
1884    
1885     /* See if this recursion is on the forward reference list. If so, adjust the
1886     reference. */
1887 ph10 334
1888 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1889     {
1890     offset = GET(hc, 0);
1891     if (cd->start_code + offset == ptr + 1)
1892     {
1893     PUT(hc, 0, offset + adjust);
1894     break;
1895     }
1896     }
1897    
1898     /* Otherwise, adjust the recursion offset if it's after the start of this
1899     group. */
1900    
1901     if (hc >= cd->hwm)
1902     {
1903     offset = GET(ptr, 1);
1904     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1905     }
1906    
1907 nigel 77 ptr += 1 + LINK_SIZE;
1908     }
1909     }
1910    
1911    
1912    
1913     /*************************************************
1914     * Insert an automatic callout point *
1915     *************************************************/
1916    
1917     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1918     callout points before each pattern item.
1919    
1920     Arguments:
1921     code current code pointer
1922     ptr current pattern pointer
1923     cd pointers to tables etc
1924    
1925     Returns: new code pointer
1926     */
1927    
1928     static uschar *
1929     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1930     {
1931     *code++ = OP_CALLOUT;
1932     *code++ = 255;
1933     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1934     PUT(code, LINK_SIZE, 0); /* Default length */
1935     return code + 2*LINK_SIZE;
1936     }
1937    
1938    
1939    
1940     /*************************************************
1941     * Complete a callout item *
1942     *************************************************/
1943    
1944     /* A callout item contains the length of the next item in the pattern, which
1945     we can't fill in till after we have reached the relevant point. This is used
1946     for both automatic and manual callouts.
1947    
1948     Arguments:
1949     previous_callout points to previous callout item
1950     ptr current pattern pointer
1951     cd pointers to tables etc
1952    
1953     Returns: nothing
1954     */
1955    
1956     static void
1957     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1958     {
1959     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1960     PUT(previous_callout, 2 + LINK_SIZE, length);
1961     }
1962    
1963    
1964    
1965     #ifdef SUPPORT_UCP
1966     /*************************************************
1967     * Get othercase range *
1968     *************************************************/
1969    
1970     /* This function is passed the start and end of a class range, in UTF-8 mode
1971     with UCP support. It searches up the characters, looking for internal ranges of
1972     characters in the "other" case. Each call returns the next one, updating the
1973     start address.
1974    
1975     Arguments:
1976     cptr points to starting character value; updated
1977     d end value
1978     ocptr where to put start of othercase range
1979     odptr where to put end of othercase range
1980    
1981     Yield: TRUE when range returned; FALSE when no more
1982     */
1983    
1984     static BOOL
1985 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1986     unsigned int *odptr)
1987 nigel 77 {
1988 nigel 93 unsigned int c, othercase, next;
1989 nigel 77
1990     for (c = *cptr; c <= d; c++)
1991 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1992 nigel 77
1993     if (c > d) return FALSE;
1994    
1995     *ocptr = othercase;
1996     next = othercase + 1;
1997    
1998     for (++c; c <= d; c++)
1999     {
2000 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
2001 nigel 77 next++;
2002     }
2003    
2004     *odptr = next - 1;
2005     *cptr = c;
2006    
2007     return TRUE;
2008     }
2009     #endif /* SUPPORT_UCP */
2010    
2011    
2012 nigel 93
2013 nigel 77 /*************************************************
2014 nigel 93 * Check if auto-possessifying is possible *
2015     *************************************************/
2016    
2017     /* This function is called for unlimited repeats of certain items, to see
2018     whether the next thing could possibly match the repeated item. If not, it makes
2019     sense to automatically possessify the repeated item.
2020    
2021     Arguments:
2022     op_code the repeated op code
2023     item data for this item, depends on the opcode
2024     utf8 TRUE in UTF-8 mode
2025     utf8_char used for utf8 character bytes, NULL if not relevant
2026     ptr next character in pattern
2027     options options bits
2028     cd contains pointers to tables etc.
2029    
2030     Returns: TRUE if possessifying is wanted
2031     */
2032    
2033     static BOOL
2034     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2035     const uschar *ptr, int options, compile_data *cd)
2036     {
2037     int next;
2038    
2039     /* Skip whitespace and comments in extended mode */
2040    
2041     if ((options & PCRE_EXTENDED) != 0)
2042     {
2043     for (;;)
2044     {
2045     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2046     if (*ptr == '#')
2047     {
2048     while (*(++ptr) != 0)
2049     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2050     }
2051     else break;
2052     }
2053     }
2054    
2055     /* If the next item is one that we can handle, get its value. A non-negative
2056     value is a character, a negative value is an escape value. */
2057    
2058     if (*ptr == '\\')
2059     {
2060     int temperrorcode = 0;
2061     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2062     if (temperrorcode != 0) return FALSE;
2063     ptr++; /* Point after the escape sequence */
2064     }
2065    
2066     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2067     {
2068     #ifdef SUPPORT_UTF8
2069     if (utf8) { GETCHARINC(next, ptr); } else
2070     #endif
2071     next = *ptr++;
2072     }
2073    
2074     else return FALSE;
2075    
2076     /* Skip whitespace and comments in extended mode */
2077    
2078     if ((options & PCRE_EXTENDED) != 0)
2079     {
2080     for (;;)
2081     {
2082     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2083     if (*ptr == '#')
2084     {
2085     while (*(++ptr) != 0)
2086     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2087     }
2088     else break;
2089     }
2090     }
2091    
2092     /* If the next thing is itself optional, we have to give up. */
2093    
2094     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2095     return FALSE;
2096    
2097     /* Now compare the next item with the previous opcode. If the previous is a
2098     positive single character match, "item" either contains the character or, if
2099     "item" is greater than 127 in utf8 mode, the character's bytes are in
2100     utf8_char. */
2101    
2102    
2103     /* Handle cases when the next item is a character. */
2104    
2105     if (next >= 0) switch(op_code)
2106     {
2107     case OP_CHAR:
2108     #ifdef SUPPORT_UTF8
2109     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2110     #endif
2111     return item != next;
2112    
2113     /* For CHARNC (caseless character) we must check the other case. If we have
2114     Unicode property support, we can use it to test the other case of
2115     high-valued characters. */
2116    
2117     case OP_CHARNC:
2118     #ifdef SUPPORT_UTF8
2119     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2120     #endif
2121     if (item == next) return FALSE;
2122     #ifdef SUPPORT_UTF8
2123     if (utf8)
2124     {
2125     unsigned int othercase;
2126     if (next < 128) othercase = cd->fcc[next]; else
2127     #ifdef SUPPORT_UCP
2128     othercase = _pcre_ucp_othercase((unsigned int)next);
2129     #else
2130     othercase = NOTACHAR;
2131     #endif
2132     return (unsigned int)item != othercase;
2133     }
2134     else
2135     #endif /* SUPPORT_UTF8 */
2136     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2137    
2138     /* For OP_NOT, "item" must be a single-byte character. */
2139    
2140     case OP_NOT:
2141     if (item == next) return TRUE;
2142     if ((options & PCRE_CASELESS) == 0) return FALSE;
2143     #ifdef SUPPORT_UTF8
2144     if (utf8)
2145     {
2146     unsigned int othercase;
2147     if (next < 128) othercase = cd->fcc[next]; else
2148     #ifdef SUPPORT_UCP
2149     othercase = _pcre_ucp_othercase(next);
2150     #else
2151     othercase = NOTACHAR;
2152     #endif
2153     return (unsigned int)item == othercase;
2154     }
2155     else
2156     #endif /* SUPPORT_UTF8 */
2157     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2158    
2159     case OP_DIGIT:
2160     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2161    
2162     case OP_NOT_DIGIT:
2163     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2164    
2165     case OP_WHITESPACE:
2166     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2167    
2168     case OP_NOT_WHITESPACE:
2169     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2170    
2171     case OP_WORDCHAR:
2172     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2173    
2174     case OP_NOT_WORDCHAR:
2175     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2176    
2177 ph10 180 case OP_HSPACE:
2178     case OP_NOT_HSPACE:
2179     switch(next)
2180     {
2181     case 0x09:
2182     case 0x20:
2183     case 0xa0:
2184     case 0x1680:
2185     case 0x180e:
2186     case 0x2000:
2187     case 0x2001:
2188     case 0x2002:
2189     case 0x2003:
2190     case 0x2004:
2191     case 0x2005:
2192     case 0x2006:
2193     case 0x2007:
2194     case 0x2008:
2195     case 0x2009:
2196     case 0x200A:
2197     case 0x202f:
2198     case 0x205f:
2199     case 0x3000:
2200     return op_code != OP_HSPACE;
2201     default:
2202     return op_code == OP_HSPACE;
2203     }
2204    
2205     case OP_VSPACE:
2206     case OP_NOT_VSPACE:
2207     switch(next)
2208     {
2209     case 0x0a:
2210     case 0x0b:
2211     case 0x0c:
2212     case 0x0d:
2213     case 0x85:
2214     case 0x2028:
2215     case 0x2029:
2216     return op_code != OP_VSPACE;
2217     default:
2218     return op_code == OP_VSPACE;
2219     }
2220    
2221 nigel 93 default:
2222     return FALSE;
2223     }
2224    
2225    
2226     /* Handle the case when the next item is \d, \s, etc. */
2227    
2228     switch(op_code)
2229     {
2230     case OP_CHAR:
2231     case OP_CHARNC:
2232     #ifdef SUPPORT_UTF8
2233     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2234     #endif
2235     switch(-next)
2236     {
2237     case ESC_d:
2238     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2239    
2240     case ESC_D:
2241     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2242    
2243     case ESC_s:
2244     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2245    
2246     case ESC_S:
2247     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2248    
2249     case ESC_w:
2250     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2251    
2252     case ESC_W:
2253     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2254 ph10 182
2255 ph10 180 case ESC_h:
2256     case ESC_H:
2257     switch(item)
2258     {
2259     case 0x09:
2260     case 0x20:
2261     case 0xa0:
2262     case 0x1680:
2263     case 0x180e:
2264     case 0x2000:
2265     case 0x2001:
2266     case 0x2002:
2267     case 0x2003:
2268     case 0x2004:
2269     case 0x2005:
2270     case 0x2006:
2271     case 0x2007:
2272     case 0x2008:
2273     case 0x2009:
2274     case 0x200A:
2275     case 0x202f:
2276     case 0x205f:
2277     case 0x3000:
2278     return -next != ESC_h;
2279     default:
2280     return -next == ESC_h;
2281 ph10 182 }
2282    
2283 ph10 180 case ESC_v:
2284     case ESC_V:
2285     switch(item)
2286     {
2287     case 0x0a:
2288     case 0x0b:
2289     case 0x0c:
2290     case 0x0d:
2291     case 0x85:
2292     case 0x2028:
2293     case 0x2029:
2294     return -next != ESC_v;
2295     default:
2296     return -next == ESC_v;
2297 ph10 182 }
2298 nigel 93
2299     default:
2300     return FALSE;
2301     }
2302    
2303     case OP_DIGIT:
2304 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2305     next == -ESC_h || next == -ESC_v;
2306 nigel 93
2307     case OP_NOT_DIGIT:
2308     return next == -ESC_d;
2309    
2310     case OP_WHITESPACE:
2311     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2312    
2313     case OP_NOT_WHITESPACE:
2314 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2315 nigel 93
2316 ph10 180 case OP_HSPACE:
2317     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2318    
2319     case OP_NOT_HSPACE:
2320     return next == -ESC_h;
2321 ph10 182
2322 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2323 ph10 182 case OP_VSPACE:
2324 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2325    
2326     case OP_NOT_VSPACE:
2327 ph10 182 return next == -ESC_v;
2328 ph10 180
2329 nigel 93 case OP_WORDCHAR:
2330 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2331 nigel 93
2332     case OP_NOT_WORDCHAR:
2333     return next == -ESC_w || next == -ESC_d;
2334 ph10 182
2335 nigel 93 default:
2336     return FALSE;
2337     }
2338    
2339     /* Control does not reach here */
2340     }
2341    
2342    
2343    
2344     /*************************************************
2345 nigel 77 * Compile one branch *
2346     *************************************************/
2347    
2348 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2349 nigel 77 changed during the branch, the pointer is used to change the external options
2350 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2351     to find out the amount of memory needed, as well as during the real compile
2352     phase. The value of lengthptr distinguishes the two phases.
2353 nigel 77
2354     Arguments:
2355     optionsptr pointer to the option bits
2356     codeptr points to the pointer to the current code point
2357     ptrptr points to the current pattern pointer
2358     errorcodeptr points to error code variable
2359     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2360     reqbyteptr set to the last literal character required, else < 0
2361     bcptr points to current branch chain
2362     cd contains pointers to tables etc.
2363 nigel 93 lengthptr NULL during the real compile phase
2364     points to length accumulator during pre-compile phase
2365 nigel 77
2366     Returns: TRUE on success
2367     FALSE, with *errorcodeptr set non-zero on error
2368     */
2369    
2370     static BOOL
2371 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2372     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2373     compile_data *cd, int *lengthptr)
2374 nigel 77 {
2375     int repeat_type, op_type;
2376     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2377     int bravalue = 0;
2378     int greedy_default, greedy_non_default;
2379     int firstbyte, reqbyte;
2380     int zeroreqbyte, zerofirstbyte;
2381     int req_caseopt, reqvary, tempreqvary;
2382     int options = *optionsptr;
2383     int after_manual_callout = 0;
2384 nigel 93 int length_prevgroup = 0;
2385 nigel 77 register int c;
2386     register uschar *code = *codeptr;
2387 nigel 93 uschar *last_code = code;
2388     uschar *orig_code = code;
2389 nigel 77 uschar *tempcode;
2390     BOOL inescq = FALSE;
2391     BOOL groupsetfirstbyte = FALSE;
2392     const uschar *ptr = *ptrptr;
2393     const uschar *tempptr;
2394     uschar *previous = NULL;
2395     uschar *previous_callout = NULL;
2396 nigel 93 uschar *save_hwm = NULL;
2397 nigel 77 uschar classbits[32];
2398    
2399     #ifdef SUPPORT_UTF8
2400     BOOL class_utf8;
2401     BOOL utf8 = (options & PCRE_UTF8) != 0;
2402     uschar *class_utf8data;
2403 ph10 300 uschar *class_utf8data_base;
2404 nigel 77 uschar utf8_char[6];
2405     #else
2406     BOOL utf8 = FALSE;
2407 nigel 93 uschar *utf8_char = NULL;
2408 nigel 77 #endif
2409    
2410 nigel 93 #ifdef DEBUG
2411     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2412     #endif
2413    
2414 nigel 77 /* Set up the default and non-default settings for greediness */
2415    
2416     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2417     greedy_non_default = greedy_default ^ 1;
2418    
2419     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2420     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2421     matches a non-fixed char first char; reqbyte just remains unset if we never
2422     find one.
2423    
2424     When we hit a repeat whose minimum is zero, we may have to adjust these values
2425     to take the zero repeat into account. This is implemented by setting them to
2426     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2427     item types that can be repeated set these backoff variables appropriately. */
2428    
2429     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2430    
2431     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2432     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2433     value > 255. It is added into the firstbyte or reqbyte variables to record the
2434     case status of the value. This is used only for ASCII characters. */
2435    
2436     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2437    
2438     /* Switch on next character until the end of the branch */
2439    
2440     for (;; ptr++)
2441     {
2442     BOOL negate_class;
2443 ph10 286 BOOL should_flip_negation;
2444 nigel 77 BOOL possessive_quantifier;
2445     BOOL is_quantifier;
2446 nigel 93 BOOL is_recurse;
2447 ph10 180 BOOL reset_bracount;
2448 nigel 77 int class_charcount;
2449     int class_lastchar;
2450     int newoptions;
2451     int recno;
2452 ph10 172 int refsign;
2453 nigel 77 int skipbytes;
2454     int subreqbyte;
2455     int subfirstbyte;
2456 nigel 93 int terminator;
2457 nigel 77 int mclength;
2458     uschar mcbuffer[8];
2459    
2460 nigel 93 /* Get next byte in the pattern */
2461 nigel 77
2462     c = *ptr;
2463 ph10 334
2464 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2465     previous cycle of this loop. */
2466    
2467     if (lengthptr != NULL)
2468     {
2469     #ifdef DEBUG
2470     if (code > cd->hwm) cd->hwm = code; /* High water info */
2471     #endif
2472     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2473     {
2474     *errorcodeptr = ERR52;
2475     goto FAILED;
2476     }
2477    
2478     /* There is at least one situation where code goes backwards: this is the
2479     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2480     the class is simply eliminated. However, it is created first, so we have to
2481     allow memory for it. Therefore, don't ever reduce the length at this point.
2482     */
2483    
2484     if (code < last_code) code = last_code;
2485 ph10 202
2486     /* Paranoid check for integer overflow */
2487    
2488     if (OFLOW_MAX - *lengthptr < code - last_code)
2489     {
2490     *errorcodeptr = ERR20;
2491     goto FAILED;
2492     }
2493    
2494 nigel 93 *lengthptr += code - last_code;
2495     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2496    
2497     /* If "previous" is set and it is not at the start of the work space, move
2498     it back to there, in order to avoid filling up the work space. Otherwise,
2499     if "previous" is NULL, reset the current code pointer to the start. */
2500    
2501     if (previous != NULL)
2502     {
2503     if (previous > orig_code)
2504     {
2505     memmove(orig_code, previous, code - previous);
2506     code -= previous - orig_code;
2507     previous = orig_code;
2508     }
2509     }
2510     else code = orig_code;
2511    
2512     /* Remember where this code item starts so we can pick up the length
2513     next time round. */
2514    
2515     last_code = code;
2516     }
2517    
2518     /* In the real compile phase, just check the workspace used by the forward
2519     reference list. */
2520    
2521     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2522     {
2523     *errorcodeptr = ERR52;
2524     goto FAILED;
2525     }
2526    
2527 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2528    
2529     if (inescq && c != 0)
2530     {
2531     if (c == '\\' && ptr[1] == 'E')
2532     {
2533     inescq = FALSE;
2534     ptr++;
2535     continue;
2536     }
2537     else
2538     {
2539     if (previous_callout != NULL)
2540     {
2541 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2542     complete_callout(previous_callout, ptr, cd);
2543 nigel 77 previous_callout = NULL;
2544     }
2545     if ((options & PCRE_AUTO_CALLOUT) != 0)
2546     {
2547     previous_callout = code;
2548     code = auto_callout(code, ptr, cd);
2549     }
2550     goto NORMAL_CHAR;
2551     }
2552     }
2553    
2554     /* Fill in length of a previous callout, except when the next thing is
2555     a quantifier. */
2556    
2557     is_quantifier = c == '*' || c == '+' || c == '?' ||
2558     (c == '{' && is_counted_repeat(ptr+1));
2559    
2560     if (!is_quantifier && previous_callout != NULL &&
2561     after_manual_callout-- <= 0)
2562     {
2563 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2564     complete_callout(previous_callout, ptr, cd);
2565 nigel 77 previous_callout = NULL;
2566     }
2567    
2568     /* In extended mode, skip white space and comments */
2569    
2570     if ((options & PCRE_EXTENDED) != 0)
2571     {
2572     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2573     if (c == '#')
2574     {
2575 nigel 93 while (*(++ptr) != 0)
2576 nigel 91 {
2577 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2578 nigel 91 }
2579 nigel 93 if (*ptr != 0) continue;
2580    
2581 nigel 91 /* Else fall through to handle end of string */
2582     c = 0;
2583 nigel 77 }
2584     }
2585    
2586     /* No auto callout for quantifiers. */
2587    
2588     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2589     {
2590     previous_callout = code;
2591     code = auto_callout(code, ptr, cd);
2592     }
2593    
2594     switch(c)
2595     {
2596 nigel 93 /* ===================================================================*/
2597     case 0: /* The branch terminates at string end */
2598     case '|': /* or | or ) */
2599 nigel 77 case ')':
2600     *firstbyteptr = firstbyte;
2601     *reqbyteptr = reqbyte;
2602     *codeptr = code;
2603     *ptrptr = ptr;
2604 nigel 93 if (lengthptr != NULL)
2605     {
2606 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2607     {
2608     *errorcodeptr = ERR20;
2609     goto FAILED;
2610     }
2611 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2612     DPRINTF((">> end branch\n"));
2613     }
2614 nigel 77 return TRUE;
2615    
2616 nigel 93
2617     /* ===================================================================*/
2618 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2619     the setting of any following char as a first character. */
2620    
2621     case '^':
2622     if ((options & PCRE_MULTILINE) != 0)
2623     {
2624     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2625     }
2626     previous = NULL;
2627     *code++ = OP_CIRC;
2628     break;
2629    
2630     case '$':
2631     previous = NULL;
2632     *code++ = OP_DOLL;
2633     break;
2634    
2635     /* There can never be a first char if '.' is first, whatever happens about
2636     repeats. The value of reqbyte doesn't change either. */
2637    
2638     case '.':
2639     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2640     zerofirstbyte = firstbyte;
2641     zeroreqbyte = reqbyte;
2642     previous = code;
2643     *code++ = OP_ANY;
2644     break;
2645    
2646 nigel 93
2647     /* ===================================================================*/
2648 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2649     32-byte bitmap of the permitted characters, except in the special case
2650     where there is only one such character. For negated classes, we build the
2651     map as usual, then invert it at the end. However, we use a different opcode
2652     so that data characters > 255 can be handled correctly.
2653 nigel 77
2654     If the class contains characters outside the 0-255 range, a different
2655     opcode is compiled. It may optionally have a bit map for characters < 256,
2656     but those above are explicitly listed afterwards. A flag byte tells
2657     whether the bitmap is present, and whether this is a negated class or not.
2658 ph10 336
2659     In JavaScript compatibility mode, an isolated ']' causes an error. In
2660     default (Perl) mode, it is treated as a data character. */
2661    
2662     case ']':
2663     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2664     {
2665     *errorcodeptr = ERR64;
2666     goto FAILED;
2667     }
2668     goto NORMAL_CHAR;
2669 nigel 77
2670     case '[':
2671     previous = code;
2672    
2673     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2674     they are encountered at the top level, so we'll do that too. */
2675    
2676     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2677 ph10 295 check_posix_syntax(ptr, &tempptr))
2678 nigel 77 {
2679     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2680     goto FAILED;
2681     }
2682    
2683 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2684 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2685 ph10 205 skip them too. This makes for compatibility with Perl. */
2686 ph10 208
2687 ph10 205 negate_class = FALSE;
2688     for (;;)
2689 nigel 77 {
2690     c = *(++ptr);
2691 ph10 205 if (c == '\\')
2692     {
2693 ph10 208 if (ptr[1] == 'E') ptr++;
2694 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2695 ph10 208 else break;
2696 ph10 205 }
2697     else if (!negate_class && c == '^')
2698     negate_class = TRUE;
2699     else break;
2700 ph10 208 }
2701 nigel 77
2702 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2703     negation flag at the end, so that support for characters > 255 works
2704 ph10 264 correctly (they are all included in the class). */
2705    
2706     should_flip_negation = FALSE;
2707    
2708 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2709 nigel 93 of just a single character (as long as it's < 256). However, for higher
2710     valued UTF-8 characters, we don't yet do any optimization. */
2711 nigel 77
2712     class_charcount = 0;
2713     class_lastchar = -1;
2714    
2715 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2716     temporary bit of memory, in case the class contains only 1 character (less
2717     than 256), because in that case the compiled code doesn't use the bit map.
2718     */
2719    
2720     memset(classbits, 0, 32 * sizeof(uschar));
2721    
2722 nigel 77 #ifdef SUPPORT_UTF8
2723     class_utf8 = FALSE; /* No chars >= 256 */
2724 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2725 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2726 nigel 77 #endif
2727    
2728     /* Process characters until ] is reached. By writing this as a "do" it
2729 nigel 93 means that an initial ] is taken as a data character. At the start of the
2730     loop, c contains the first byte of the character. */
2731 nigel 77
2732 nigel 93 if (c != 0) do
2733 nigel 77 {
2734 nigel 93 const uschar *oldptr;
2735    
2736 nigel 77 #ifdef SUPPORT_UTF8
2737     if (utf8 && c > 127)
2738     { /* Braces are required because the */
2739     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2740     }
2741 ph10 309
2742 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2743 ph10 309 data and reset the pointer. This is so that very large classes that
2744 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2745 ph10 309 (which is on the stack). */
2746    
2747 ph10 300 if (lengthptr != NULL)
2748     {
2749     *lengthptr += class_utf8data - class_utf8data_base;
2750 ph10 309 class_utf8data = class_utf8data_base;
2751     }
2752    
2753 nigel 77 #endif
2754    
2755     /* Inside \Q...\E everything is literal except \E */
2756    
2757     if (inescq)
2758     {
2759 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2760 nigel 77 {
2761 nigel 93 inescq = FALSE; /* Reset literal state */
2762     ptr++; /* Skip the 'E' */
2763     continue; /* Carry on with next */
2764 nigel 77 }
2765 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2766 nigel 77 }
2767    
2768     /* Handle POSIX class names. Perl allows a negation extension of the
2769     form [:^name:]. A square bracket that doesn't match the syntax is
2770     treated as a literal. We also recognize the POSIX constructions
2771     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2772     5.6 and 5.8 do. */
2773    
2774     if (c == '[' &&
2775     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2776 ph10 295 check_posix_syntax(ptr, &tempptr))
2777 nigel 77 {
2778     BOOL local_negate = FALSE;
2779 nigel 87 int posix_class, taboffset, tabopt;
2780 nigel 77 register const uschar *cbits = cd->cbits;
2781 nigel 87 uschar pbits[32];
2782 nigel 77
2783     if (ptr[1] != ':')
2784     {
2785     *errorcodeptr = ERR31;
2786     goto FAILED;
2787     }
2788    
2789     ptr += 2;
2790     if (*ptr == '^')
2791     {
2792     local_negate = TRUE;
2793 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2794 nigel 77 ptr++;
2795     }
2796    
2797     posix_class = check_posix_name(ptr, tempptr - ptr);
2798     if (posix_class < 0)
2799     {
2800     *errorcodeptr = ERR30;
2801     goto FAILED;
2802     }
2803    
2804     /* If matching is caseless, upper and lower are converted to
2805     alpha. This relies on the fact that the class table starts with
2806     alpha, lower, upper as the first 3 entries. */
2807    
2808     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2809     posix_class = 0;
2810    
2811 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2812     because we may be adding and subtracting from it, and we don't want to
2813     subtract bits that may be in the main map already. At the end we or the
2814     result into the bit map that is being built. */
2815 nigel 77
2816     posix_class *= 3;
2817 nigel 87
2818     /* Copy in the first table (always present) */
2819    
2820     memcpy(pbits, cbits + posix_class_maps[posix_class],
2821     32 * sizeof(uschar));
2822    
2823     /* If there is a second table, add or remove it as required. */
2824    
2825     taboffset = posix_class_maps[posix_class + 1];
2826     tabopt = posix_class_maps[posix_class + 2];
2827    
2828     if (taboffset >= 0)
2829 nigel 77 {
2830 nigel 87 if (tabopt >= 0)
2831     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2832 nigel 77 else
2833 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2834 nigel 77 }
2835    
2836 nigel 87 /* Now see if we need to remove any special characters. An option
2837     value of 1 removes vertical space and 2 removes underscore. */
2838    
2839     if (tabopt < 0) tabopt = -tabopt;
2840     if (tabopt == 1) pbits[1] &= ~0x3c;
2841     else if (tabopt == 2) pbits[11] &= 0x7f;
2842    
2843     /* Add the POSIX table or its complement into the main table that is
2844     being built and we are done. */
2845    
2846     if (local_negate)
2847     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2848     else
2849     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2850    
2851 nigel 77 ptr = tempptr + 1;
2852     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2853     continue; /* End of POSIX syntax handling */
2854     }
2855    
2856     /* Backslash may introduce a single character, or it may introduce one
2857 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2858     case. Inside a class (and only there) it is treated as backspace.
2859     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2860 ph10 205 to 'or' into the one we are building. We assume they have more than one
2861 nigel 77 character in them, so set class_charcount bigger than one. */
2862    
2863     if (c == '\\')
2864     {
2865 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2866     if (*errorcodeptr != 0) goto FAILED;
2867 nigel 77
2868 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2869 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2870 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2871 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2872     {
2873     if (ptr[1] == '\\' && ptr[2] == 'E')
2874     {
2875     ptr += 2; /* avoid empty string */
2876     }
2877     else inescq = TRUE;
2878     continue;
2879     }
2880 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2881 nigel 77
2882     if (c < 0)
2883     {
2884     register const uschar *cbits = cd->cbits;
2885     class_charcount += 2; /* Greater than 1 is what matters */
2886 nigel 93
2887     /* Save time by not doing this in the pre-compile phase. */
2888    
2889     if (lengthptr == NULL) switch (-c)
2890 nigel 77 {
2891     case ESC_d:
2892     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2893     continue;
2894    
2895     case ESC_D:
2896 ph10 286 should_flip_negation = TRUE;
2897 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2898     continue;
2899    
2900     case ESC_w:
2901     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2902     continue;
2903    
2904     case ESC_W:
2905 ph10 286 should_flip_negation = TRUE;
2906 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2907     continue;
2908    
2909     case ESC_s:
2910     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2911     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2912     continue;
2913    
2914     case ESC_S:
2915 ph10 286 should_flip_negation = TRUE;
2916 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2917     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2918     continue;
2919    
2920 nigel 93 default: /* Not recognized; fall through */
2921     break; /* Need "default" setting to stop compiler warning. */
2922     }
2923    
2924     /* In the pre-compile phase, just do the recognition. */
2925    
2926     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2927     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2928 ph10 180
2929 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2930     they use extra memory. */
2931 ph10 180
2932 ph10 178 if (-c == ESC_h)
2933     {
2934     SETBIT(classbits, 0x09); /* HT */
2935     SETBIT(classbits, 0x20); /* SPACE */
2936 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2937 ph10 178 #ifdef SUPPORT_UTF8
2938     if (utf8)
2939 ph10 180 {
2940 ph10 178 class_utf8 = TRUE;
2941     *class_utf8data++ = XCL_SINGLE;
2942 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2943 ph10 178 *class_utf8data++ = XCL_SINGLE;
2944 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2945     *class_utf8data++ = XCL_RANGE;
2946     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2947     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2948 ph10 178 *class_utf8data++ = XCL_SINGLE;
2949 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2950 ph10 178 *class_utf8data++ = XCL_SINGLE;
2951 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2952 ph10 178 *class_utf8data++ = XCL_SINGLE;
2953 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2954     }
2955     #endif
2956     continue;
2957     }
2958 nigel 93
2959 ph10 178 if (-c == ESC_H)
2960     {
2961     for (c = 0; c < 32; c++)
2962     {
2963     int x = 0xff;
2964     switch (c)
2965 ph10 180 {
2966 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2967     case 0x20/8: x ^= 1 << (0x20%8); break;
2968     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2969     default: break;
2970     }
2971     classbits[c] |= x;
2972 ph10 180 }
2973    
2974 ph10 178 #ifdef SUPPORT_UTF8
2975     if (utf8)
2976 ph10 180 {
2977 ph10 178 class_utf8 = TRUE;
2978 ph10 180 *class_utf8data++ = XCL_RANGE;
2979     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2980     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2981     *class_utf8data++ = XCL_RANGE;
2982     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2983     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2984     *class_utf8data++ = XCL_RANGE;
2985     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2986     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2987     *class_utf8data++ = XCL_RANGE;
2988     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2989     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2990     *class_utf8data++ = XCL_RANGE;
2991     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2992     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2993     *class_utf8data++ = XCL_RANGE;
2994     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2995     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2996     *class_utf8data++ = XCL_RANGE;
2997     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2998     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2999     }
3000     #endif
3001     continue;
3002     }
3003 ph10 178
3004     if (-c == ESC_v)
3005     {
3006     SETBIT(classbits, 0x0a); /* LF */
3007     SETBIT(classbits, 0x0b); /* VT */
3008 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3009     SETBIT(classbits, 0x0d); /* CR */
3010     SETBIT(classbits, 0x85); /* NEL */
3011 ph10 178 #ifdef SUPPORT_UTF8
3012     if (utf8)
3013 ph10 180 {
3014 ph10 178 class_utf8 = TRUE;
3015 ph10 180 *class_utf8data++ = XCL_RANGE;
3016     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3017     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3018     }
3019     #endif
3020     continue;
3021     }
3022 ph10 178
3023     if (-c == ESC_V)
3024     {
3025     for (c = 0; c < 32; c++)
3026     {
3027     int x = 0xff;
3028     switch (c)
3029 ph10 180 {
3030 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3031     x ^= 1 << (0x0b%8);
3032     x ^= 1 << (0x0c%8);
3033 ph10 180 x ^= 1 << (0x0d%8);
3034 ph10 178 break;
3035     case 0x85/8: x ^= 1 << (0x85%8); break;
3036     default: break;
3037     }
3038     classbits[c] |= x;
3039 ph10 180 }
3040    
3041 ph10 178 #ifdef SUPPORT_UTF8
3042     if (utf8)
3043 ph10 180 {
3044 ph10 178 class_utf8 = TRUE;
3045 ph10 180 *class_utf8data++ = XCL_RANGE;
3046     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3047     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3048     *class_utf8data++ = XCL_RANGE;
3049     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3050     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3051     }
3052     #endif
3053     continue;
3054     }
3055 ph10 178
3056 nigel 93 /* We need to deal with \P and \p in both phases. */
3057    
3058 nigel 77 #ifdef SUPPORT_UCP
3059 nigel 93 if (-c == ESC_p || -c == ESC_P)
3060     {
3061     BOOL negated;
3062     int pdata;
3063     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3064     if (ptype < 0) goto FAILED;
3065     class_utf8 = TRUE;
3066     *class_utf8data++ = ((-c == ESC_p) != negated)?
3067     XCL_PROP : XCL_NOTPROP;
3068     *class_utf8data++ = ptype;
3069     *class_utf8data++ = pdata;
3070     class_charcount -= 2; /* Not a < 256 character */
3071 nigel 77 continue;
3072 nigel 93 }
3073 nigel 77 #endif
3074 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3075     strict mode. By default, for compatibility with Perl, they are
3076     treated as literals. */
3077 nigel 77
3078 nigel 93 if ((options & PCRE_EXTRA) != 0)
3079     {
3080     *errorcodeptr = ERR7;
3081     goto FAILED;
3082     }
3083 nigel 77
3084 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3085     c = *ptr; /* Get the final character and fall through */
3086 nigel 77 }
3087    
3088     /* Fall through if we have a single character (c >= 0). This may be
3089 nigel 93 greater than 256 in UTF-8 mode. */
3090 nigel 77
3091     } /* End of backslash handling */
3092    
3093     /* A single character may be followed by '-' to form a range. However,
3094     Perl does not permit ']' to be the end of the range. A '-' character
3095 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3096     entirely. The code for handling \Q and \E is messy. */
3097 nigel 77
3098 nigel 93 CHECK_RANGE:
3099     while (ptr[1] == '\\' && ptr[2] == 'E')
3100 nigel 77 {
3101 nigel 93 inescq = FALSE;
3102     ptr += 2;
3103     }
3104    
3105     oldptr = ptr;
3106 ph10 231
3107 ph10 230 /* Remember \r or \n */
3108 ph10 231
3109     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3110    
3111 ph10 230 /* Check for range */
3112 nigel 93
3113     if (!inescq && ptr[1] == '-')
3114     {
3115 nigel 77 int d;
3116     ptr += 2;
3117 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3118 nigel 77
3119 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3120     mode. */
3121    
3122     while (*ptr == '\\' && ptr[1] == 'Q')
3123     {
3124     ptr += 2;
3125     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3126     inescq = TRUE;
3127     break;
3128     }
3129    
3130     if (*ptr == 0 || (!inescq && *ptr == ']'))
3131     {
3132     ptr = oldptr;
3133     goto LONE_SINGLE_CHARACTER;
3134     }
3135    
3136 nigel 77 #ifdef SUPPORT_UTF8
3137     if (utf8)
3138     { /* Braces are required because the */
3139     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3140     }
3141     else
3142     #endif
3143     d = *ptr; /* Not UTF-8 mode */
3144    
3145     /* The second part of a range can be a single-character escape, but
3146     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3147     in such circumstances. */
3148    
3149 nigel 93 if (!inescq && d == '\\')
3150 nigel 77 {
3151 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3152     if (*errorcodeptr != 0) goto FAILED;
3153 nigel 77
3154 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3155 nigel 93 special means the '-' was literal */
3156 nigel 77
3157     if (d < 0)
3158     {
3159     if (d == -ESC_b) d = '\b';
3160 nigel 93 else if (d == -ESC_X) d = 'X';
3161     else if (d == -ESC_R) d = 'R'; else
3162 nigel 77 {
3163 nigel 93 ptr = oldptr;
3164 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3165     }
3166     }
3167     }
3168    
3169 nigel 93 /* Check that the two values are in the correct order. Optimize
3170     one-character ranges */
3171 nigel 77
3172 nigel 93 if (d < c)
3173     {
3174     *errorcodeptr = ERR8;
3175     goto FAILED;
3176     }
3177    
3178 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3179    
3180 ph10 230 /* Remember \r or \n */
3181 ph10 231
3182     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3183    
3184 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3185     matching, we have to use an XCLASS with extra data items. Caseless
3186     matching for characters > 127 is available only if UCP support is
3187     available. */
3188    
3189     #ifdef SUPPORT_UTF8
3190     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3191     {
3192     class_utf8 = TRUE;
3193    
3194     /* With UCP support, we can find the other case equivalents of
3195     the relevant characters. There may be several ranges. Optimize how
3196     they fit with the basic range. */
3197    
3198     #ifdef SUPPORT_UCP
3199     if ((options & PCRE_CASELESS) != 0)
3200     {
3201 nigel 93 unsigned int occ, ocd;
3202     unsigned int cc = c;
3203     unsigned int origd = d;
3204 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3205     {
3206 ph10 180 if (occ >= (unsigned int)c &&
3207     ocd <= (unsigned int)d)
3208 ph10 176 continue; /* Skip embedded ranges */
3209 nigel 77
3210 ph10 180 if (occ < (unsigned int)c &&
3211 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3212 nigel 77 { /* if there is overlap, */
3213     c = occ; /* noting that if occ < c */
3214     continue; /* we can't have ocd > d */
3215     } /* because a subrange is */
3216 ph10 180 if (ocd > (unsigned int)d &&
3217 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3218 nigel 77 { /* the basic range. */
3219     d = ocd;
3220     continue;
3221     }
3222    
3223     if (occ == ocd)
3224     {
3225     *class_utf8data++ = XCL_SINGLE;
3226     }
3227     else
3228     {
3229     *class_utf8data++ = XCL_RANGE;
3230     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3231     }
3232     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3233     }
3234     }
3235     #endif /* SUPPORT_UCP */
3236    
3237     /* Now record the original range, possibly modified for UCP caseless
3238     overlapping ranges. */
3239    
3240     *class_utf8data++ = XCL_RANGE;
3241     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3242     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3243    
3244     /* With UCP support, we are done. Without UCP support, there is no
3245     caseless matching for UTF-8 characters > 127; we can use the bit map
3246     for the smaller ones. */
3247    
3248     #ifdef SUPPORT_UCP
3249     continue; /* With next character in the class */
3250     #else
3251     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3252    
3253     /* Adjust upper limit and fall through to set up the map */
3254    
3255     d = 127;
3256    
3257     #endif /* SUPPORT_UCP */
3258     }
3259     #endif /* SUPPORT_UTF8 */
3260    
3261     /* We use the bit map for all cases when not in UTF-8 mode; else
3262     ranges that lie entirely within 0-127 when there is UCP support; else
3263     for partial ranges without UCP support. */
3264    
3265 nigel 93 class_charcount += d - c + 1;
3266     class_lastchar = d;
3267    
3268     /* We can save a bit of time by skipping this in the pre-compile. */
3269    
3270     if (lengthptr == NULL) for (; c <= d; c++)
3271 nigel 77 {
3272     classbits[c/8] |= (1 << (c&7));
3273     if ((options & PCRE_CASELESS) != 0)
3274     {
3275     int uc = cd->fcc[c]; /* flip case */
3276     classbits[uc/8] |= (1 << (uc&7));
3277     }
3278     }
3279    
3280     continue; /* Go get the next char in the class */
3281     }
3282    
3283     /* Handle a lone single character - we can get here for a normal
3284     non-escape char, or after \ that introduces a single character or for an
3285     apparent range that isn't. */
3286    
3287     LONE_SINGLE_CHARACTER:
3288 ph10 231
3289 nigel 77 /* Handle a character that cannot go in the bit map */
3290    
3291     #ifdef SUPPORT_UTF8
3292     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3293     {
3294     class_utf8 = TRUE;
3295     *class_utf8data++ = XCL_SINGLE;
3296     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3297    
3298     #ifdef SUPPORT_UCP
3299     if ((options & PCRE_CASELESS) != 0)
3300     {
3301 nigel 93 unsigned int othercase;
3302     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3303 nigel 77 {
3304     *class_utf8data++ = XCL_SINGLE;
3305     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3306     }
3307     }
3308     #endif /* SUPPORT_UCP */
3309    
3310     }
3311     else
3312     #endif /* SUPPORT_UTF8 */
3313    
3314     /* Handle a single-byte character */
3315     {
3316     classbits[c/8] |= (1 << (c&7));
3317     if ((options & PCRE_CASELESS) != 0)
3318     {
3319     c = cd->fcc[c]; /* flip case */
3320     classbits[c/8] |= (1 << (c&7));
3321     }
3322     class_charcount++;
3323     class_lastchar = c;
3324     }
3325     }
3326    
3327 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3328 nigel 77
3329 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3330 nigel 77
3331 nigel 93 if (c == 0) /* Missing terminating ']' */
3332     {
3333     *errorcodeptr = ERR6;
3334     goto FAILED;
3335     }
3336 ph10 231
3337    
3338 ph10 230 /* This code has been disabled because it would mean that \s counts as
3339     an explicit \r or \n reference, and that's not really what is wanted. Now
3340     we set the flag only if there is a literal "\r" or "\n" in the class. */
3341 ph10 227
3342 ph10 230 #if 0
3343 ph10 226 /* Remember whether \r or \n are in this class */
3344 ph10 227
3345 ph10 226 if (negate_class)
3346     {
3347 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3348 ph10 226 }
3349     else
3350     {
3351 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3352 ph10 227 }
3353 ph10 230 #endif
3354 ph10 227
3355 ph10 231
3356 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3357 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3358     use of \p or \P, in other words, no use of any XCLASS features, we can
3359     optimize.
3360    
3361 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3362     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3363     operate on single-bytes only. This is an historical hangover. Maybe one day
3364     we can tidy these opcodes to handle multi-byte characters.
3365 nigel 77
3366     The optimization throws away the bit map. We turn the item into a
3367     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3368     that OP_NOT does not support multibyte characters. In the positive case, it
3369     can cause firstbyte to be set. Otherwise, there can be no first char if
3370     this item is first, whatever repeat count may follow. In the case of
3371     reqbyte, save the previous value for reinstating. */
3372    
3373     #ifdef SUPPORT_UTF8
3374 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3375 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3376 nigel 77 #else
3377     if (class_charcount == 1)
3378     #endif
3379     {
3380     zeroreqbyte = reqbyte;
3381    
3382     /* The OP_NOT opcode works on one-byte characters only. */
3383    
3384     if (negate_class)
3385     {
3386     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3387     zerofirstbyte = firstbyte;
3388     *code++ = OP_NOT;
3389     *code++ = class_lastchar;
3390     break;
3391     }
3392    
3393     /* For a single, positive character, get the value into mcbuffer, and
3394     then we can handle this with the normal one-character code. */
3395    
3396     #ifdef SUPPORT_UTF8
3397     if (utf8 && class_lastchar > 127)
3398     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3399     else
3400     #endif
3401     {
3402     mcbuffer[0] = class_lastchar;
3403     mclength = 1;
3404     }
3405     goto ONE_CHAR;
3406     } /* End of 1-char optimization */
3407    
3408     /* The general case - not the one-char optimization. If this is the first
3409     thing in the branch, there can be no first char setting, whatever the
3410     repeat count. Any reqbyte setting must remain unchanged after any kind of
3411     repeat. */
3412    
3413     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3414     zerofirstbyte = firstbyte;
3415     zeroreqbyte = reqbyte;
3416    
3417     /* If there are characters with values > 255, we have to compile an
3418 ph10 286 extended class, with its own opcode, unless there was a negated special
3419     such as \S in the class, because in that case all characters > 255 are in
3420     the class, so any that were explicitly given as well can be ignored. If
3421 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3422     characters < 256, we can omit the bitmap in the actual compiled code. */
3423 nigel 77
3424     #ifdef SUPPORT_UTF8
3425 ph10 264 if (class_utf8 && !should_flip_negation)
3426 nigel 77 {
3427     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3428     *code++ = OP_XCLASS;
3429     code += LINK_SIZE;
3430     *code = negate_class? XCL_NOT : 0;
3431    
3432 nigel 93 /* If the map is required, move up the extra data to make room for it;
3433     otherwise just move the code pointer to the end of the extra data. */
3434 nigel 77
3435     if (class_charcount > 0)
3436     {
3437     *code++ |= XCL_MAP;
3438 nigel 93 memmove(code + 32, code, class_utf8data - code);
3439 nigel 77 memcpy(code, classbits, 32);
3440 nigel 93 code = class_utf8data + 32;
3441 nigel 77 }
3442 nigel 93 else code = class_utf8data;
3443 nigel 77
3444     /* Now fill in the complete length of the item */
3445    
3446     PUT(previous, 1, code - previous);
3447     break; /* End of class handling */
3448     }
3449     #endif
3450    
3451 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3452     OP_NCLASS, depending on whether the whole class was negated and whether
3453     there were negative specials such as \S in the class. Then copy the 32-byte
3454 ph10 264 map into the code vector, negating it if necessary. */
3455 ph10 286
3456 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3457 nigel 77 if (negate_class)
3458     {
3459 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3460     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3461 nigel 77 }
3462     else
3463     {
3464     memcpy(code, classbits, 32);
3465     }
3466     code += 32;
3467     break;
3468    
3469 nigel 93
3470     /* ===================================================================*/
3471 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3472     has been tested above. */
3473    
3474     case '{':
3475     if (!is_quantifier) goto NORMAL_CHAR;
3476     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3477     if (*errorcodeptr != 0) goto FAILED;
3478     goto REPEAT;
3479    
3480     case '*':
3481     repeat_min = 0;
3482     repeat_max = -1;
3483     goto REPEAT;
3484    
3485     case '+':
3486     repeat_min = 1;
3487     repeat_max = -1;
3488     goto REPEAT;
3489    
3490     case '?':
3491     repeat_min = 0;
3492     repeat_max = 1;
3493    
3494     REPEAT:
3495     if (previous == NULL)
3496     {
3497     *errorcodeptr = ERR9;
3498     goto FAILED;
3499     }
3500    
3501     if (repeat_min == 0)
3502     {
3503     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3504     reqbyte = zeroreqbyte; /* Ditto */
3505     }
3506    
3507     /* Remember whether this is a variable length repeat */
3508    
3509     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3510    
3511     op_type = 0; /* Default single-char op codes */
3512     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3513    
3514     /* Save start of previous item, in case we have to move it up to make space
3515     for an inserted OP_ONCE for the additional '+' extension. */
3516    
3517     tempcode = previous;
3518    
3519     /* If the next character is '+', we have a possessive quantifier. This
3520     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3521     If the next character is '?' this is a minimizing repeat, by default,
3522     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3523     repeat type to the non-default. */
3524    
3525     if (ptr[1] == '+')
3526     {
3527     repeat_type = 0; /* Force greedy */
3528     possessive_quantifier = TRUE;
3529     ptr++;
3530     }
3531     else if (ptr[1] == '?')
3532     {
3533     repeat_type = greedy_non_default;
3534     ptr++;
3535     }
3536     else repeat_type = greedy_default;
3537    
3538     /* If previous was a character match, abolish the item and generate a
3539     repeat item instead. If a char item has a minimum of more than one, ensure
3540     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3541     the first thing in a branch because the x will have gone into firstbyte
3542     instead. */
3543    
3544     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3545     {
3546     /* Deal with UTF-8 characters that take up more than one byte. It's
3547     easier to write this out separately than try to macrify it. Use c to
3548     hold the length of the character in bytes, plus 0x80 to flag that it's a
3549     length rather than a small character. */
3550    
3551     #ifdef SUPPORT_UTF8
3552     if (utf8 && (code[-1] & 0x80) != 0)
3553     {
3554     uschar *lastchar = code - 1;
3555     while((*lastchar & 0xc0) == 0x80) lastchar--;
3556     c = code - lastchar; /* Length of UTF-8 character */
3557     memcpy(utf8_char, lastchar, c); /* Save the char */
3558     c |= 0x80; /* Flag c as a length */
3559     }
3560     else
3561     #endif
3562    
3563     /* Handle the case of a single byte - either with no UTF8 support, or
3564     with UTF-8 disabled, or for a UTF-8 character < 128. */
3565    
3566     {
3567     c = code[-1];
3568     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3569     }
3570    
3571 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3572     the line is something that cannot possibly match this character. If so,
3573     automatically possessifying this item gains some performance in the case
3574     where the match fails. */
3575    
3576     if (!possessive_quantifier &&
3577     repeat_max < 0 &&
3578     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3579     options, cd))
3580     {
3581     repeat_type = 0; /* Force greedy */
3582     possessive_quantifier = TRUE;
3583     }
3584    
3585 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3586     }
3587    
3588     /* If previous was a single negated character ([^a] or similar), we use
3589     one of the special opcodes, replacing it. The code is shared with single-
3590     character repeats by setting op_type to add a suitable offset into
3591 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3592     currently used only for single-byte chars. */
3593 nigel 77
3594     else if (*previous == OP_NOT)
3595     {
3596     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3597     c = previous[1];
3598 nigel 93 if (!possessive_quantifier &&
3599     repeat_max < 0 &&
3600     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3601     {
3602     repeat_type = 0; /* Force greedy */
3603     possessive_quantifier = TRUE;
3604     }
3605 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3606     }
3607    
3608     /* If previous was a character type match (\d or similar), abolish it and
3609     create a suitable repeat item. The code is shared with single-character
3610     repeats by setting op_type to add a suitable offset into repeat_type. Note
3611     that the Unicode property types will be present only when SUPPORT_UCP is
3612     defined, but we don't wrap the little bits of code here because it just
3613     makes it horribly messy. */
3614    
3615     else if (*previous < OP_EODN)
3616     {
3617     uschar *oldcode;
3618 nigel 87 int prop_type, prop_value;
3619 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3620     c = *previous;
3621    
3622 nigel 93 if (!possessive_quantifier &&
3623     repeat_max < 0 &&
3624     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3625     {
3626     repeat_type = 0; /* Force greedy */
3627     possessive_quantifier = TRUE;
3628     }
3629    
3630 nigel 77 OUTPUT_SINGLE_REPEAT:
3631 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3632     {
3633     prop_type = previous[1];
3634     prop_value = previous[2];
3635     }
3636     else prop_type = prop_value = -1;
3637 nigel 77
3638     oldcode = code;
3639     code = previous; /* Usually overwrite previous item */
3640    
3641     /* If the maximum is zero then the minimum must also be zero; Perl allows
3642     this case, so we do too - by simply omitting the item altogether. */
3643    
3644     if (repeat_max == 0) goto END_REPEAT;
3645    
3646     /* All real repeats make it impossible to handle partial matching (maybe
3647     one day we will be able to remove this restriction). */
3648    
3649 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3650 nigel 77
3651     /* Combine the op_type with the repeat_type */
3652    
3653     repeat_type += op_type;
3654    
3655     /* A minimum of zero is handled either as the special case * or ?, or as
3656     an UPTO, with the maximum given. */
3657    
3658     if (repeat_min == 0)
3659     {
3660     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3661     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3662     else
3663     {
3664     *code++ = OP_UPTO + repeat_type;
3665     PUT2INC(code, 0, repeat_max);
3666     }
3667     }
3668    
3669     /* A repeat minimum of 1 is optimized into some special cases. If the
3670 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3671 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3672     one less than the maximum. */
3673    
3674     else if (repeat_min == 1)
3675     {
3676     if (repeat_max == -1)
3677     *code++ = OP_PLUS + repeat_type;
3678     else
3679     {
3680     code = oldcode; /* leave previous item in place */
3681     if (repeat_max == 1) goto END_REPEAT;
3682     *code++ = OP_UPTO + repeat_type;
3683     PUT2INC(code, 0, repeat_max - 1);
3684     }
3685     }
3686    
3687     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3688     handled as an EXACT followed by an UPTO. */
3689    
3690     else
3691     {
3692     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3693     PUT2INC(code, 0, repeat_min);
3694    
3695     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3696     we have to insert the character for the previous code. For a repeated
3697 nigel 87 Unicode property match, there are two extra bytes that define the
3698 nigel 77 required property. In UTF-8 mode, long characters have their length in
3699     c, with the 0x80 bit as a flag. */
3700    
3701     if (repeat_max < 0)
3702     {
3703     #ifdef SUPPORT_UTF8
3704     if (utf8 && c >= 128)
3705     {
3706     memcpy(code, utf8_char, c & 7);
3707     code += c & 7;
3708     }
3709     else
3710     #endif
3711     {
3712     *code++ = c;
3713 nigel 87 if (prop_type >= 0)
3714     {
3715     *code++ = prop_type;
3716     *code++ = prop_value;
3717     }
3718 nigel 77 }
3719     *code++ = OP_STAR + repeat_type;
3720     }
3721    
3722     /* Else insert an UPTO if the max is greater than the min, again
3723 nigel 93 preceded by the character, for the previously inserted code. If the
3724     UPTO is just for 1 instance, we can use QUERY instead. */
3725 nigel 77
3726     else if (repeat_max != repeat_min)
3727     {
3728     #ifdef SUPPORT_UTF8
3729     if (utf8 && c >= 128)
3730     {
3731     memcpy(code, utf8_char, c & 7);
3732     code += c & 7;
3733     }
3734     else
3735     #endif
3736     *code++ = c;
3737 nigel 87 if (prop_type >= 0)
3738     {
3739     *code++ = prop_type;
3740     *code++ = prop_value;
3741     }
3742 nigel 77 repeat_max -= repeat_min;
3743 nigel 93
3744     if (repeat_max == 1)
3745     {
3746     *code++ = OP_QUERY + repeat_type;
3747     }
3748     else
3749     {
3750     *code++ = OP_UPTO + repeat_type;
3751     PUT2INC(code, 0, repeat_max);
3752     }
3753 nigel 77 }
3754     }
3755    
3756     /* The character or character type itself comes last in all cases. */
3757    
3758     #ifdef SUPPORT_UTF8
3759     if (utf8 && c >= 128)
3760     {
3761     memcpy(code, utf8_char, c & 7);
3762     code += c & 7;
3763     }
3764     else
3765     #endif
3766     *code++ = c;
3767    
3768 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3769     define the required property. */
3770 nigel 77
3771     #ifdef SUPPORT_UCP
3772 nigel 87 if (prop_type >= 0)
3773     {
3774     *code++ = prop_type;
3775     *code++ = prop_value;
3776     }
3777 nigel 77 #endif
3778     }
3779    
3780     /* If previous was a character class or a back reference, we put the repeat
3781     stuff after it, but just skip the item if the repeat was {0,0}. */
3782    
3783     else if (*previous == OP_CLASS ||
3784     *previous == OP_NCLASS ||
3785     #ifdef SUPPORT_UTF8
3786     *previous == OP_XCLASS ||
3787     #endif
3788     *previous == OP_REF)
3789     {
3790     if (repeat_max == 0)
3791     {
3792     code = previous;
3793     goto END_REPEAT;
3794     }
3795    
3796     /* All real repeats make it impossible to handle partial matching (maybe
3797     one day we will be able to remove this restriction). */
3798    
3799 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3800 nigel 77
3801     if (repeat_min == 0 && repeat_max == -1)
3802     *code++ = OP_CRSTAR + repeat_type;
3803     else if (repeat_min == 1 && repeat_max == -1)
3804     *code++ = OP_CRPLUS + repeat_type;
3805     else if (repeat_min == 0 && repeat_max == 1)
3806     *code++ = OP_CRQUERY + repeat_type;
3807     else
3808     {
3809     *code++ = OP_CRRANGE + repeat_type;
3810     PUT2INC(code, 0, repeat_min);
3811     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3812     PUT2INC(code, 0, repeat_max);
3813     }
3814     }
3815    
3816     /* If previous was a bracket group, we may have to replicate it in certain
3817     cases. */
3818    
3819 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3820     *previous == OP_ONCE || *previous == OP_COND)
3821 nigel 77 {
3822     register int i;
3823     int ketoffset = 0;
3824     int len = code - previous;
3825     uschar *bralink = NULL;
3826    
3827 nigel 93 /* Repeating a DEFINE group is pointless */
3828    
3829     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3830     {
3831     *errorcodeptr = ERR55;
3832     goto FAILED;
3833     }
3834    
3835 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3836     by scanning through from the start, and compute the offset back to it
3837     from the current code pointer. There may be an OP_OPT setting following
3838     the final KET, so we can't find the end just by going back from the code
3839     pointer. */
3840    
3841     if (repeat_max == -1)
3842     {
3843     register uschar *ket = previous;
3844     do ket += GET(ket, 1); while (*ket != OP_KET);
3845     ketoffset = code - ket;
3846     }
3847    
3848     /* The case of a zero minimum is special because of the need to stick
3849     OP_BRAZERO in front of it, and because the group appears once in the
3850     data, whereas in other cases it appears the minimum number of times. For
3851     this reason, it is simplest to treat this case separately, as otherwise
3852     the code gets far too messy. There are several special subcases when the
3853     minimum is zero. */
3854    
3855     if (repeat_min == 0)
3856     {
3857 ph10 335 /* If the maximum is also zero, we used to just omit the group from the
3858     output altogether, like this:
3859 nigel 77
3860 ph10 335 ** if (repeat_max == 0)
3861     ** {
3862     ** code = previous;
3863     ** goto END_REPEAT;
3864     ** }
3865    
3866     However, that fails when a group is referenced as a subroutine from
3867     elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3868     so that it is skipped on execution. As we don't have a list of which
3869     groups are referenced, we cannot do this selectively.
3870 nigel 77
3871 ph10 335 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3872     and do no more at this point. However, we do need to adjust any
3873     OP_RECURSE calls inside the group that refer to the group itself or any
3874     internal or forward referenced group, because the offset is from the
3875     start of the whole regex. Temporarily terminate the pattern while doing
3876     this. */
3877 nigel 77
3878 ph10 335 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3879 nigel 77 {
3880     *code = OP_END;
3881 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3882 nigel 77 memmove(previous+1, previous, len);
3883     code++;
3884 ph10 335 if (repeat_max == 0)
3885     {
3886     *previous++ = OP_SKIPZERO;
3887     goto END_REPEAT;
3888     }
3889 nigel 77 *previous++ = OP_BRAZERO + repeat_type;
3890     }
3891    
3892     /* If the maximum is greater than 1 and limited, we have to replicate
3893     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3894     The first one has to be handled carefully because it's the original
3895     copy, which has to be moved up. The remainder can be handled by code
3896     that is common with the non-zero minimum case below. We have to
3897     adjust the value of repeat_max, since one less copy is required. Once
3898     again, we may have to adjust any OP_RECURSE calls inside the group. */
3899    
3900     else
3901     {
3902     int offset;
3903     *code = OP_END;
3904 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3905 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3906     code += 2 + LINK_SIZE;
3907     *previous++ = OP_BRAZERO + repeat_type;
3908     *previous++ = OP_BRA;
3909    
3910     /* We chain together the bracket offset fields that have to be
3911     filled in later when the ends of the brackets are reached. */
3912    
3913     offset = (bralink == NULL)? 0 : previous - bralink;
3914     bralink = previous;
3915     PUTINC(previous, 0, offset);
3916     }
3917    
3918     repeat_max--;
3919     }
3920    
3921     /* If the minimum is greater than zero, replicate the group as many
3922     times as necessary, and adjust the maximum to the number of subsequent
3923     copies that we need. If we set a first char from the group, and didn't
3924 nigel 93 set a required char, copy the latter from the former. If there are any
3925     forward reference subroutine calls in the group, there will be entries on
3926     the workspace list; replicate these with an appropriate increment. */
3927 nigel 77
3928     else
3929     {
3930     if (repeat_min > 1)
3931     {
3932 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3933 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3934     potential integer overflow. */
3935 nigel 93
3936     if (lengthptr != NULL)
3937 ph10 202 {
3938     int delta = (repeat_min - 1)*length_prevgroup;
3939     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3940     (double)INT_MAX ||
3941     OFLOW_MAX - *lengthptr < delta)
3942     {
3943     *errorcodeptr = ERR20;
3944     goto FAILED;
3945     }
3946     *lengthptr += delta;
3947     }
3948 nigel 93
3949     /* This is compiling for real */
3950    
3951     else
3952 nigel 77 {
3953 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3954     for (i = 1; i < repeat_min; i++)
3955     {
3956     uschar *hc;
3957     uschar *this_hwm = cd->hwm;
3958     memcpy(code, previous, len);
3959     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3960     {
3961     PUT(cd->hwm, 0, GET(hc, 0) + len);
3962     cd->hwm += LINK_SIZE;
3963     }
3964     save_hwm = this_hwm;
3965     code += len;
3966     }
3967 nigel 77 }
3968     }
3969 nigel 93
3970 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3971     }
3972    
3973     /* This code is common to both the zero and non-zero minimum cases. If
3974     the maximum is limited, it replicates the group in a nested fashion,
3975     remembering the bracket starts on a stack. In the case of a zero minimum,
3976     the first one was set up above. In all cases the repeat_max now specifies
3977 nigel 93 the number of additional copies needed. Again, we must remember to
3978     replicate entries on the forward reference list. */
3979 nigel 77
3980     if (repeat_max >= 0)
3981     {
3982 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3983     just adjust the length as if we had. For each repetition we must add 1
3984     to the length for BRAZERO and for all but the last repetition we must
3985 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3986     paranoid checks to avoid integer overflow. */
3987 nigel 93
3988     if (lengthptr != NULL && repeat_max > 0)
3989 ph10 202 {
3990     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3991     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3992     if ((double)repeat_max *
3993     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3994     > (double)INT_MAX ||
3995     OFLOW_MAX - *lengthptr < delta)
3996     {
3997     *errorcodeptr = ERR20;
3998     goto FAILED;
3999     }
4000     *lengthptr += delta;
4001     }
4002 nigel 93
4003     /* This is compiling for real */
4004    
4005     else for (i = repeat_max - 1; i >= 0; i--)
4006 nigel 77 {
4007 nigel 93 uschar *hc;
4008     uschar *this_hwm = cd->hwm;
4009    
4010 nigel 77 *code++ = OP_BRAZERO + repeat_type;
4011    
4012     /* All but the final copy start a new nesting, maintaining the
4013     chain of brackets outstanding. */
4014    
4015     if (i != 0)
4016     {
4017     int offset;
4018     *code++ = OP_BRA;
4019     offset = (bralink == NULL)? 0 : code - bralink;
4020     bralink = code;
4021     PUTINC(code, 0, offset);
4022     }
4023    
4024     memcpy(code, previous, len);
4025 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4026     {
4027     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4028     cd->hwm += LINK_SIZE;
4029     }
4030     save_hwm = this_hwm;
4031 nigel 77 code += len;
4032     }
4033    
4034     /* Now chain through the pending brackets, and fill in their length
4035     fields (which are holding the chain links pro tem). */
4036    
4037     while (bralink != NULL)
4038     {
4039     int oldlinkoffset;
4040     int offset = code - bralink + 1;
4041     uschar *bra = code - offset;
4042     oldlinkoffset = GET(bra, 1);
4043     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4044     *code++ = OP_KET;
4045     PUTINC(code, 0, offset);
4046     PUT(bra, 1, offset);
4047     }
4048     }
4049    
4050     /* If the maximum is unlimited, set a repeater in the final copy. We
4051     can't just offset backwards from the current code point, because we
4052     don't know if there's been an options resetting after the ket. The
4053 nigel 93 correct offset was computed above.
4054 nigel 77
4055 nigel 93 Then, when we are doing the actual compile phase, check to see whether
4056     this group is a non-atomic one that could match an empty string. If so,
4057     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4058     that runtime checking can be done. [This check is also applied to
4059     atomic groups at runtime, but in a different way.] */
4060    
4061     else
4062     {
4063     uschar *ketcode = code - ketoffset;
4064     uschar *bracode = ketcode - GET(ketcode, 1);
4065     *ketcode = OP_KETRMAX + repeat_type;
4066     if (lengthptr == NULL && *bracode != OP_ONCE)
4067     {
4068     uschar *scode = bracode;
4069     do
4070     {
4071     if (could_be_empty_branch(scode, ketcode, utf8))
4072     {
4073     *bracode += OP_SBRA - OP_BRA;
4074     break;
4075     }
4076     scode += GET(scode, 1);
4077     }
4078     while (*scode == OP_ALT);
4079     }
4080     }
4081 nigel 77 }
4082    
4083     /* Else there's some kind of shambles */
4084    
4085     else
4086     {
4087     *errorcodeptr = ERR11;
4088     goto FAILED;
4089     }
4090    
4091 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4092     tests above succeeded, possessive_quantifier is TRUE. For some of the
4093     simpler opcodes, there is a special alternative opcode for this. For
4094     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4095     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4096     but the special opcodes can optimize it a bit. The repeated item starts at
4097     tempcode, not at previous, which might be the first part of a string whose
4098     (former) last char we repeated.
4099 nigel 77
4100 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4101     an 'upto' may follow. We skip over an 'exact' item, and then test the
4102     length of what remains before proceeding. */
4103    
4104 nigel 77 if (possessive_quantifier)
4105     {
4106 nigel 93 int len;
4107     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4108     *tempcode == OP_NOTEXACT)
4109 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4110 ph10 286 ((*tempcode == OP_TYPEEXACT &&
4111     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4112 nigel 93 len = code - tempcode;
4113     if (len > 0) switch (*tempcode)
4114     {
4115     case OP_STAR: *tempcode = OP_POSSTAR; break;
4116     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4117     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4118     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4119    
4120     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;