/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 295 - (hide annotations) (download)
Mon Dec 31 17:00:24 2007 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 196656 byte(s)
Make POSIX character class parsing more like Perl.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments such as
SETBIT(map, base + n) index and shift by the intended value, and so that the
macro can be used safely inside a larger expression. Note that "b" is
evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification: XSTRING(x) expands the macro x first and then
turns the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced name or an optionally braced non-zero number\0"
  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
  "(*VERB) with an argument is not supported\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+";
306 nigel 77
307    
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  space - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  space - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502     in a table. A non-zero result is something that can be returned immediately.
503 nigel 77 Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
535     is an absolute backreference. If negative, it is a relative backreference.
536 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537     reference to a named group. This is part of Perl's movement towards a
538     unified syntax for back references. As this is synonymous with \k{name}, we
539 ph10 171 fudge it up by pretending it really was \k. */
540 nigel 93
541     case 'g':
542     if (ptr[1] == '{')
543     {
544 ph10 171 const uschar *p;
545     for (p = ptr+2; *p != 0 && *p != '}'; p++)
546     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 ph10 172 if (*p != 0 && *p != '}')
548 ph10 171 {
549     c = -ESC_k;
550     break;
551 ph10 172 }
552 nigel 93 braced = TRUE;
553     ptr++;
554     }
555     else braced = FALSE;
556    
557     if (ptr[1] == '-')
558     {
559     negated = TRUE;
560     ptr++;
561     }
562     else negated = FALSE;
563    
564     c = 0;
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567 ph10 220
568 ph10 213 if (c < 0)
569     {
570     *errorcodeptr = ERR61;
571     break;
572 ph10 220 }
573 nigel 93
574     if (c == 0 || (braced && *(++ptr) != '}'))
575     {
576     *errorcodeptr = ERR57;
577 ph10 213 break;
578 nigel 93 }
579    
580     if (negated)
581     {
582     if (c > bracount)
583     {
584     *errorcodeptr = ERR15;
585 ph10 213 break;
586 nigel 93 }
587     c = bracount - (c - 1);
588     }
589    
590     c = -(ESC_REF + c);
591     break;
592    
593 nigel 77 /* The handling of escape sequences consisting of a string of digits
594     starting with one that is not zero is not straightforward. By experiment,
595     the way Perl works seems to be as follows:
596    
597     Outside a character class, the digits are read as a decimal number. If the
598     number is less than 10, or if there are that many previous extracting
599     left brackets, then it is a back reference. Otherwise, up to three octal
600     digits are read to form an escaped byte. Thus \123 is likely to be octal
601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602     value is greater than 377, the least significant 8 bits are taken. Inside a
603     character class, \ followed by a digit is always an octal number. */
604    
605     case '1': case '2': case '3': case '4': case '5':
606     case '6': case '7': case '8': case '9':
607    
608     if (!isclass)
609     {
610     oldptr = ptr;
611     c -= '0';
612     while ((digitab[ptr[1]] & ctype_digit) != 0)
613     c = c * 10 + *(++ptr) - '0';
614 ph10 213 if (c < 0)
615     {
616     *errorcodeptr = ERR61;
617 ph10 220 break;
618     }
619 nigel 77 if (c < 10 || c <= bracount)
620     {
621     c = -(ESC_REF + c);
622     break;
623     }
624     ptr = oldptr; /* Put the pointer back and fall through */
625     }
626    
627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628     generates a binary zero byte and treats the digit as a following literal.
629     Thus we have to pull back the pointer by one. */
630    
631     if ((c = *ptr) >= '8')
632     {
633     ptr--;
634     c = 0;
635     break;
636     }
637    
638     /* \0 always starts an octal number, but we may drop through to here with a
639 nigel 91 larger first octal digit. The original code used just to take the least
640     significant 8 bits of octal numbers (I think this is what early Perls used
641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642     than 3 octal digits. */
643 nigel 77
644     case '0':
645     c -= '0';
646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647     c = c * 8 + *(++ptr) - '0';
648 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 nigel 77 break;
650    
651 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653     treated as a data character. */
654 nigel 77
655     case 'x':
656 nigel 87 if (ptr[1] == '{')
657 nigel 77 {
658     const uschar *pt = ptr + 2;
659 nigel 87 int count = 0;
660    
661 nigel 77 c = 0;
662     while ((digitab[*pt] & ctype_xdigit) != 0)
663     {
664 nigel 87 register int cc = *pt++;
665     if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 nigel 77 count++;
667 nigel 87
668 ph10 97 #ifndef EBCDIC /* ASCII coding */
669 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 ph10 97 #else /* EBCDIC coding */
672 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 nigel 77 #endif
675     }
676 nigel 87
677 nigel 77 if (*pt == '}')
678     {
679 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 nigel 77 ptr = pt;
681     break;
682     }
683 nigel 87
684 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
685     recognize this construct; fall through to the normal \x handling. */
686     }
687    
688 nigel 87 /* Read just a single-byte hex-defined char */
689 nigel 77
690     c = 0;
691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692     {
693     int cc; /* Some compilers don't like ++ */
694     cc = *(++ptr); /* in initializers */
695 ph10 97 #ifndef EBCDIC /* ASCII coding */
696 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 ph10 97 #else /* EBCDIC coding */
699 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
700     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701     #endif
702     }
703     break;
704    
705 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706     This coding is ASCII-specific, but then the whole concept of \cx is
707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708 nigel 77
709     case 'c':
710     c = *(++ptr);
711     if (c == 0)
712     {
713     *errorcodeptr = ERR2;
714 ph10 213 break;
715 nigel 77 }
716    
717 ph10 97 #ifndef EBCDIC /* ASCII coding */
718 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
719     c ^= 0x40;
720 ph10 97 #else /* EBCDIC coding */
721 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
722     c ^= 0xC0;
723     #endif
724     break;
725    
726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728     otherwise, for Perl compatibility, it is a literal. This code looks a bit
729     odd, but there used to be some cases other than the default, and there may
730     be again in future, so I haven't "optimized" it. */
731 nigel 77
732     default:
733     if ((options & PCRE_EXTRA) != 0) switch(c)
734     {
735     default:
736     *errorcodeptr = ERR3;
737     break;
738     }
739     break;
740     }
741     }
742    
743     *ptrptr = ptr;
744     return c;
745     }
746    
747    
748    
749     #ifdef SUPPORT_UCP
750     /*************************************************
751     * Handle \P and \p *
752     *************************************************/
753    
754     /* This function is called after \P or \p has been encountered, provided that
755     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756     pointing at the P or p. On exit, it is pointing at the final character of the
757     escape sequence.
758    
759     Argument:
760     ptrptr points to the pattern position pointer
761     negptr points to a boolean that is set TRUE for negation else FALSE
762 nigel 87 dptr points to an int that is set to the detailed property value
763 nigel 77 errorcodeptr points to the error code variable
764    
765 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
766 nigel 77 */
767    
768     static int
769 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
770 nigel 77 {
771     int c, i, bot, top;
772     const uschar *ptr = *ptrptr;
773 nigel 87 char name[32];
774 nigel 77
775     c = *(++ptr);
776     if (c == 0) goto ERROR_RETURN;
777    
778     *negptr = FALSE;
779    
780 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
781     negation. */
782 nigel 77
783     if (c == '{')
784     {
785     if (ptr[1] == '^')
786     {
787     *negptr = TRUE;
788     ptr++;
789     }
790 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
791 nigel 77 {
792     c = *(++ptr);
793     if (c == 0) goto ERROR_RETURN;
794     if (c == '}') break;
795     name[i] = c;
796     }
797 nigel 87 if (c !='}') goto ERROR_RETURN;
798 nigel 77 name[i] = 0;
799     }
800    
801     /* Otherwise there is just one following character */
802    
803     else
804     {
805     name[0] = c;
806     name[1] = 0;
807     }
808    
809     *ptrptr = ptr;
810    
811     /* Search for a recognized property name using binary chop */
812    
813     bot = 0;
814     top = _pcre_utt_size;
815    
816     while (bot < top)
817     {
818 nigel 87 i = (bot + top) >> 1;
819 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
820 nigel 87 if (c == 0)
821     {
822     *dptr = _pcre_utt[i].value;
823     return _pcre_utt[i].type;
824     }
825 nigel 77 if (c > 0) bot = i + 1; else top = i;
826     }
827    
828     *errorcodeptr = ERR47;
829     *ptrptr = ptr;
830     return -1;
831    
832     ERROR_RETURN:
833     *errorcodeptr = ERR46;
834     *ptrptr = ptr;
835     return -1;
836     }
837     #endif
838    
839    
840    
841    
842     /*************************************************
843     * Check for counted repeat *
844     *************************************************/
845    
846     /* This function is called when a '{' is encountered in a place where it might
847     start a quantifier. It looks ahead to see if it really is a quantifier or not.
848     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849     where the ddds are digits.
850    
851     Arguments:
852     p pointer to the first char after '{'
853    
854     Returns: TRUE or FALSE
855     */
856    
857     static BOOL
858     is_counted_repeat(const uschar *p)
859     {
860     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861     while ((digitab[*p] & ctype_digit) != 0) p++;
862     if (*p == '}') return TRUE;
863    
864     if (*p++ != ',') return FALSE;
865     if (*p == '}') return TRUE;
866    
867     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868     while ((digitab[*p] & ctype_digit) != 0) p++;
869    
870     return (*p == '}');
871     }
872    
873    
874    
875     /*************************************************
876     * Read repeat counts *
877     *************************************************/
878    
879     /* Read an item of the form {n,m} and return the values. This is called only
880     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881     so the syntax is guaranteed to be correct, but we need to check the values.
882    
883     Arguments:
884     p pointer to first char after '{'
885     minp pointer to int for min
886     maxp pointer to int for max
887     returned as -1 if no max
888     errorcodeptr points to error code variable
889    
890     Returns: pointer to '}' on success;
891     current ptr on error, with errorcodeptr set non-zero
892     */
893    
894     static const uschar *
895     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896     {
897     int min = 0;
898     int max = -1;
899    
900 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
901     an integer overflow. */
902    
903 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 nigel 81 if (min < 0 || min > 65535)
905     {
906     *errorcodeptr = ERR5;
907     return p;
908     }
909 nigel 77
910 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
911     Also, max must not be less than min. */
912    
913 nigel 77 if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 81 if (max < 0 || max > 65535)
920     {
921     *errorcodeptr = ERR5;
922     return p;
923     }
924 nigel 77 if (max < min)
925     {
926     *errorcodeptr = ERR4;
927     return p;
928     }
929     }
930     }
931    
932 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
933     '}'. */
934 nigel 77
935 nigel 81 *minp = min;
936     *maxp = max;
937 nigel 77 return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 93 * Find forward referenced subpattern *
944 nigel 91 *************************************************/
945    
946 nigel 93 /* This function scans along a pattern's text looking for capturing
947     subpatterns, and counting them. If it finds a named pattern that matches the
948     name it is given, it returns its number. Alternatively, if the name is NULL, it
949     returns when it reaches a given numbered subpattern. This is used for forward
950     references to subpatterns. We know that if (?P< is encountered, the name will
951     be terminated by '>' because that is checked in the first pass.
952 nigel 91
953     Arguments:
954 nigel 93 ptr current position in the pattern
955     count current count of capturing parens so far encountered
956     name name to seek, or NULL if seeking a numbered subpattern
957     lorn name length, or subpattern number if name is NULL
958     xmode TRUE if we are in /x mode
959 nigel 91
960     Returns: the number of the named subpattern, or -1 if not found
961     */
962    
963     static int
964 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965     BOOL xmode)
966 nigel 91 {
967     const uschar *thisname;
968 nigel 93
969 nigel 91 for (; *ptr != 0; ptr++)
970     {
971 nigel 93 int term;
972    
973     /* Skip over backslashed characters and also entire \Q...\E */
974    
975     if (*ptr == '\\')
976     {
977     if (*(++ptr) == 0) return -1;
978     if (*ptr == 'Q') for (;;)
979     {
980     while (*(++ptr) != 0 && *ptr != '\\');
981     if (*ptr == 0) return -1;
982     if (*(++ptr) == 'E') break;
983     }
984     continue;
985     }
986    
987     /* Skip over character classes */
988    
989     if (*ptr == '[')
990     {
991     while (*(++ptr) != ']')
992     {
993 ph10 220 if (*ptr == 0) return -1;
994 nigel 93 if (*ptr == '\\')
995     {
996     if (*(++ptr) == 0) return -1;
997     if (*ptr == 'Q') for (;;)
998     {
999     while (*(++ptr) != 0 && *ptr != '\\');
1000     if (*ptr == 0) return -1;
1001     if (*(++ptr) == 'E') break;
1002     }
1003     continue;
1004     }
1005     }
1006     continue;
1007     }
1008    
1009     /* Skip comments in /x mode */
1010    
1011     if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017    
1018     /* An opening parens must now be a real metacharacter */
1019    
1020 nigel 91 if (*ptr != '(') continue;
1021 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1022 nigel 93 {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027    
1028     ptr += 2;
1029     if (*ptr == 'P') ptr++; /* Allow optional P */
1030    
1031     /* We have to disambiguate (?<! and (?<= from (?<name> */
1032    
1033     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034     *ptr != '\'')
1035     continue;
1036    
1037 nigel 91 count++;
1038 nigel 93
1039     if (name == NULL && count == lorn) return count;
1040     term = *ptr++;
1041     if (term == '<') term = '>';
1042 nigel 91 thisname = ptr;
1043 nigel 93 while (*ptr != term) ptr++;
1044     if (name != NULL && lorn == ptr - thisname &&
1045     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 nigel 91 return count;
1047     }
1048 nigel 93
1049 nigel 91 return -1;
1050     }
1051    
1052    
1053    
1054     /*************************************************
1055 nigel 77 * Find first significant op code *
1056     *************************************************/
1057    
1058     /* This is called by several functions that scan a compiled expression looking
1059     for a fixed first character, or an anchoring op code etc. It skips over things
1060     that do not influence this. For some calls, a change of option is important.
1061     For some calls, it makes sense to skip negative forward and all backward
1062     assertions, and also the \b assertion; for others it does not.
1063    
1064     Arguments:
1065     code pointer to the start of the group
1066     options pointer to external options
1067     optbit the option bit whose changing is significant, or
1068     zero if none are
1069     skipassert TRUE if certain assertions are to be skipped
1070    
1071     Returns: pointer to the first significant opcode
1072     */
1073    
1074     static const uschar*
1075     first_significant_code(const uschar *code, int *options, int optbit,
1076     BOOL skipassert)
1077     {
1078     for (;;)
1079     {
1080     switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084     *options = (int)code[1];
1085     code += 2;
1086     break;
1087    
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095    
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100    
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103 nigel 93 case OP_RREF:
1104     case OP_DEF:
1105 nigel 77 code += _pcre_OP_lengths[*code];
1106     break;
1107    
1108     default:
1109     return code;
1110     }
1111     }
1112     /* Control never reaches here */
1113     }
1114    
1115    
1116    
1117    
1118     /*************************************************
1119     * Find the fixed length of a pattern *
1120     *************************************************/
1121    
1122     /* Scan a pattern and compute the fixed length of subject that will match it,
1123     if the length is fixed. This is needed for dealing with backward assertions.
1124     In UTF8 mode, the result is in characters rather than bytes.
1125    
1126     Arguments:
1127     code points to the start of the pattern (the bracket)
1128     options the compiling options
1129    
1130     Returns: the fixed length, or -1 if there is no fixed length,
1131     or -2 if \C was encountered
1132     */
1133    
1134     static int
1135     find_fixedlength(uschar *code, int options)
1136     {
1137     int length = -1;
1138    
1139     register int branchlength = 0;
1140     register uschar *cc = code + 1 + LINK_SIZE;
1141    
1142     /* Scan along the opcodes for this branch. If we get to the end of the
1143     branch, check the length against that of the other branches. */
1144    
1145     for (;;)
1146     {
1147     int d;
1148     register int op = *cc;
1149     switch (op)
1150     {
1151 nigel 93 case OP_CBRA:
1152 nigel 77 case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 nigel 77 if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161    
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165    
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172     else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177    
1178     /* Skip over assertive subpatterns */
1179    
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186    
1187     /* Skip over things that don't match chars */
1188    
1189     case OP_REVERSE:
1190     case OP_CREF:
1191 nigel 93 case OP_RREF:
1192     case OP_DEF:
1193 nigel 77 case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205    
1206     /* Handle literal characters */
1207    
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210 nigel 91 case OP_NOT:
1211 nigel 77 branchlength++;
1212     cc += 2;
1213     #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215     {
1216     while ((*cc & 0xc0) == 0x80) cc++;
1217     }
1218     #endif
1219     break;
1220    
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode. */
1223    
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227     #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229     {
1230     while((*cc & 0x80) == 0x80) cc++;
1231     }
1232     #endif
1233     break;
1234    
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 nigel 77 cc += 4;
1239     break;
1240    
1241     /* Handle single-char matchers */
1242    
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245 nigel 87 cc += 2;
1246 nigel 77 /* Fall through */
1247    
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258    
1259     /* The single-byte matcher isn't allowed */
1260    
1261     case OP_ANYBYTE:
1262     return -2;
1263    
1264     /* Check a class for variable quantification */
1265    
1266     #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270     #endif
1271    
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275    
1276     switch (*cc)
1277     {
1278     case OP_CRSTAR:
1279     case OP_CRMINSTAR:
1280     case OP_CRQUERY:
1281     case OP_CRMINQUERY:
1282     return -1;
1283    
1284     case OP_CRRANGE:
1285     case OP_CRMINRANGE:
1286     if (GET2(cc,1) != GET2(cc,3)) return -1;
1287     branchlength += GET2(cc,1);
1288     cc += 5;
1289     break;
1290    
1291     default:
1292     branchlength++;
1293     }
1294     break;
1295    
1296     /* Anything else is variable length */
1297    
1298     default:
1299     return -1;
1300     }
1301     }
1302     /* Control never gets here */
1303     }
1304    
1305    
1306    
1307    
1308     /*************************************************
1309     * Scan compiled regex for numbered bracket *
1310     *************************************************/
1311    
1312     /* This little function scans through a compiled pattern until it finds a
1313     capturing bracket with the given number.
1314    
1315     Arguments:
1316     code points to start of expression
1317     utf8 TRUE in UTF-8 mode
1318     number the required bracket number
1319    
1320     Returns: pointer to the opcode for the bracket, or NULL if not found
1321     */
1322    
1323     static const uschar *
1324     find_bracket(const uschar *code, BOOL utf8, int number)
1325     {
1326     for (;;)
1327     {
1328     register int c = *code;
1329     if (c == OP_END) return NULL;
1330 nigel 91
1331     /* XCLASS is used for classes that cannot be represented just by a bit
1332     map. This includes negated single high-valued characters. The length in
1333     the table is zero; the actual length is stored in the compiled code. */
1334    
1335     if (c == OP_XCLASS) code += GET(code, 1);
1336    
1337 nigel 93 /* Handle capturing bracket */
1338 nigel 91
1339 nigel 93 else if (c == OP_CBRA)
1340 nigel 77 {
1341 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1342 nigel 77 if (n == number) return (uschar *)code;
1343 nigel 93 code += _pcre_OP_lengths[c];
1344 nigel 77 }
1345 nigel 91
1346 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1347     repeated character types, we have to test for \p and \P, which have an extra
1348 ph10 218 two bytes of parameters. */
1349 nigel 91
1350 nigel 77 else
1351     {
1352 ph10 218 switch(c)
1353     {
1354     case OP_TYPESTAR:
1355     case OP_TYPEMINSTAR:
1356     case OP_TYPEPLUS:
1357     case OP_TYPEMINPLUS:
1358     case OP_TYPEQUERY:
1359     case OP_TYPEMINQUERY:
1360     case OP_TYPEPOSSTAR:
1361     case OP_TYPEPOSPLUS:
1362     case OP_TYPEPOSQUERY:
1363     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 ph10 220 break;
1365 ph10 221
1366     case OP_TYPEUPTO:
1367     case OP_TYPEMINUPTO:
1368     case OP_TYPEEXACT:
1369     case OP_TYPEPOSUPTO:
1370     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371     break;
1372 ph10 220 }
1373    
1374 ph10 218 /* Add in the fixed length from the table */
1375 ph10 220
1376 nigel 77 code += _pcre_OP_lengths[c];
1377 ph10 220
1378 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379     a multi-byte character. The length in the table is a minimum, so we have to
1380     arrange to skip the extra bytes. */
1381 ph10 220
1382 ph10 107 #ifdef SUPPORT_UTF8
1383 nigel 77 if (utf8) switch(c)
1384     {
1385     case OP_CHAR:
1386     case OP_CHARNC:
1387     case OP_EXACT:
1388     case OP_UPTO:
1389     case OP_MINUPTO:
1390 nigel 93 case OP_POSUPTO:
1391 nigel 77 case OP_STAR:
1392     case OP_MINSTAR:
1393 nigel 93 case OP_POSSTAR:
1394 nigel 77 case OP_PLUS:
1395     case OP_MINPLUS:
1396 nigel 93 case OP_POSPLUS:
1397 nigel 77 case OP_QUERY:
1398     case OP_MINQUERY:
1399 nigel 93 case OP_POSQUERY:
1400     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 nigel 77 break;
1402     }
1403 ph10 111 #endif
1404 nigel 77 }
1405     }
1406     }
1407    
1408    
1409    
1410     /*************************************************
1411     * Scan compiled regex for recursion reference *
1412     *************************************************/
1413    
1414     /* This little function scans through a compiled pattern until it finds an
1415     instance of OP_RECURSE.
1416    
1417     Arguments:
1418     code points to start of expression
1419     utf8 TRUE in UTF-8 mode
1420    
1421     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_recurse(const uschar *code, BOOL utf8)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91 if (c == OP_RECURSE) return code;
1432 ph10 220
1433 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1434     map. This includes negated single high-valued characters. The length in
1435     the table is zero; the actual length is stored in the compiled code. */
1436    
1437     if (c == OP_XCLASS) code += GET(code, 1);
1438    
1439 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1440     repeated character types, we have to test for \p and \P, which have an extra
1441 ph10 218 two bytes of parameters. */
1442 nigel 91
1443 nigel 77 else
1444     {
1445 ph10 218 switch(c)
1446     {
1447     case OP_TYPESTAR:
1448     case OP_TYPEMINSTAR:
1449     case OP_TYPEPLUS:
1450     case OP_TYPEMINPLUS:
1451     case OP_TYPEQUERY:
1452     case OP_TYPEMINQUERY:
1453     case OP_TYPEPOSSTAR:
1454     case OP_TYPEPOSPLUS:
1455     case OP_TYPEPOSQUERY:
1456     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 ph10 220 break;
1458 ph10 221
1459     case OP_TYPEPOSUPTO:
1460     case OP_TYPEUPTO:
1461     case OP_TYPEMINUPTO:
1462     case OP_TYPEEXACT:
1463     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464     break;
1465 ph10 220 }
1466    
1467 ph10 218 /* Add in the fixed length from the table */
1468    
1469 nigel 77 code += _pcre_OP_lengths[c];
1470 ph10 220
1471 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474 ph10 220
1475 ph10 107 #ifdef SUPPORT_UTF8
1476 nigel 77 if (utf8) switch(c)
1477     {
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_EXACT:
1481     case OP_UPTO:
1482     case OP_MINUPTO:
1483 nigel 93 case OP_POSUPTO:
1484 nigel 77 case OP_STAR:
1485     case OP_MINSTAR:
1486 nigel 93 case OP_POSSTAR:
1487 nigel 77 case OP_PLUS:
1488     case OP_MINPLUS:
1489 nigel 93 case OP_POSPLUS:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 nigel 77 break;
1495     }
1496 ph10 111 #endif
1497 nigel 77 }
1498     }
1499     }
1500    
1501    
1502    
1503     /*************************************************
1504     * Scan compiled branch for non-emptiness *
1505     *************************************************/
1506    
1507     /* This function scans through a branch of a compiled pattern to see whether it
1508 nigel 93 can match the empty string or not. It is called from could_be_empty()
1509     below and from compile_branch() when checking for an unlimited repeat of a
1510     group that can match nothing. Note that first_significant_code() skips over
1511 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1512     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513     bracket whose current branch will already have been scanned.
1514 nigel 77
1515     Arguments:
1516     code points to start of search
1517     endcode points to where to stop
1518     utf8 TRUE if in UTF8 mode
1519    
1520     Returns: TRUE if what is matched could be empty
1521     */
1522    
1523     static BOOL
1524     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525     {
1526     register int c;
1527 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 nigel 77 code < endcode;
1529     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530     {
1531     const uschar *ccode;
1532    
1533     c = *code;
1534 ph10 286
1535     /* Skip over forward assertions; the other assertions are skipped by
1536 ph10 282 first_significant_code() with a TRUE final argument. */
1537 ph10 286
1538 ph10 282 if (c == OP_ASSERT)
1539 ph10 286 {
1540 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1541     c = *code;
1542     continue;
1543 ph10 286 }
1544 ph10 172
1545 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1546 nigel 77
1547 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548     {
1549 ph10 172 code += _pcre_OP_lengths[c];
1550 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1551     c = *code;
1552     continue;
1553     }
1554    
1555     /* For other groups, scan the branches. */
1556 ph10 172
1557 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 nigel 77 {
1559     BOOL empty_branch;
1560     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561    
1562     /* Scan a closed bracket */
1563    
1564     empty_branch = FALSE;
1565     do
1566     {
1567     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568     empty_branch = TRUE;
1569     code += GET(code, 1);
1570     }
1571     while (*code == OP_ALT);
1572     if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 ph10 172 c = *code;
1574 nigel 93 continue;
1575 nigel 77 }
1576    
1577 nigel 93 /* Handle the other opcodes */
1578    
1579     switch (c)
1580 nigel 77 {
1581 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1582     cannot be represented just by a bit map. This includes negated single
1583     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 ph10 220 actual length is stored in the compiled code, so we must update "code"
1585 ph10 216 here. */
1586 nigel 77
1587     #ifdef SUPPORT_UTF8
1588     case OP_XCLASS:
1589 ph10 216 ccode = code += GET(code, 1);
1590 nigel 77 goto CHECK_CLASS_REPEAT;
1591     #endif
1592    
1593     case OP_CLASS:
1594     case OP_NCLASS:
1595     ccode = code + 33;
1596    
1597     #ifdef SUPPORT_UTF8
1598     CHECK_CLASS_REPEAT:
1599     #endif
1600    
1601     switch (*ccode)
1602     {
1603     case OP_CRSTAR: /* These could be empty; continue */
1604     case OP_CRMINSTAR:
1605     case OP_CRQUERY:
1606     case OP_CRMINQUERY:
1607     break;
1608    
1609     default: /* Non-repeat => class must match */
1610     case OP_CRPLUS: /* These repeats aren't empty */
1611     case OP_CRMINPLUS:
1612     return FALSE;
1613    
1614     case OP_CRRANGE:
1615     case OP_CRMINRANGE:
1616     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617     break;
1618     }
1619     break;
1620    
1621     /* Opcodes that must match a character */
1622    
1623     case OP_PROP:
1624     case OP_NOTPROP:
1625     case OP_EXTUNI:
1626     case OP_NOT_DIGIT:
1627     case OP_DIGIT:
1628     case OP_NOT_WHITESPACE:
1629     case OP_WHITESPACE:
1630     case OP_NOT_WORDCHAR:
1631     case OP_WORDCHAR:
1632     case OP_ANY:
1633     case OP_ANYBYTE:
1634     case OP_CHAR:
1635     case OP_CHARNC:
1636     case OP_NOT:
1637     case OP_PLUS:
1638     case OP_MINPLUS:
1639 nigel 93 case OP_POSPLUS:
1640 nigel 77 case OP_EXACT:
1641     case OP_NOTPLUS:
1642     case OP_NOTMINPLUS:
1643 nigel 93 case OP_NOTPOSPLUS:
1644 nigel 77 case OP_NOTEXACT:
1645     case OP_TYPEPLUS:
1646     case OP_TYPEMINPLUS:
1647 nigel 93 case OP_TYPEPOSPLUS:
1648 nigel 77 case OP_TYPEEXACT:
1649     return FALSE;
1650 ph10 227
1651     /* These are going to continue, as they may be empty, but we have to
1652     fudge the length for the \p and \P cases. */
1653    
1654 ph10 224 case OP_TYPESTAR:
1655     case OP_TYPEMINSTAR:
1656     case OP_TYPEPOSSTAR:
1657     case OP_TYPEQUERY:
1658     case OP_TYPEMINQUERY:
1659     case OP_TYPEPOSQUERY:
1660     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 ph10 227 break;
1662    
1663 ph10 224 /* Same for these */
1664 ph10 227
1665 ph10 224 case OP_TYPEUPTO:
1666     case OP_TYPEMINUPTO:
1667     case OP_TYPEPOSUPTO:
1668     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669     break;
1670 nigel 77
1671     /* End of branch */
1672    
1673     case OP_KET:
1674     case OP_KETRMAX:
1675     case OP_KETRMIN:
1676     case OP_ALT:
1677     return TRUE;
1678    
1679 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680     MINUPTO, and POSUPTO may be followed by a multibyte character */
1681 nigel 77
1682     #ifdef SUPPORT_UTF8
1683     case OP_STAR:
1684     case OP_MINSTAR:
1685 nigel 93 case OP_POSSTAR:
1686 nigel 77 case OP_QUERY:
1687     case OP_MINQUERY:
1688 nigel 93 case OP_POSQUERY:
1689 nigel 77 case OP_UPTO:
1690     case OP_MINUPTO:
1691 nigel 93 case OP_POSUPTO:
1692 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693     break;
1694     #endif
1695     }
1696     }
1697    
1698     return TRUE;
1699     }
1700    
1701    
1702    
1703     /*************************************************
1704     * Scan compiled regex for non-emptiness *
1705     *************************************************/
1706    
1707     /* This function is called to check for left recursive calls. We want to check
1708     the current branch of the current pattern to see if it could match the empty
1709     string. If it could, we must look outwards for branches at other levels,
1710     stopping when we pass beyond the bracket which is the subject of the recursion.
1711    
1712     Arguments:
1713     code points to start of the recursion
1714     endcode points to where to stop (current RECURSE item)
1715     bcptr points to the chain of current (unclosed) branch starts
1716     utf8 TRUE if in UTF-8 mode
1717    
1718     Returns: TRUE if what is matched could be empty
1719     */
1720    
1721     static BOOL
1722     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723     BOOL utf8)
1724     {
1725     while (bcptr != NULL && bcptr->current >= code)
1726     {
1727     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728     bcptr = bcptr->outer;
1729     }
1730     return TRUE;
1731     }
1732    
1733    
1734    
1735     /*************************************************
1736     * Check for POSIX class syntax *
1737     *************************************************/
1738    
1739     /* This function is called when the sequence "[:" or "[." or "[=" is
1740 ph10 295 encountered in a character class. It checks whether this is followed by a
1741     sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742     reach an unescaped ']' without the special preceding character, return FALSE.
1743 nigel 77
1744 ph10 295 Originally, this function only recognized a sequence of letters between the
1745     terminators, but it seems that Perl recognizes any sequence of characters,
1746     though of course unknown POSIX names are subsequently rejected. Perl gives an
1747     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748     didn't consider this to be a POSIX class. Likewise for [:1234:].
1749    
1750     The problem in trying to be exactly like Perl is in the handling of escapes. We
1751     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753     below handles the special case of \], but does not try to do any other escape
1754     processing. This makes it different from Perl for cases such as [:l\ower:]
1755     where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756     "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757     I think.
1758    
1759     Arguments:
1760 nigel 77 ptr pointer to the initial [
1761     endptr where to return the end pointer
1762    
1763     Returns: TRUE or FALSE
1764     */
1765    
1766     static BOOL
1767 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768 nigel 77 {
1769     int terminator; /* Don't combine these lines; the Solaris cc */
1770     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1771 ph10 295 for (++ptr; *ptr != 0; ptr++)
1772 nigel 77 {
1773 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774     {
1775     if (*ptr == ']') return FALSE;
1776     if (*ptr == terminator && ptr[1] == ']')
1777     {
1778     *endptr = ptr;
1779     return TRUE;
1780     }
1781     }
1782     }
1783 nigel 77 return FALSE;
1784     }
1785    
1786    
1787    
1788    
1789     /*************************************************
1790     * Check POSIX class name *
1791     *************************************************/
1792    
1793     /* This function is called to check the name given in a POSIX-style class entry
1794     such as [:alnum:].
1795    
1796     Arguments:
1797     ptr points to the first letter
1798     len the length of the name
1799    
1800     Returns: a value representing the name, or -1 if unknown
1801     */
1802    
1803     static int
1804     check_posix_name(const uschar *ptr, int len)
1805     {
1806 ph10 240 const char *pn = posix_names;
1807 nigel 77 register int yield = 0;
1808     while (posix_name_lengths[yield] != 0)
1809     {
1810     if (len == posix_name_lengths[yield] &&
1811 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1812 ph10 243 pn += posix_name_lengths[yield] + 1;
1813 nigel 77 yield++;
1814     }
1815     return -1;
1816     }
1817    
1818    
1819     /*************************************************
1820     * Adjust OP_RECURSE items in repeated group *
1821     *************************************************/
1822    
1823     /* OP_RECURSE items contain an offset from the start of the regex to the group
1824     that is referenced. This means that groups can be replicated for fixed
1825     repetition simply by copying (because the recursion is allowed to refer to
1826     earlier groups that are outside the current group). However, when a group is
1827     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828     it, after it has been compiled. This means that any OP_RECURSE items within it
1829     that refer to the group itself or any contained groups have to have their
1830 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1831     the partially compiled regex must be temporarily terminated with OP_END.
1832 nigel 77
1833 nigel 93 This function has been extended with the possibility of forward references for
1834     recursions and subroutine calls. It must also check the list of such references
1835     for the group we are dealing with. If it finds that one of the recursions in
1836     the current group is on this list, it adjusts the offset in the list, not the
1837     value in the reference (which is a group number).
1838    
1839 nigel 77 Arguments:
1840     group points to the start of the group
1841     adjust the amount by which the group is to be moved
1842     utf8 TRUE in UTF-8 mode
1843     cd contains pointers to tables etc.
1844 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1845 nigel 77
1846     Returns: nothing
1847     */
1848    
1849     static void
1850 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851     uschar *save_hwm)
1852 nigel 77 {
1853     uschar *ptr = group;
1854 ph10 224
1855 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856     {
1857 nigel 93 int offset;
1858     uschar *hc;
1859    
1860     /* See if this recursion is on the forward reference list. If so, adjust the
1861     reference. */
1862    
1863     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864     {
1865     offset = GET(hc, 0);
1866     if (cd->start_code + offset == ptr + 1)
1867     {
1868     PUT(hc, 0, offset + adjust);
1869     break;
1870     }
1871     }
1872    
1873     /* Otherwise, adjust the recursion offset if it's after the start of this
1874     group. */
1875    
1876     if (hc >= cd->hwm)
1877     {
1878     offset = GET(ptr, 1);
1879     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880     }
1881    
1882 nigel 77 ptr += 1 + LINK_SIZE;
1883     }
1884     }
1885    
1886    
1887    
1888     /*************************************************
1889     * Insert an automatic callout point *
1890     *************************************************/
1891    
1892     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893     callout points before each pattern item.
1894    
1895     Arguments:
1896     code current code pointer
1897     ptr current pattern pointer
1898     cd pointers to tables etc
1899    
1900     Returns: new code pointer
1901     */
1902    
1903     static uschar *
1904     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905     {
1906     *code++ = OP_CALLOUT;
1907     *code++ = 255;
1908     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1909     PUT(code, LINK_SIZE, 0); /* Default length */
1910     return code + 2*LINK_SIZE;
1911     }
1912    
1913    
1914    
1915     /*************************************************
1916     * Complete a callout item *
1917     *************************************************/
1918    
1919     /* A callout item contains the length of the next item in the pattern, which
1920     we can't fill in till after we have reached the relevant point. This is used
1921     for both automatic and manual callouts.
1922    
1923     Arguments:
1924     previous_callout points to previous callout item
1925     ptr current pattern pointer
1926     cd pointers to tables etc
1927    
1928     Returns: nothing
1929     */
1930    
1931     static void
1932     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933     {
1934     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935     PUT(previous_callout, 2 + LINK_SIZE, length);
1936     }
1937    
1938    
1939    
1940     #ifdef SUPPORT_UCP
1941     /*************************************************
1942     * Get othercase range *
1943     *************************************************/
1944    
1945     /* This function is passed the start and end of a class range, in UTF-8 mode
1946     with UCP support. It searches up the characters, looking for internal ranges of
1947     characters in the "other" case. Each call returns the next one, updating the
1948     start address.
1949    
1950     Arguments:
1951     cptr points to starting character value; updated
1952     d end value
1953     ocptr where to put start of othercase range
1954     odptr where to put end of othercase range
1955    
1956     Yield: TRUE when range returned; FALSE when no more
1957     */
1958    
1959     static BOOL
1960 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1961     unsigned int *odptr)
1962 nigel 77 {
1963 nigel 93 unsigned int c, othercase, next;
1964 nigel 77
1965     for (c = *cptr; c <= d; c++)
1966 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1967 nigel 77
1968     if (c > d) return FALSE;
1969    
1970     *ocptr = othercase;
1971     next = othercase + 1;
1972    
1973     for (++c; c <= d; c++)
1974     {
1975 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1976 nigel 77 next++;
1977     }
1978    
1979     *odptr = next - 1;
1980     *cptr = c;
1981    
1982     return TRUE;
1983     }
1984     #endif /* SUPPORT_UCP */
1985    
1986    
1987 nigel 93
1988 nigel 77 /*************************************************
1989 nigel 93 * Check if auto-possessifying is possible *
1990     *************************************************/
1991    
1992     /* This function is called for unlimited repeats of certain items, to see
1993     whether the next thing could possibly match the repeated item. If not, it makes
1994     sense to automatically possessify the repeated item.
1995    
1996     Arguments:
1997     op_code the repeated op code
1998     item data for this item, depends on the opcode
1999     utf8 TRUE in UTF-8 mode
2000     utf8_char used for utf8 character bytes, NULL if not relevant
2001     ptr next character in pattern
2002     options options bits
2003     cd contains pointers to tables etc.
2004    
2005     Returns: TRUE if possessifying is wanted
2006     */
2007    
2008     static BOOL
2009     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010     const uschar *ptr, int options, compile_data *cd)
2011     {
2012     int next;
2013    
2014     /* Skip whitespace and comments in extended mode */
2015    
2016     if ((options & PCRE_EXTENDED) != 0)
2017     {
2018     for (;;)
2019     {
2020     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021     if (*ptr == '#')
2022     {
2023     while (*(++ptr) != 0)
2024     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025     }
2026     else break;
2027     }
2028     }
2029    
2030     /* If the next item is one that we can handle, get its value. A non-negative
2031     value is a character, a negative value is an escape value. */
2032    
2033     if (*ptr == '\\')
2034     {
2035     int temperrorcode = 0;
2036     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037     if (temperrorcode != 0) return FALSE;
2038     ptr++; /* Point after the escape sequence */
2039     }
2040    
2041     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042     {
2043     #ifdef SUPPORT_UTF8
2044     if (utf8) { GETCHARINC(next, ptr); } else
2045     #endif
2046     next = *ptr++;
2047     }
2048    
2049     else return FALSE;
2050    
2051     /* Skip whitespace and comments in extended mode */
2052    
2053     if ((options & PCRE_EXTENDED) != 0)
2054     {
2055     for (;;)
2056     {
2057     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058     if (*ptr == '#')
2059     {
2060     while (*(++ptr) != 0)
2061     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062     }
2063     else break;
2064     }
2065     }
2066    
2067     /* If the next thing is itself optional, we have to give up. */
2068    
2069     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070     return FALSE;
2071    
2072     /* Now compare the next item with the previous opcode. If the previous is a
2073     positive single character match, "item" either contains the character or, if
2074     "item" is greater than 127 in utf8 mode, the character's bytes are in
2075     utf8_char. */
2076    
2077    
2078     /* Handle cases when the next item is a character. */
2079    
2080     if (next >= 0) switch(op_code)
2081     {
2082     case OP_CHAR:
2083     #ifdef SUPPORT_UTF8
2084     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085     #endif
2086     return item != next;
2087    
2088     /* For CHARNC (caseless character) we must check the other case. If we have
2089     Unicode property support, we can use it to test the other case of
2090     high-valued characters. */
2091    
2092     case OP_CHARNC:
2093     #ifdef SUPPORT_UTF8
2094     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095     #endif
2096     if (item == next) return FALSE;
2097     #ifdef SUPPORT_UTF8
2098     if (utf8)
2099     {
2100     unsigned int othercase;
2101     if (next < 128) othercase = cd->fcc[next]; else
2102     #ifdef SUPPORT_UCP
2103     othercase = _pcre_ucp_othercase((unsigned int)next);
2104     #else
2105     othercase = NOTACHAR;
2106     #endif
2107     return (unsigned int)item != othercase;
2108     }
2109     else
2110     #endif /* SUPPORT_UTF8 */
2111     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2112    
2113     /* For OP_NOT, "item" must be a single-byte character. */
2114    
2115     case OP_NOT:
2116     if (next < 0) return FALSE; /* Not a character */
2117     if (item == next) return TRUE;
2118     if ((options & PCRE_CASELESS) == 0) return FALSE;
2119     #ifdef SUPPORT_UTF8
2120     if (utf8)
2121     {
2122     unsigned int othercase;
2123     if (next < 128) othercase = cd->fcc[next]; else
2124     #ifdef SUPPORT_UCP
2125     othercase = _pcre_ucp_othercase(next);
2126     #else
2127     othercase = NOTACHAR;
2128     #endif
2129     return (unsigned int)item == othercase;
2130     }
2131     else
2132     #endif /* SUPPORT_UTF8 */
2133     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2134    
2135     case OP_DIGIT:
2136     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2137    
2138     case OP_NOT_DIGIT:
2139     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2140    
2141     case OP_WHITESPACE:
2142     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2143    
2144     case OP_NOT_WHITESPACE:
2145     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2146    
2147     case OP_WORDCHAR:
2148     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2149    
2150     case OP_NOT_WORDCHAR:
2151     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2152    
2153 ph10 180 case OP_HSPACE:
2154     case OP_NOT_HSPACE:
2155     switch(next)
2156     {
2157     case 0x09:
2158     case 0x20:
2159     case 0xa0:
2160     case 0x1680:
2161     case 0x180e:
2162     case 0x2000:
2163     case 0x2001:
2164     case 0x2002:
2165     case 0x2003:
2166     case 0x2004:
2167     case 0x2005:
2168     case 0x2006:
2169     case 0x2007:
2170     case 0x2008:
2171     case 0x2009:
2172     case 0x200A:
2173     case 0x202f:
2174     case 0x205f:
2175     case 0x3000:
2176     return op_code != OP_HSPACE;
2177     default:
2178     return op_code == OP_HSPACE;
2179     }
2180    
2181     case OP_VSPACE:
2182     case OP_NOT_VSPACE:
2183     switch(next)
2184     {
2185     case 0x0a:
2186     case 0x0b:
2187     case 0x0c:
2188     case 0x0d:
2189     case 0x85:
2190     case 0x2028:
2191     case 0x2029:
2192     return op_code != OP_VSPACE;
2193     default:
2194     return op_code == OP_VSPACE;
2195     }
2196    
2197 nigel 93 default:
2198     return FALSE;
2199     }
2200    
2201    
2202     /* Handle the case when the next item is \d, \s, etc. */
2203    
2204     switch(op_code)
2205     {
2206     case OP_CHAR:
2207     case OP_CHARNC:
2208     #ifdef SUPPORT_UTF8
2209     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2210     #endif
2211     switch(-next)
2212     {
2213     case ESC_d:
2214     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2215    
2216     case ESC_D:
2217     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2218    
2219     case ESC_s:
2220     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2221    
2222     case ESC_S:
2223     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2224    
2225     case ESC_w:
2226     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2227    
2228     case ESC_W:
2229     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2230 ph10 182
2231 ph10 180 case ESC_h:
2232     case ESC_H:
2233     switch(item)
2234     {
2235     case 0x09:
2236     case 0x20:
2237     case 0xa0:
2238     case 0x1680:
2239     case 0x180e:
2240     case 0x2000:
2241     case 0x2001:
2242     case 0x2002:
2243     case 0x2003:
2244     case 0x2004:
2245     case 0x2005:
2246     case 0x2006:
2247     case 0x2007:
2248     case 0x2008:
2249     case 0x2009:
2250     case 0x200A:
2251     case 0x202f:
2252     case 0x205f:
2253     case 0x3000:
2254     return -next != ESC_h;
2255     default:
2256     return -next == ESC_h;
2257 ph10 182 }
2258    
2259 ph10 180 case ESC_v:
2260     case ESC_V:
2261     switch(item)
2262     {
2263     case 0x0a:
2264     case 0x0b:
2265     case 0x0c:
2266     case 0x0d:
2267     case 0x85:
2268     case 0x2028:
2269     case 0x2029:
2270     return -next != ESC_v;
2271     default:
2272     return -next == ESC_v;
2273 ph10 182 }
2274 nigel 93
2275     default:
2276     return FALSE;
2277     }
2278    
2279     case OP_DIGIT:
2280 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2281     next == -ESC_h || next == -ESC_v;
2282 nigel 93
2283     case OP_NOT_DIGIT:
2284     return next == -ESC_d;
2285    
2286     case OP_WHITESPACE:
2287     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2288    
2289     case OP_NOT_WHITESPACE:
2290 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2291 nigel 93
2292 ph10 180 case OP_HSPACE:
2293     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2294    
2295     case OP_NOT_HSPACE:
2296     return next == -ESC_h;
2297 ph10 182
2298 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2299 ph10 182 case OP_VSPACE:
2300 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2301    
2302     case OP_NOT_VSPACE:
2303 ph10 182 return next == -ESC_v;
2304 ph10 180
2305 nigel 93 case OP_WORDCHAR:
2306 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2307 nigel 93
2308     case OP_NOT_WORDCHAR:
2309     return next == -ESC_w || next == -ESC_d;
2310 ph10 182
2311 nigel 93 default:
2312     return FALSE;
2313     }
2314    
2315     /* Control does not reach here */
2316     }
2317    
2318    
2319    
2320     /*************************************************
2321 nigel 77 * Compile one branch *
2322     *************************************************/
2323    
2324 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2325 nigel 77 changed during the branch, the pointer is used to change the external options
2326 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2327     to find out the amount of memory needed, as well as during the real compile
2328     phase. The value of lengthptr distinguishes the two phases.
2329 nigel 77
2330     Arguments:
2331     optionsptr pointer to the option bits
2332     codeptr points to the pointer to the current code point
2333     ptrptr points to the current pattern pointer
2334     errorcodeptr points to error code variable
2335     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2336     reqbyteptr set to the last literal character required, else < 0
2337     bcptr points to current branch chain
2338     cd contains pointers to tables etc.
2339 nigel 93 lengthptr NULL during the real compile phase
2340     points to length accumulator during pre-compile phase
2341 nigel 77
2342     Returns: TRUE on success
2343     FALSE, with *errorcodeptr set non-zero on error
2344     */
2345    
2346     static BOOL
2347 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2348     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2349     compile_data *cd, int *lengthptr)
2350 nigel 77 {
2351     int repeat_type, op_type;
2352     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2353     int bravalue = 0;
2354     int greedy_default, greedy_non_default;
2355     int firstbyte, reqbyte;
2356     int zeroreqbyte, zerofirstbyte;
2357     int req_caseopt, reqvary, tempreqvary;
2358     int options = *optionsptr;
2359     int after_manual_callout = 0;
2360 nigel 93 int length_prevgroup = 0;
2361 nigel 77 register int c;
2362     register uschar *code = *codeptr;
2363 nigel 93 uschar *last_code = code;
2364     uschar *orig_code = code;
2365 nigel 77 uschar *tempcode;
2366     BOOL inescq = FALSE;
2367     BOOL groupsetfirstbyte = FALSE;
2368     const uschar *ptr = *ptrptr;
2369     const uschar *tempptr;
2370     uschar *previous = NULL;
2371     uschar *previous_callout = NULL;
2372 nigel 93 uschar *save_hwm = NULL;
2373 nigel 77 uschar classbits[32];
2374    
2375     #ifdef SUPPORT_UTF8
2376     BOOL class_utf8;
2377     BOOL utf8 = (options & PCRE_UTF8) != 0;
2378     uschar *class_utf8data;
2379     uschar utf8_char[6];
2380     #else
2381     BOOL utf8 = FALSE;
2382 nigel 93 uschar *utf8_char = NULL;
2383 nigel 77 #endif
2384    
2385 nigel 93 #ifdef DEBUG
2386     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2387     #endif
2388    
2389 nigel 77 /* Set up the default and non-default settings for greediness */
2390    
2391     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2392     greedy_non_default = greedy_default ^ 1;
2393    
2394     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2395     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2396     matches a non-fixed char first char; reqbyte just remains unset if we never
2397     find one.
2398    
2399     When we hit a repeat whose minimum is zero, we may have to adjust these values
2400     to take the zero repeat into account. This is implemented by setting them to
2401     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2402     item types that can be repeated set these backoff variables appropriately. */
2403    
2404     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2405    
2406     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2407     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2408     value > 255. It is added into the firstbyte or reqbyte variables to record the
2409     case status of the value. This is used only for ASCII characters. */
2410    
2411     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2412    
2413     /* Switch on next character until the end of the branch */
2414    
2415     for (;; ptr++)
2416     {
2417     BOOL negate_class;
2418 ph10 286 BOOL should_flip_negation;
2419 nigel 77 BOOL possessive_quantifier;
2420     BOOL is_quantifier;
2421 nigel 93 BOOL is_recurse;
2422 ph10 180 BOOL reset_bracount;
2423 nigel 77 int class_charcount;
2424     int class_lastchar;
2425     int newoptions;
2426     int recno;
2427 ph10 172 int refsign;
2428 nigel 77 int skipbytes;
2429     int subreqbyte;
2430     int subfirstbyte;
2431 nigel 93 int terminator;
2432 nigel 77 int mclength;
2433     uschar mcbuffer[8];
2434    
2435 nigel 93 /* Get next byte in the pattern */
2436 nigel 77
2437     c = *ptr;
2438    
2439 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2440     previous cycle of this loop. */
2441    
2442     if (lengthptr != NULL)
2443     {
2444     #ifdef DEBUG
2445     if (code > cd->hwm) cd->hwm = code; /* High water info */
2446     #endif
2447     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2448     {
2449     *errorcodeptr = ERR52;
2450     goto FAILED;
2451     }
2452    
2453     /* There is at least one situation where code goes backwards: this is the
2454     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2455     the class is simply eliminated. However, it is created first, so we have to
2456     allow memory for it. Therefore, don't ever reduce the length at this point.
2457     */
2458    
2459     if (code < last_code) code = last_code;
2460 ph10 202
2461     /* Paranoid check for integer overflow */
2462    
2463     if (OFLOW_MAX - *lengthptr < code - last_code)
2464     {
2465     *errorcodeptr = ERR20;
2466     goto FAILED;
2467     }
2468    
2469 nigel 93 *lengthptr += code - last_code;
2470     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2471    
2472     /* If "previous" is set and it is not at the start of the work space, move
2473     it back to there, in order to avoid filling up the work space. Otherwise,
2474     if "previous" is NULL, reset the current code pointer to the start. */
2475    
2476     if (previous != NULL)
2477     {
2478     if (previous > orig_code)
2479     {
2480     memmove(orig_code, previous, code - previous);
2481     code -= previous - orig_code;
2482     previous = orig_code;
2483     }
2484     }
2485     else code = orig_code;
2486    
2487     /* Remember where this code item starts so we can pick up the length
2488     next time round. */
2489    
2490     last_code = code;
2491     }
2492    
2493     /* In the real compile phase, just check the workspace used by the forward
2494     reference list. */
2495    
2496     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2497     {
2498     *errorcodeptr = ERR52;
2499     goto FAILED;
2500     }
2501    
2502 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2503    
2504     if (inescq && c != 0)
2505     {
2506     if (c == '\\' && ptr[1] == 'E')
2507     {
2508     inescq = FALSE;
2509     ptr++;
2510     continue;
2511     }
2512     else
2513     {
2514     if (previous_callout != NULL)
2515     {
2516 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2517     complete_callout(previous_callout, ptr, cd);
2518 nigel 77 previous_callout = NULL;
2519     }
2520     if ((options & PCRE_AUTO_CALLOUT) != 0)
2521     {
2522     previous_callout = code;
2523     code = auto_callout(code, ptr, cd);
2524     }
2525     goto NORMAL_CHAR;
2526     }
2527     }
2528    
2529     /* Fill in length of a previous callout, except when the next thing is
2530     a quantifier. */
2531    
2532     is_quantifier = c == '*' || c == '+' || c == '?' ||
2533     (c == '{' && is_counted_repeat(ptr+1));
2534    
2535     if (!is_quantifier && previous_callout != NULL &&
2536     after_manual_callout-- <= 0)
2537     {
2538 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2539     complete_callout(previous_callout, ptr, cd);
2540 nigel 77 previous_callout = NULL;
2541     }
2542    
2543     /* In extended mode, skip white space and comments */
2544    
2545     if ((options & PCRE_EXTENDED) != 0)
2546     {
2547     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2548     if (c == '#')
2549     {
2550 nigel 93 while (*(++ptr) != 0)
2551 nigel 91 {
2552 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2553 nigel 91 }
2554 nigel 93 if (*ptr != 0) continue;
2555    
2556 nigel 91 /* Else fall through to handle end of string */
2557     c = 0;
2558 nigel 77 }
2559     }
2560    
2561     /* No auto callout for quantifiers. */
2562    
2563     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2564     {
2565     previous_callout = code;
2566     code = auto_callout(code, ptr, cd);
2567     }
2568    
2569     switch(c)
2570     {
2571 nigel 93 /* ===================================================================*/
2572     case 0: /* The branch terminates at string end */
2573     case '|': /* or | or ) */
2574 nigel 77 case ')':
2575     *firstbyteptr = firstbyte;
2576     *reqbyteptr = reqbyte;
2577     *codeptr = code;
2578     *ptrptr = ptr;
2579 nigel 93 if (lengthptr != NULL)
2580     {
2581 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2582     {
2583     *errorcodeptr = ERR20;
2584     goto FAILED;
2585     }
2586 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2587     DPRINTF((">> end branch\n"));
2588     }
2589 nigel 77 return TRUE;
2590    
2591 nigel 93
2592     /* ===================================================================*/
2593 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2594     the setting of any following char as a first character. */
2595    
2596     case '^':
2597     if ((options & PCRE_MULTILINE) != 0)
2598     {
2599     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2600     }
2601     previous = NULL;
2602     *code++ = OP_CIRC;
2603     break;
2604    
2605     case '$':
2606     previous = NULL;
2607     *code++ = OP_DOLL;
2608     break;
2609    
2610     /* There can never be a first char if '.' is first, whatever happens about
2611     repeats. The value of reqbyte doesn't change either. */
2612    
2613     case '.':
2614     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2615     zerofirstbyte = firstbyte;
2616     zeroreqbyte = reqbyte;
2617     previous = code;
2618     *code++ = OP_ANY;
2619     break;
2620    
2621 nigel 93
2622     /* ===================================================================*/
2623 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2624     32-byte bitmap of the permitted characters, except in the special case
2625     where there is only one such character. For negated classes, we build the
2626     map as usual, then invert it at the end. However, we use a different opcode
2627     so that data characters > 255 can be handled correctly.
2628 nigel 77
2629     If the class contains characters outside the 0-255 range, a different
2630     opcode is compiled. It may optionally have a bit map for characters < 256,
2631     but those above are explicitly listed afterwards. A flag byte tells
2632     whether the bitmap is present, and whether this is a negated class or not.
2633     */
2634    
2635     case '[':
2636     previous = code;
2637    
2638     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2639     they are encountered at the top level, so we'll do that too. */
2640    
2641     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2642 ph10 295 check_posix_syntax(ptr, &tempptr))
2643 nigel 77 {
2644     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2645     goto FAILED;
2646     }
2647    
2648 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2649 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2650 ph10 205 skip them too. This makes for compatibility with Perl. */
2651 ph10 208
2652 ph10 205 negate_class = FALSE;
2653     for (;;)
2654 nigel 77 {
2655     c = *(++ptr);
2656 ph10 205 if (c == '\\')
2657     {
2658 ph10 208 if (ptr[1] == 'E') ptr++;
2659 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2660 ph10 208 else break;
2661 ph10 205 }
2662     else if (!negate_class && c == '^')
2663     negate_class = TRUE;
2664     else break;
2665 ph10 208 }
2666 nigel 77
2667 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2668     negation flag at the end, so that support for characters > 255 works
2669 ph10 264 correctly (they are all included in the class). */
2670    
2671     should_flip_negation = FALSE;
2672    
2673 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2674 nigel 93 of just a single character (as long as it's < 256). However, for higher
2675     valued UTF-8 characters, we don't yet do any optimization. */
2676 nigel 77
2677     class_charcount = 0;
2678     class_lastchar = -1;
2679    
2680 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2681     temporary bit of memory, in case the class contains only 1 character (less
2682     than 256), because in that case the compiled code doesn't use the bit map.
2683     */
2684    
2685     memset(classbits, 0, 32 * sizeof(uschar));
2686    
2687 nigel 77 #ifdef SUPPORT_UTF8
2688     class_utf8 = FALSE; /* No chars >= 256 */
2689 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2690 nigel 77 #endif
2691    
2692     /* Process characters until ] is reached. By writing this as a "do" it
2693 nigel 93 means that an initial ] is taken as a data character. At the start of the
2694     loop, c contains the first byte of the character. */
2695 nigel 77
2696 nigel 93 if (c != 0) do
2697 nigel 77 {
2698 nigel 93 const uschar *oldptr;
2699    
2700 nigel 77 #ifdef SUPPORT_UTF8
2701     if (utf8 && c > 127)
2702     { /* Braces are required because the */
2703     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2704     }
2705     #endif
2706    
2707     /* Inside \Q...\E everything is literal except \E */
2708    
2709     if (inescq)
2710     {
2711 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2712 nigel 77 {
2713 nigel 93 inescq = FALSE; /* Reset literal state */
2714     ptr++; /* Skip the 'E' */
2715     continue; /* Carry on with next */
2716 nigel 77 }
2717 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2718 nigel 77 }
2719    
2720     /* Handle POSIX class names. Perl allows a negation extension of the
2721     form [:^name:]. A square bracket that doesn't match the syntax is
2722     treated as a literal. We also recognize the POSIX constructions
2723     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2724     5.6 and 5.8 do. */
2725    
2726     if (c == '[' &&
2727     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2728 ph10 295 check_posix_syntax(ptr, &tempptr))
2729 nigel 77 {
2730     BOOL local_negate = FALSE;
2731 nigel 87 int posix_class, taboffset, tabopt;
2732 nigel 77 register const uschar *cbits = cd->cbits;
2733 nigel 87 uschar pbits[32];
2734 nigel 77
2735     if (ptr[1] != ':')
2736     {
2737     *errorcodeptr = ERR31;
2738     goto FAILED;
2739     }
2740    
2741     ptr += 2;
2742     if (*ptr == '^')
2743     {
2744     local_negate = TRUE;
2745 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2746 nigel 77 ptr++;
2747     }
2748    
2749     posix_class = check_posix_name(ptr, tempptr - ptr);
2750     if (posix_class < 0)
2751     {
2752     *errorcodeptr = ERR30;
2753     goto FAILED;
2754     }
2755    
2756     /* If matching is caseless, upper and lower are converted to
2757     alpha. This relies on the fact that the class table starts with
2758     alpha, lower, upper as the first 3 entries. */
2759    
2760     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2761     posix_class = 0;
2762    
2763 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2764     because we may be adding and subtracting from it, and we don't want to
2765     subtract bits that may be in the main map already. At the end we or the
2766     result into the bit map that is being built. */
2767 nigel 77
2768     posix_class *= 3;
2769 nigel 87
2770     /* Copy in the first table (always present) */
2771    
2772     memcpy(pbits, cbits + posix_class_maps[posix_class],
2773     32 * sizeof(uschar));
2774    
2775     /* If there is a second table, add or remove it as required. */
2776    
2777     taboffset = posix_class_maps[posix_class + 1];
2778     tabopt = posix_class_maps[posix_class + 2];
2779    
2780     if (taboffset >= 0)
2781 nigel 77 {
2782 nigel 87 if (tabopt >= 0)
2783     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2784 nigel 77 else
2785 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2786 nigel 77 }
2787    
2788 nigel 87 /* Now see if we need to remove any special characters. An option
2789     value of 1 removes vertical space and 2 removes underscore. */
2790    
2791     if (tabopt < 0) tabopt = -tabopt;
2792     if (tabopt == 1) pbits[1] &= ~0x3c;
2793     else if (tabopt == 2) pbits[11] &= 0x7f;
2794    
2795     /* Add the POSIX table or its complement into the main table that is
2796     being built and we are done. */
2797    
2798     if (local_negate)
2799     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2800     else
2801     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2802    
2803 nigel 77 ptr = tempptr + 1;
2804     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2805     continue; /* End of POSIX syntax handling */
2806     }
2807    
2808     /* Backslash may introduce a single character, or it may introduce one
2809 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2810     case. Inside a class (and only there) it is treated as backspace.
2811     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2812 ph10 205 to 'or' into the one we are building. We assume they have more than one
2813 nigel 77 character in them, so set class_charcount bigger than one. */
2814    
2815     if (c == '\\')
2816     {
2817 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2818     if (*errorcodeptr != 0) goto FAILED;
2819 nigel 77
2820 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2821 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2822 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2823 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2824     {
2825     if (ptr[1] == '\\' && ptr[2] == 'E')
2826     {
2827     ptr += 2; /* avoid empty string */
2828     }
2829     else inescq = TRUE;
2830     continue;
2831     }
2832 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2833 nigel 77
2834     if (c < 0)
2835     {
2836     register const uschar *cbits = cd->cbits;
2837     class_charcount += 2; /* Greater than 1 is what matters */
2838 nigel 93
2839     /* Save time by not doing this in the pre-compile phase. */
2840    
2841     if (lengthptr == NULL) switch (-c)
2842 nigel 77 {
2843     case ESC_d:
2844     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2845     continue;
2846    
2847     case ESC_D:
2848 ph10 286 should_flip_negation = TRUE;
2849 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2850     continue;
2851    
2852     case ESC_w:
2853     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2854     continue;
2855    
2856     case ESC_W:
2857 ph10 286 should_flip_negation = TRUE;
2858 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2859     continue;
2860    
2861     case ESC_s:
2862     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2863     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2864     continue;
2865    
2866     case ESC_S:
2867 ph10 286 should_flip_negation = TRUE;
2868 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2869     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2870     continue;
2871    
2872 nigel 93 default: /* Not recognized; fall through */
2873     break; /* Need "default" setting to stop compiler warning. */
2874     }
2875    
2876     /* In the pre-compile phase, just do the recognition. */
2877    
2878     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2879     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2880 ph10 180
2881 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2882     they use extra memory. */
2883 ph10 180
2884 ph10 178 if (-c == ESC_h)
2885     {
2886     SETBIT(classbits, 0x09); /* HT */
2887     SETBIT(classbits, 0x20); /* SPACE */
2888 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2889 ph10 178 #ifdef SUPPORT_UTF8
2890     if (utf8)
2891 ph10 180 {
2892 ph10 178 class_utf8 = TRUE;
2893     *class_utf8data++ = XCL_SINGLE;
2894 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2895 ph10 178 *class_utf8data++ = XCL_SINGLE;
2896 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2897     *class_utf8data++ = XCL_RANGE;
2898     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2899     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2900 ph10 178 *class_utf8data++ = XCL_SINGLE;
2901 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2902 ph10 178 *class_utf8data++ = XCL_SINGLE;
2903 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2904 ph10 178 *class_utf8data++ = XCL_SINGLE;
2905 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2906     }
2907     #endif
2908     continue;
2909     }
2910 nigel 93
2911 ph10 178 if (-c == ESC_H)
2912     {
2913     for (c = 0; c < 32; c++)
2914     {
2915     int x = 0xff;
2916     switch (c)
2917 ph10 180 {
2918 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2919     case 0x20/8: x ^= 1 << (0x20%8); break;
2920     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2921     default: break;
2922     }
2923     classbits[c] |= x;
2924 ph10 180 }
2925    
2926 ph10 178 #ifdef SUPPORT_UTF8
2927     if (utf8)
2928 ph10 180 {
2929 ph10 178 class_utf8 = TRUE;
2930 ph10 180 *class_utf8data++ = XCL_RANGE;
2931     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2932     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2933     *class_utf8data++ = XCL_RANGE;
2934     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2935     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2936     *class_utf8data++ = XCL_RANGE;
2937     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2938     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2939     *class_utf8data++ = XCL_RANGE;
2940     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2941     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2942     *class_utf8data++ = XCL_RANGE;
2943     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2944     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2945     *class_utf8data++ = XCL_RANGE;
2946     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2947     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2948     *class_utf8data++ = XCL_RANGE;
2949     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2950     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2951     }
2952     #endif
2953     continue;
2954     }
2955 ph10 178
2956     if (-c == ESC_v)
2957     {
2958     SETBIT(classbits, 0x0a); /* LF */
2959     SETBIT(classbits, 0x0b); /* VT */
2960 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2961     SETBIT(classbits, 0x0d); /* CR */
2962     SETBIT(classbits, 0x85); /* NEL */
2963 ph10 178 #ifdef SUPPORT_UTF8
2964     if (utf8)
2965 ph10 180 {
2966 ph10 178 class_utf8 = TRUE;
2967 ph10 180 *class_utf8data++ = XCL_RANGE;
2968     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2969     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2970     }
2971     #endif
2972     continue;
2973     }
2974 ph10 178
2975     if (-c == ESC_V)
2976     {
2977     for (c = 0; c < 32; c++)
2978     {
2979     int x = 0xff;
2980     switch (c)
2981 ph10 180 {
2982 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2983     x ^= 1 << (0x0b%8);
2984     x ^= 1 << (0x0c%8);
2985 ph10 180 x ^= 1 << (0x0d%8);
2986 ph10 178 break;
2987     case 0x85/8: x ^= 1 << (0x85%8); break;
2988     default: break;
2989     }
2990     classbits[c] |= x;
2991 ph10 180 }
2992    
2993 ph10 178 #ifdef SUPPORT_UTF8
2994     if (utf8)
2995 ph10 180 {
2996 ph10 178 class_utf8 = TRUE;
2997 ph10 180 *class_utf8data++ = XCL_RANGE;
2998     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2999     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3000     *class_utf8data++ = XCL_RANGE;
3001     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3002     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3003     }
3004     #endif
3005     continue;
3006     }
3007 ph10 178
3008 nigel 93 /* We need to deal with \P and \p in both phases. */
3009    
3010 nigel 77 #ifdef SUPPORT_UCP
3011 nigel 93 if (-c == ESC_p || -c == ESC_P)
3012     {
3013     BOOL negated;
3014     int pdata;
3015     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3016     if (ptype < 0) goto FAILED;
3017     class_utf8 = TRUE;
3018     *class_utf8data++ = ((-c == ESC_p) != negated)?
3019     XCL_PROP : XCL_NOTPROP;
3020     *class_utf8data++ = ptype;
3021     *class_utf8data++ = pdata;
3022     class_charcount -= 2; /* Not a < 256 character */
3023 nigel 77 continue;
3024 nigel 93 }
3025 nigel 77 #endif
3026 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3027     strict mode. By default, for compatibility with Perl, they are
3028     treated as literals. */
3029 nigel 77
3030 nigel 93 if ((options & PCRE_EXTRA) != 0)
3031     {
3032     *errorcodeptr = ERR7;
3033     goto FAILED;
3034     }
3035 nigel 77
3036 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3037     c = *ptr; /* Get the final character and fall through */
3038 nigel 77 }
3039    
3040     /* Fall through if we have a single character (c >= 0). This may be
3041 nigel 93 greater than 256 in UTF-8 mode. */
3042 nigel 77
3043     } /* End of backslash handling */
3044    
3045     /* A single character may be followed by '-' to form a range. However,
3046     Perl does not permit ']' to be the end of the range. A '-' character
3047 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3048     entirely. The code for handling \Q and \E is messy. */
3049 nigel 77
3050 nigel 93 CHECK_RANGE:
3051     while (ptr[1] == '\\' && ptr[2] == 'E')
3052 nigel 77 {
3053 nigel 93 inescq = FALSE;
3054     ptr += 2;
3055     }
3056    
3057     oldptr = ptr;
3058 ph10 231
3059 ph10 230 /* Remember \r or \n */
3060 ph10 231
3061     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3062    
3063 ph10 230 /* Check for range */
3064 nigel 93
3065     if (!inescq && ptr[1] == '-')
3066     {
3067 nigel 77 int d;
3068     ptr += 2;
3069 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3070 nigel 77
3071 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3072     mode. */
3073    
3074     while (*ptr == '\\' && ptr[1] == 'Q')
3075     {
3076     ptr += 2;
3077     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3078     inescq = TRUE;
3079     break;
3080     }
3081    
3082     if (*ptr == 0 || (!inescq && *ptr == ']'))
3083     {
3084     ptr = oldptr;
3085     goto LONE_SINGLE_CHARACTER;
3086     }
3087    
3088 nigel 77 #ifdef SUPPORT_UTF8
3089     if (utf8)
3090     { /* Braces are required because the */
3091     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3092     }
3093     else
3094     #endif
3095     d = *ptr; /* Not UTF-8 mode */
3096    
3097     /* The second part of a range can be a single-character escape, but
3098     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3099     in such circumstances. */
3100    
3101 nigel 93 if (!inescq && d == '\\')
3102 nigel 77 {
3103 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3104     if (*errorcodeptr != 0) goto FAILED;
3105 nigel 77
3106 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3107 nigel 93 special means the '-' was literal */
3108 nigel 77
3109     if (d < 0)
3110     {
3111     if (d == -ESC_b) d = '\b';
3112 nigel 93 else if (d == -ESC_X) d = 'X';
3113     else if (d == -ESC_R) d = 'R'; else
3114 nigel 77 {
3115 nigel 93 ptr = oldptr;
3116 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3117     }
3118     }
3119     }
3120    
3121 nigel 93 /* Check that the two values are in the correct order. Optimize
3122     one-character ranges */
3123 nigel 77
3124 nigel 93 if (d < c)
3125     {
3126     *errorcodeptr = ERR8;
3127     goto FAILED;
3128     }
3129    
3130 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3131    
3132 ph10 230 /* Remember \r or \n */
3133 ph10 231
3134     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3135    
3136 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3137     matching, we have to use an XCLASS with extra data items. Caseless
3138     matching for characters > 127 is available only if UCP support is
3139     available. */
3140    
3141     #ifdef SUPPORT_UTF8
3142     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3143     {
3144     class_utf8 = TRUE;
3145    
3146     /* With UCP support, we can find the other case equivalents of
3147     the relevant characters. There may be several ranges. Optimize how
3148     they fit with the basic range. */
3149    
3150     #ifdef SUPPORT_UCP
3151     if ((options & PCRE_CASELESS) != 0)
3152     {
3153 nigel 93 unsigned int occ, ocd;
3154     unsigned int cc = c;
3155     unsigned int origd = d;
3156 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3157     {
3158 ph10 180 if (occ >= (unsigned int)c &&
3159     ocd <= (unsigned int)d)
3160 ph10 176 continue; /* Skip embedded ranges */
3161 nigel 77
3162 ph10 180 if (occ < (unsigned int)c &&
3163 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3164 nigel 77 { /* if there is overlap, */
3165     c = occ; /* noting that if occ < c */
3166     continue; /* we can't have ocd > d */
3167     } /* because a subrange is */
3168 ph10 180 if (ocd > (unsigned int)d &&
3169 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3170 nigel 77 { /* the basic range. */
3171     d = ocd;
3172     continue;
3173     }
3174    
3175     if (occ == ocd)
3176     {
3177     *class_utf8data++ = XCL_SINGLE;
3178     }
3179     else
3180     {
3181     *class_utf8data++ = XCL_RANGE;
3182     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3183     }
3184     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3185     }
3186     }
3187     #endif /* SUPPORT_UCP */
3188    
3189     /* Now record the original range, possibly modified for UCP caseless
3190     overlapping ranges. */
3191    
3192     *class_utf8data++ = XCL_RANGE;
3193     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3194     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3195    
3196     /* With UCP support, we are done. Without UCP support, there is no
3197     caseless matching for UTF-8 characters > 127; we can use the bit map
3198     for the smaller ones. */
3199    
3200     #ifdef SUPPORT_UCP
3201     continue; /* With next character in the class */
3202     #else
3203     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3204    
3205     /* Adjust upper limit and fall through to set up the map */
3206    
3207     d = 127;
3208    
3209     #endif /* SUPPORT_UCP */
3210     }
3211     #endif /* SUPPORT_UTF8 */
3212    
3213     /* We use the bit map for all cases when not in UTF-8 mode; else
3214     ranges that lie entirely within 0-127 when there is UCP support; else
3215     for partial ranges without UCP support. */
3216    
3217 nigel 93 class_charcount += d - c + 1;
3218     class_lastchar = d;
3219    
3220     /* We can save a bit of time by skipping this in the pre-compile. */
3221    
3222     if (lengthptr == NULL) for (; c <= d; c++)
3223 nigel 77 {
3224     classbits[c/8] |= (1 << (c&7));
3225     if ((options & PCRE_CASELESS) != 0)
3226     {
3227     int uc = cd->fcc[c]; /* flip case */
3228     classbits[uc/8] |= (1 << (uc&7));
3229     }
3230     }
3231    
3232     continue; /* Go get the next char in the class */
3233     }
3234    
3235     /* Handle a lone single character - we can get here for a normal
3236     non-escape char, or after \ that introduces a single character or for an
3237     apparent range that isn't. */
3238    
3239     LONE_SINGLE_CHARACTER:
3240 ph10 231
3241 nigel 77 /* Handle a character that cannot go in the bit map */
3242    
3243     #ifdef SUPPORT_UTF8
3244     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3245     {
3246     class_utf8 = TRUE;
3247     *class_utf8data++ = XCL_SINGLE;
3248     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3249    
3250     #ifdef SUPPORT_UCP
3251     if ((options & PCRE_CASELESS) != 0)
3252     {
3253 nigel 93 unsigned int othercase;
3254     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3255 nigel 77 {
3256     *class_utf8data++ = XCL_SINGLE;
3257     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3258     }
3259     }
3260     #endif /* SUPPORT_UCP */
3261    
3262     }
3263     else
3264     #endif /* SUPPORT_UTF8 */
3265    
3266     /* Handle a single-byte character */
3267     {
3268     classbits[c/8] |= (1 << (c&7));
3269     if ((options & PCRE_CASELESS) != 0)
3270     {
3271     c = cd->fcc[c]; /* flip case */
3272     classbits[c/8] |= (1 << (c&7));
3273     }
3274     class_charcount++;
3275     class_lastchar = c;
3276     }
3277     }
3278    
3279 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3280 nigel 77
3281 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3282 nigel 77
3283 nigel 93 if (c == 0) /* Missing terminating ']' */
3284     {
3285     *errorcodeptr = ERR6;
3286     goto FAILED;
3287     }
3288 ph10 231
3289    
3290 ph10 230 /* This code has been disabled because it would mean that \s counts as
3291     an explicit \r or \n reference, and that's not really what is wanted. Now
3292     we set the flag only if there is a literal "\r" or "\n" in the class. */
3293 ph10 227
3294 ph10 230 #if 0
3295 ph10 226 /* Remember whether \r or \n are in this class */
3296 ph10 227
3297 ph10 226 if (negate_class)
3298     {
3299 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3300 ph10 226 }
3301     else
3302     {
3303 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3304 ph10 227 }
3305 ph10 230 #endif
3306 ph10 227
3307 ph10 231
3308 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3309 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3310     use of \p or \P, in other words, no use of any XCLASS features, we can
3311     optimize.
3312    
3313 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3314     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3315     operate on single-bytes only. This is an historical hangover. Maybe one day
3316     we can tidy these opcodes to handle multi-byte characters.
3317 nigel 77
3318     The optimization throws away the bit map. We turn the item into a
3319     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3320     that OP_NOT does not support multibyte characters. In the positive case, it
3321     can cause firstbyte to be set. Otherwise, there can be no first char if
3322     this item is first, whatever repeat count may follow. In the case of
3323     reqbyte, save the previous value for reinstating. */
3324    
3325     #ifdef SUPPORT_UTF8
3326 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3327 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3328 nigel 77 #else
3329     if (class_charcount == 1)
3330     #endif
3331     {
3332     zeroreqbyte = reqbyte;
3333    
3334     /* The OP_NOT opcode works on one-byte characters only. */
3335    
3336     if (negate_class)
3337     {
3338     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3339     zerofirstbyte = firstbyte;
3340     *code++ = OP_NOT;
3341     *code++ = class_lastchar;
3342     break;
3343     }
3344    
3345     /* For a single, positive character, get the value into mcbuffer, and
3346     then we can handle this with the normal one-character code. */
3347    
3348     #ifdef SUPPORT_UTF8
3349     if (utf8 && class_lastchar > 127)
3350     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3351     else
3352     #endif
3353     {
3354     mcbuffer[0] = class_lastchar;
3355     mclength = 1;
3356     }
3357     goto ONE_CHAR;
3358     } /* End of 1-char optimization */
3359    
3360     /* The general case - not the one-char optimization. If this is the first
3361     thing in the branch, there can be no first char setting, whatever the
3362     repeat count. Any reqbyte setting must remain unchanged after any kind of
3363     repeat. */
3364    
3365     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3366     zerofirstbyte = firstbyte;
3367     zeroreqbyte = reqbyte;
3368    
3369     /* If there are characters with values > 255, we have to compile an
3370 ph10 286 extended class, with its own opcode, unless there was a negated special
3371     such as \S in the class, because in that case all characters > 255 are in
3372     the class, so any that were explicitly given as well can be ignored. If
3373 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3374     characters < 256, we can omit the bitmap in the actual compiled code. */
3375 nigel 77
3376     #ifdef SUPPORT_UTF8
3377 ph10 264 if (class_utf8 && !should_flip_negation)
3378 nigel 77 {
3379     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3380     *code++ = OP_XCLASS;
3381     code += LINK_SIZE;
3382     *code = negate_class? XCL_NOT : 0;
3383    
3384 nigel 93 /* If the map is required, move up the extra data to make room for it;
3385     otherwise just move the code pointer to the end of the extra data. */
3386 nigel 77
3387     if (class_charcount > 0)
3388     {
3389     *code++ |= XCL_MAP;
3390 nigel 93 memmove(code + 32, code, class_utf8data - code);
3391 nigel 77 memcpy(code, classbits, 32);
3392 nigel 93 code = class_utf8data + 32;
3393 nigel 77 }
3394 nigel 93 else code = class_utf8data;
3395 nigel 77
3396     /* Now fill in the complete length of the item */
3397    
3398     PUT(previous, 1, code - previous);
3399     break; /* End of class handling */
3400     }
3401     #endif
3402    
3403 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3404     OP_NCLASS, depending on whether the whole class was negated and whether
3405     there were negative specials such as \S in the class. Then copy the 32-byte
3406 ph10 264 map into the code vector, negating it if necessary. */
3407 ph10 286
3408 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3409 nigel 77 if (negate_class)
3410     {
3411 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3412     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3413 nigel 77 }
3414     else
3415     {
3416     memcpy(code, classbits, 32);
3417     }
3418     code += 32;
3419     break;
3420    
3421 nigel 93
3422     /* ===================================================================*/
3423 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3424     has been tested above. */
3425    
3426     case '{':
3427     if (!is_quantifier) goto NORMAL_CHAR;
3428     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3429     if (*errorcodeptr != 0) goto FAILED;
3430     goto REPEAT;
3431    
3432     case '*':
3433     repeat_min = 0;
3434     repeat_max = -1;
3435     goto REPEAT;
3436    
3437     case '+':
3438     repeat_min = 1;
3439     repeat_max = -1;
3440     goto REPEAT;
3441    
3442     case '?':
3443     repeat_min = 0;
3444     repeat_max = 1;
3445    
3446     REPEAT:
3447     if (previous == NULL)
3448     {
3449     *errorcodeptr = ERR9;
3450     goto FAILED;
3451     }
3452    
3453     if (repeat_min == 0)
3454     {
3455     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3456     reqbyte = zeroreqbyte; /* Ditto */
3457     }
3458    
3459     /* Remember whether this is a variable length repeat */
3460    
3461     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3462    
3463     op_type = 0; /* Default single-char op codes */
3464     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3465    
3466     /* Save start of previous item, in case we have to move it up to make space
3467     for an inserted OP_ONCE for the additional '+' extension. */
3468    
3469     tempcode = previous;
3470    
3471     /* If the next character is '+', we have a possessive quantifier. This
3472     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3473     If the next character is '?' this is a minimizing repeat, by default,
3474     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3475     repeat type to the non-default. */
3476    
3477     if (ptr[1] == '+')
3478     {
3479     repeat_type = 0; /* Force greedy */
3480     possessive_quantifier = TRUE;
3481     ptr++;
3482     }
3483     else if (ptr[1] == '?')
3484     {
3485     repeat_type = greedy_non_default;
3486     ptr++;
3487     }
3488     else repeat_type = greedy_default;
3489    
3490     /* If previous was a character match, abolish the item and generate a
3491     repeat item instead. If a char item has a minimum of more than one, ensure
3492     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3493     the first thing in a branch because the x will have gone into firstbyte
3494     instead. */
3495    
3496     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3497     {
3498     /* Deal with UTF-8 characters that take up more than one byte. It's
3499     easier to write this out separately than try to macrify it. Use c to
3500     hold the length of the character in bytes, plus 0x80 to flag that it's a
3501     length rather than a small character. */
3502    
3503     #ifdef SUPPORT_UTF8
3504     if (utf8 && (code[-1] & 0x80) != 0)
3505     {
3506     uschar *lastchar = code - 1;
3507     while((*lastchar & 0xc0) == 0x80) lastchar--;
3508     c = code - lastchar; /* Length of UTF-8 character */
3509     memcpy(utf8_char, lastchar, c); /* Save the char */
3510     c |= 0x80; /* Flag c as a length */
3511     }
3512     else
3513     #endif
3514    
3515     /* Handle the case of a single byte - either with no UTF8 support, or
3516     with UTF-8 disabled, or for a UTF-8 character < 128. */
3517    
3518     {
3519     c = code[-1];
3520     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3521     }
3522    
3523 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3524     the line is something that cannot possibly match this character. If so,
3525     automatically possessifying this item gains some performance in the case
3526     where the match fails. */
3527    
3528     if (!possessive_quantifier &&
3529     repeat_max < 0 &&
3530     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3531     options, cd))
3532     {
3533     repeat_type = 0; /* Force greedy */
3534     possessive_quantifier = TRUE;
3535     }
3536    
3537 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3538     }
3539    
3540     /* If previous was a single negated character ([^a] or similar), we use
3541     one of the special opcodes, replacing it. The code is shared with single-
3542     character repeats by setting op_type to add a suitable offset into
3543 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3544     currently used only for single-byte chars. */
3545 nigel 77
3546     else if (*previous == OP_NOT)
3547     {
3548     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3549     c = previous[1];
3550 nigel 93 if (!possessive_quantifier &&
3551     repeat_max < 0 &&
3552     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3553     {
3554     repeat_type = 0; /* Force greedy */
3555     possessive_quantifier = TRUE;
3556     }
3557 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3558     }
3559    
3560     /* If previous was a character type match (\d or similar), abolish it and
3561     create a suitable repeat item. The code is shared with single-character
3562     repeats by setting op_type to add a suitable offset into repeat_type. Note
3563     that the Unicode property types will be present only when SUPPORT_UCP is
3564     defined, but we don't wrap the little bits of code here because it just
3565     makes it horribly messy. */
3566    
3567     else if (*previous < OP_EODN)
3568     {
3569     uschar *oldcode;
3570 nigel 87 int prop_type, prop_value;
3571 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3572     c = *previous;
3573    
3574 nigel 93 if (!possessive_quantifier &&
3575     repeat_max < 0 &&
3576     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3577     {
3578     repeat_type = 0; /* Force greedy */
3579     possessive_quantifier = TRUE;
3580     }
3581    
3582 nigel 77 OUTPUT_SINGLE_REPEAT:
3583 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3584     {
3585     prop_type = previous[1];
3586     prop_value = previous[2];
3587     }
3588     else prop_type = prop_value = -1;
3589 nigel 77
3590     oldcode = code;
3591     code = previous; /* Usually overwrite previous item */
3592    
3593     /* If the maximum is zero then the minimum must also be zero; Perl allows
3594     this case, so we do too - by simply omitting the item altogether. */
3595    
3596     if (repeat_max == 0) goto END_REPEAT;
3597    
3598     /* All real repeats make it impossible to handle partial matching (maybe
3599     one day we will be able to remove this restriction). */
3600    
3601 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3602 nigel 77
3603     /* Combine the op_type with the repeat_type */
3604    
3605     repeat_type += op_type;
3606    
3607     /* A minimum of zero is handled either as the special case * or ?, or as
3608     an UPTO, with the maximum given. */
3609    
3610     if (repeat_min == 0)
3611     {
3612     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3613     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3614     else
3615     {
3616     *code++ = OP_UPTO + repeat_type;
3617     PUT2INC(code, 0, repeat_max);
3618     }
3619     }
3620    
3621     /* A repeat minimum of 1 is optimized into some special cases. If the
3622 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3623 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3624     one less than the maximum. */
3625    
3626     else if (repeat_min == 1)
3627     {
3628     if (repeat_max == -1)
3629     *code++ = OP_PLUS + repeat_type;
3630     else
3631     {
3632     code = oldcode; /* leave previous item in place */
3633     if (repeat_max == 1) goto END_REPEAT;
3634     *code++ = OP_UPTO + repeat_type;
3635     PUT2INC(code, 0, repeat_max - 1);
3636     }
3637     }
3638    
3639     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3640     handled as an EXACT followed by an UPTO. */
3641    
3642     else
3643     {
3644     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3645     PUT2INC(code, 0, repeat_min);
3646    
3647     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3648     we have to insert the character for the previous code. For a repeated
3649 nigel 87 Unicode property match, there are two extra bytes that define the
3650 nigel 77 required property. In UTF-8 mode, long characters have their length in
3651     c, with the 0x80 bit as a flag. */
3652    
3653     if (repeat_max < 0)
3654     {
3655     #ifdef SUPPORT_UTF8
3656     if (utf8 && c >= 128)
3657     {
3658     memcpy(code, utf8_char, c & 7);
3659     code += c & 7;
3660     }
3661     else
3662     #endif
3663     {
3664     *code++ = c;
3665 nigel 87 if (prop_type >= 0)
3666     {
3667     *code++ = prop_type;
3668     *code++ = prop_value;
3669     }
3670 nigel 77 }
3671     *code++ = OP_STAR + repeat_type;
3672     }
3673    
3674     /* Else insert an UPTO if the max is greater than the min, again
3675 nigel 93 preceded by the character, for the previously inserted code. If the
3676     UPTO is just for 1 instance, we can use QUERY instead. */
3677 nigel 77
3678     else if (repeat_max != repeat_min)
3679     {
3680     #ifdef SUPPORT_UTF8
3681     if (utf8 && c >= 128)
3682     {
3683     memcpy(code, utf8_char, c & 7);
3684     code += c & 7;
3685     }
3686     else
3687     #endif
3688     *code++ = c;
3689 nigel 87 if (prop_type >= 0)
3690     {
3691     *code++ = prop_type;
3692     *code++ = prop_value;
3693     }
3694 nigel 77 repeat_max -= repeat_min;
3695 nigel 93
3696     if (repeat_max == 1)
3697     {
3698     *code++ = OP_QUERY + repeat_type;
3699     }
3700     else
3701     {
3702     *code++ = OP_UPTO + repeat_type;
3703     PUT2INC(code, 0, repeat_max);
3704     }
3705 nigel 77 }
3706     }
3707    
3708     /* The character or character type itself comes last in all cases. */
3709    
3710     #ifdef SUPPORT_UTF8
3711     if (utf8 && c >= 128)
3712     {
3713     memcpy(code, utf8_char, c & 7);
3714     code += c & 7;
3715     }
3716     else
3717     #endif
3718     *code++ = c;
3719    
3720 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3721     define the required property. */
3722 nigel 77
3723     #ifdef SUPPORT_UCP
3724 nigel 87 if (prop_type >= 0)
3725     {
3726     *code++ = prop_type;
3727     *code++ = prop_value;
3728     }
3729 nigel 77 #endif
3730     }
3731    
3732     /* If previous was a character class or a back reference, we put the repeat
3733     stuff after it, but just skip the item if the repeat was {0,0}. */
3734    
3735     else if (*previous == OP_CLASS ||
3736     *previous == OP_NCLASS ||
3737     #ifdef SUPPORT_UTF8
3738     *previous == OP_XCLASS ||
3739     #endif
3740     *previous == OP_REF)
3741     {
3742     if (repeat_max == 0)
3743     {
3744     code = previous;
3745     goto END_REPEAT;
3746     }
3747    
3748     /* All real repeats make it impossible to handle partial matching (maybe
3749     one day we will be able to remove this restriction). */
3750    
3751 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3752 nigel 77
3753     if (repeat_min == 0 && repeat_max == -1)
3754     *code++ = OP_CRSTAR + repeat_type;
3755     else if (repeat_min == 1 && repeat_max == -1)
3756     *code++ = OP_CRPLUS + repeat_type;
3757     else if (repeat_min == 0 && repeat_max == 1)
3758     *code++ = OP_CRQUERY + repeat_type;
3759     else
3760     {
3761     *code++ = OP_CRRANGE + repeat_type;
3762     PUT2INC(code, 0, repeat_min);
3763     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3764     PUT2INC(code, 0, repeat_max);
3765     }
3766     }
3767    
3768     /* If previous was a bracket group, we may have to replicate it in certain
3769     cases. */
3770    
3771 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3772     *previous == OP_ONCE || *previous == OP_COND)
3773 nigel 77 {
3774     register int i;
3775     int ketoffset = 0;
3776     int len = code - previous;
3777     uschar *bralink = NULL;
3778    
3779 nigel 93 /* Repeating a DEFINE group is pointless */
3780    
3781     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3782     {
3783     *errorcodeptr = ERR55;
3784     goto FAILED;
3785     }
3786    
3787 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3788     by scanning through from the start, and compute the offset back to it
3789     from the current code pointer. There may be an OP_OPT setting following
3790     the final KET, so we can't find the end just by going back from the code
3791     pointer. */
3792    
3793     if (repeat_max == -1)
3794     {
3795     register uschar *ket = previous;
3796     do ket += GET(ket, 1); while (*ket != OP_KET);
3797     ketoffset = code - ket;
3798     }
3799    
3800     /* The case of a zero minimum is special because of the need to stick
3801     OP_BRAZERO in front of it, and because the group appears once in the
3802     data, whereas in other cases it appears the minimum number of times. For
3803     this reason, it is simplest to treat this case separately, as otherwise
3804     the code gets far too messy. There are several special subcases when the
3805     minimum is zero. */
3806    
3807     if (repeat_min == 0)
3808     {
3809     /* If the maximum is also zero, we just omit the group from the output
3810     altogether. */
3811    
3812     if (repeat_max == 0)
3813     {
3814     code = previous;
3815     goto END_REPEAT;
3816     }
3817    
3818     /* If the maximum is 1 or unlimited, we just have to stick in the
3819     BRAZERO and do no more at this point. However, we do need to adjust
3820     any OP_RECURSE calls inside the group that refer to the group itself or
3821 nigel 93 any internal or forward referenced group, because the offset is from
3822     the start of the whole regex. Temporarily terminate the pattern while
3823     doing this. */
3824 nigel 77
3825     if (repeat_max <= 1)
3826     {
3827     *code = OP_END;
3828 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3829 nigel 77 memmove(previous+1, previous, len);
3830     code++;
3831     *previous++ = OP_BRAZERO + repeat_type;
3832     }
3833    
3834     /* If the maximum is greater than 1 and limited, we have to replicate
3835     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3836     The first one has to be handled carefully because it's the original
3837     copy, which has to be moved up. The remainder can be handled by code
3838     that is common with the non-zero minimum case below. We have to
3839     adjust the value of repeat_max, since one less copy is required. Once
3840     again, we may have to adjust any OP_RECURSE calls inside the group. */
3841    
3842     else
3843     {
3844     int offset;
3845     *code = OP_END;
3846 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3847 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3848     code += 2 + LINK_SIZE;
3849     *previous++ = OP_BRAZERO + repeat_type;
3850     *previous++ = OP_BRA;
3851    
3852     /* We chain together the bracket offset fields that have to be
3853     filled in later when the ends of the brackets are reached. */
3854    
3855     offset = (bralink == NULL)? 0 : previous - bralink;
3856     bralink = previous;
3857     PUTINC(previous, 0, offset);
3858     }
3859    
3860     repeat_max--;
3861     }
3862    
3863     /* If the minimum is greater than zero, replicate the group as many
3864     times as necessary, and adjust the maximum to the number of subsequent
3865     copies that we need. If we set a first char from the group, and didn't
3866 nigel 93 set a required char, copy the latter from the former. If there are any
3867     forward reference subroutine calls in the group, there will be entries on
3868     the workspace list; replicate these with an appropriate increment. */
3869 nigel 77
3870     else
3871     {
3872     if (repeat_min > 1)
3873     {
3874 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3875 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3876     potential integer overflow. */
3877 nigel 93
3878     if (lengthptr != NULL)
3879 ph10 202 {
3880     int delta = (repeat_min - 1)*length_prevgroup;
3881     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3882     (double)INT_MAX ||
3883     OFLOW_MAX - *lengthptr < delta)
3884     {
3885     *errorcodeptr = ERR20;
3886     goto FAILED;
3887     }
3888     *lengthptr += delta;
3889     }
3890 nigel 93
3891     /* This is compiling for real */
3892    
3893     else
3894 nigel 77 {
3895 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3896     for (i = 1; i < repeat_min; i++)
3897     {
3898     uschar *hc;
3899     uschar *this_hwm = cd->hwm;
3900     memcpy(code, previous, len);
3901     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3902     {
3903     PUT(cd->hwm, 0, GET(hc, 0) + len);
3904     cd->hwm += LINK_SIZE;
3905     }
3906     save_hwm = this_hwm;
3907     code += len;
3908     }
3909 nigel 77 }
3910     }
3911 nigel 93
3912 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3913     }
3914    
3915     /* This code is common to both the zero and non-zero minimum cases. If
3916     the maximum is limited, it replicates the group in a nested fashion,
3917     remembering the bracket starts on a stack. In the case of a zero minimum,
3918     the first one was set up above. In all cases the repeat_max now specifies
3919 nigel 93 the number of additional copies needed. Again, we must remember to
3920     replicate entries on the forward reference list. */
3921 nigel 77
3922     if (repeat_max >= 0)
3923     {
3924 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3925     just adjust the length as if we had. For each repetition we must add 1
3926     to the length for BRAZERO and for all but the last repetition we must
3927 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3928     paranoid checks to avoid integer overflow. */
3929 nigel 93
3930     if (lengthptr != NULL && repeat_max > 0)
3931 ph10 202 {
3932     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3933     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3934     if ((double)repeat_max *
3935     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3936     > (double)INT_MAX ||
3937     OFLOW_MAX - *lengthptr < delta)
3938     {
3939     *errorcodeptr = ERR20;
3940     goto FAILED;
3941     }
3942     *lengthptr += delta;
3943     }
3944 nigel 93
3945     /* This is compiling for real */
3946    
3947     else for (i = repeat_max - 1; i >= 0; i--)
3948 nigel 77 {
3949 nigel 93 uschar *hc;
3950     uschar *this_hwm = cd->hwm;
3951    
3952 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3953    
3954     /* All but the final copy start a new nesting, maintaining the
3955     chain of brackets outstanding. */
3956    
3957     if (i != 0)
3958     {
3959     int offset;
3960     *code++ = OP_BRA;
3961     offset = (bralink == NULL)? 0 : code - bralink;
3962     bralink = code;
3963     PUTINC(code, 0, offset);
3964     }
3965    
3966     memcpy(code, previous, len);
3967 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3968     {
3969     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3970     cd->hwm += LINK_SIZE;
3971     }
3972     save_hwm = this_hwm;
3973 nigel 77 code += len;
3974     }
3975    
3976     /* Now chain through the pending brackets, and fill in their length
3977     fields (which are holding the chain links pro tem). */
3978    
3979     while (bralink != NULL)
3980     {
3981     int oldlinkoffset;
3982     int offset = code - bralink + 1;
3983     uschar *bra = code - offset;
3984     oldlinkoffset = GET(bra, 1);
3985     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3986     *code++ = OP_KET;
3987     PUTINC(code, 0, offset);
3988     PUT(bra, 1, offset);
3989     }
3990     }
3991    
3992     /* If the maximum is unlimited, set a repeater in the final copy. We
3993     can't just offset backwards from the current code point, because we
3994     don't know if there's been an options resetting after the ket. The
3995 nigel 93 correct offset was computed above.
3996 nigel 77
3997 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3998     this group is a non-atomic one that could match an empty string. If so,
3999     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4000     that runtime checking can be done. [This check is also applied to
4001     atomic groups at runtime, but in a different way.] */
4002    
4003     else
4004     {
4005     uschar *ketcode = code - ketoffset;
4006     uschar *bracode = ketcode - GET(ketcode, 1);
4007     *ketcode = OP_KETRMAX + repeat_type;
4008     if (lengthptr == NULL && *bracode != OP_ONCE)
4009     {
4010     uschar *scode = bracode;
4011     do
4012     {
4013     if (could_be_empty_branch(scode, ketcode, utf8))
4014     {
4015     *bracode += OP_SBRA - OP_BRA;
4016     break;
4017     }
4018     scode += GET(scode, 1);
4019     }
4020     while (*scode == OP_ALT);
4021     }
4022     }
4023 nigel 77 }
4024    
4025     /* Else there's some kind of shambles */
4026    
4027     else
4028     {
4029     *errorcodeptr = ERR11;
4030     goto FAILED;
4031     }
4032    
4033 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4034     tests above succeeded, possessive_quantifier is TRUE. For some of the
4035     simpler opcodes, there is a special alternative opcode for this. For
4036     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4037     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4038     but the special opcodes can optimize it a bit. The repeated item starts at
4039     tempcode, not at previous, which might be the first part of a string whose
4040     (former) last char we repeated.
4041 nigel 77
4042 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4043     an 'upto' may follow. We skip over an 'exact' item, and then test the
4044     length of what remains before proceeding. */
4045    
4046 nigel 77 if (possessive_quantifier)
4047     {
4048 nigel 93 int len;
4049     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4050     *tempcode == OP_NOTEXACT)
4051 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4052 ph10 286 ((*tempcode == OP_TYPEEXACT &&
4053     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4054 nigel 93 len = code - tempcode;
4055     if (len > 0) switch (*tempcode)
4056     {
4057     case OP_STAR: *tempcode = OP_POSSTAR; break;
4058     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4059     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4060     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4061    
4062     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4063     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4064     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4065     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4066    
4067     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4068     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4069     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4070     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4071    
4072     default:
4073     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4074     code += 1 + LINK_SIZE;
4075     len += 1 + LINK_SIZE;
4076     tempcode[0] = OP_ONCE;
4077     *code++ = OP_KET;
4078     PUTINC(code, 0, len);
4079     PUT(tempcode, 1, len);
4080     break;
4081     }
4082 nigel 77 }
4083    
4084     /* In all cases we no longer have a previous item. We also set the
4085     "follows varying string" flag for subsequently encountered reqbytes if
4086     it isn't already set and we have just passed a varying length item. */
4087    
4088     END_REPEAT:
4089     previous = NULL;
4090     cd->req_varyopt |= reqvary;
4091     break;
4092    
4093    
4094 nigel 93 /* ===================================================================*/
4095     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4096     lookbehind or option setting or condition or all the other extended
4097 ph10 210 parenthesis forms. */
4098 nigel 77
4099     case '(':
4100     newoptions = options;
4101     skipbytes = 0;
4102 nigel 93 bravalue = OP_CBRA;
4103     save_hwm = cd->hwm;
4104 ph10 180 reset_bracount = FALSE;
4105 ph10 211
4106 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
4107 ph10 211
4108 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4109     {
4110 ph10 211 int i, namelen;
4111 ph10 243 const char *vn = verbnames;
4112 ph10 210 const uschar *name = ++ptr;
4113     previous = NULL;
4114     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4115     if (*ptr == ':')
4116     {
4117     *errorcodeptr = ERR59; /* Not supported */
4118 ph10 211 goto FAILED;
4119     }
4120 ph10 210 if (*ptr != ')')
4121     {
4122     *errorcodeptr = ERR60;
4123