/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 265 - (hide annotations) (download)
Wed Nov 14 11:35:48 2007 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 194331 byte(s)
Fix negative POSIX class bug with Unicode characters.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. The first argument is a
byte array (or pointer to bytes) and the second is the bit number: bit b lives
in byte b/8 at position b%8. Both arguments and the whole expansion are
parenthesized so that expressions such as SETBIT(map + 1, n + 8) behave as
expected. Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244     "unrecognized character after (?\0"
245     "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303     "number is too big";
304 nigel 77
305    
306     /* Table to identify digits and hex digits. This is used when compiling
307     patterns. Note that the tables in chartables are dependent on the locale, and
308     may mark arbitrary characters as digits - but the PCRE compiling code expects
309     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
310     a private table here. It costs 256 bytes, but it is a lot faster than doing
311     character value tests (at least in some simple cases I timed), and in some
312     applications one wants PCRE to compile efficiently as well as match
313     efficiently.
314    
315     For convenience, we use the same bit definitions as in chartables:
316    
317     0x04 decimal digit
318     0x08 hexadecimal digit
319    
320     Then we can use ctype_digit and ctype_xdigit in the code. */
321    
322 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
323 nigel 77 static const unsigned char digitab[] =
324     {
325     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
326     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
331     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
332     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
333     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
334     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
335     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
337     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
339     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
357    
358 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
359 nigel 77 static const unsigned char digitab[] =
360     {
361     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
362     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
363     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
372 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
373 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
375     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
377     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
385     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
391     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
392     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
393    
394     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
395     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
396     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
397     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
399     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
403     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
404     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
406 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
407 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
408     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
409     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
410     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
411     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
412     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
413     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
414     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
415     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
416     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
417     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
418     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
419     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
420     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
421     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
422     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
423     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
424     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
425     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
426     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
427     #endif
428    
429    
430     /* Definition to allow mutual recursion */
431    
432     static BOOL
433 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
434 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
435 nigel 77
436    
437    
438     /*************************************************
439 ph10 240 * Find an error text *
440     *************************************************/
441    
442 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
443     some of the text is of unknown length, we can't use a table of offsets.
444     Instead, just count through the strings. This is not a performance issue
445 ph10 240 because it happens only when there has been a compilation error.
446    
447     Argument: the error number
448     Returns: pointer to the error string
449     */
450    
451     static const char *
452     find_error_text(int n)
453     {
454     const char *s = error_texts;
455 ph10 243 for (; n > 0; n--) while (*s++ != 0);
456 ph10 240 return s;
457     }
458    
459    
460     /*************************************************
461 nigel 77 * Handle escapes *
462     *************************************************/
463    
464     /* This function is called when a \ has been encountered. It either returns a
465     positive value for a simple escape such as \n, or a negative value which
466 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
467     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
468     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
469     ptr is pointing at the \. On exit, it is on the final character of the escape
470     sequence.
471 nigel 77
472     Arguments:
473     ptrptr points to the pattern position pointer
474     errorcodeptr points to the errorcode variable
475     bracount number of previous extracting brackets
476     options the options bits
477     isclass TRUE if inside a character class
478    
479     Returns: zero or positive => a data character
480     negative => a special escape sequence
481 ph10 213 on error, errorcodeptr is set
482 nigel 77 */
483    
484     static int
485     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
486     int options, BOOL isclass)
487     {
488 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
489     const uschar *ptr = *ptrptr + 1;
490 nigel 77 int c, i;
491    
492 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
493     ptr--; /* Set pointer back to the last byte */
494    
495 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
496    
497     if (c == 0) *errorcodeptr = ERR1;
498    
499     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
500     a table. A non-zero result is something that can be returned immediately.
501     Otherwise further processing may be required. */
502    
503 ph10 97 #ifndef EBCDIC /* ASCII coding */
504 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
505     else if ((i = escapes[c - '0']) != 0) c = i;
506    
507 ph10 97 #else /* EBCDIC coding */
508 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
509     else if ((i = escapes[c - 0x48]) != 0) c = i;
510     #endif
511    
512     /* Escapes that need further processing, or are illegal. */
513    
514     else
515     {
516     const uschar *oldptr;
517 nigel 93 BOOL braced, negated;
518    
519 nigel 77 switch (c)
520     {
521     /* A number of Perl escapes are not handled by PCRE. We give an explicit
522     error. */
523    
524     case 'l':
525     case 'L':
526     case 'N':
527     case 'u':
528     case 'U':
529     *errorcodeptr = ERR37;
530     break;
531    
532 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
533     is an absolute backreference. If negative, it is a relative backreference.
534 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
535     reference to a named group. This is part of Perl's movement towards a
536     unified syntax for back references. As this is synonymous with \k{name}, we
537 ph10 171 fudge it up by pretending it really was \k. */
538 nigel 93
539     case 'g':
540     if (ptr[1] == '{')
541     {
542 ph10 171 const uschar *p;
543     for (p = ptr+2; *p != 0 && *p != '}'; p++)
544     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
545 ph10 172 if (*p != 0 && *p != '}')
546 ph10 171 {
547     c = -ESC_k;
548     break;
549 ph10 172 }
550 nigel 93 braced = TRUE;
551     ptr++;
552     }
553     else braced = FALSE;
554    
555     if (ptr[1] == '-')
556     {
557     negated = TRUE;
558     ptr++;
559     }
560     else negated = FALSE;
561    
562     c = 0;
563     while ((digitab[ptr[1]] & ctype_digit) != 0)
564     c = c * 10 + *(++ptr) - '0';
565 ph10 220
566 ph10 213 if (c < 0)
567     {
568     *errorcodeptr = ERR61;
569     break;
570 ph10 220 }
571 nigel 93
572     if (c == 0 || (braced && *(++ptr) != '}'))
573     {
574     *errorcodeptr = ERR57;
575 ph10 213 break;
576 nigel 93 }
577    
578     if (negated)
579     {
580     if (c > bracount)
581     {
582     *errorcodeptr = ERR15;
583 ph10 213 break;
584 nigel 93 }
585     c = bracount - (c - 1);
586     }
587    
588     c = -(ESC_REF + c);
589     break;
590    
591 nigel 77 /* The handling of escape sequences consisting of a string of digits
592     starting with one that is not zero is not straightforward. By experiment,
593     the way Perl works seems to be as follows:
594    
595     Outside a character class, the digits are read as a decimal number. If the
596     number is less than 10, or if there are that many previous extracting
597     left brackets, then it is a back reference. Otherwise, up to three octal
598     digits are read to form an escaped byte. Thus \123 is likely to be octal
599     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
600     value is greater than 377, the least significant 8 bits are taken. Inside a
601     character class, \ followed by a digit is always an octal number. */
602    
603     case '1': case '2': case '3': case '4': case '5':
604     case '6': case '7': case '8': case '9':
605    
606     if (!isclass)
607     {
608     oldptr = ptr;
609     c -= '0';
610     while ((digitab[ptr[1]] & ctype_digit) != 0)
611     c = c * 10 + *(++ptr) - '0';
612 ph10 213 if (c < 0)
613     {
614     *errorcodeptr = ERR61;
615 ph10 220 break;
616     }
617 nigel 77 if (c < 10 || c <= bracount)
618     {
619     c = -(ESC_REF + c);
620     break;
621     }
622     ptr = oldptr; /* Put the pointer back and fall through */
623     }
624    
625     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
626     generates a binary zero byte and treats the digit as a following literal.
627     Thus we have to pull back the pointer by one. */
628    
629     if ((c = *ptr) >= '8')
630     {
631     ptr--;
632     c = 0;
633     break;
634     }
635    
636     /* \0 always starts an octal number, but we may drop through to here with a
637 nigel 91 larger first octal digit. The original code used just to take the least
638     significant 8 bits of octal numbers (I think this is what early Perls used
639     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
640     than 3 octal digits. */
641 nigel 77
642     case '0':
643     c -= '0';
644     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
645     c = c * 8 + *(++ptr) - '0';
646 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
647 nigel 77 break;
648    
649 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
650     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
651     treated as a data character. */
652 nigel 77
653     case 'x':
654 nigel 87 if (ptr[1] == '{')
655 nigel 77 {
656     const uschar *pt = ptr + 2;
657 nigel 87 int count = 0;
658    
659 nigel 77 c = 0;
660     while ((digitab[*pt] & ctype_xdigit) != 0)
661     {
662 nigel 87 register int cc = *pt++;
663     if (c == 0 && cc == '0') continue; /* Leading zeroes */
664 nigel 77 count++;
665 nigel 87
666 ph10 97 #ifndef EBCDIC /* ASCII coding */
667 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
668 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
669 ph10 97 #else /* EBCDIC coding */
670 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
671 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
672 nigel 77 #endif
673     }
674 nigel 87
675 nigel 77 if (*pt == '}')
676     {
677 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
678 nigel 77 ptr = pt;
679     break;
680     }
681 nigel 87
682 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
683     recognize this construct; fall through to the normal \x handling. */
684     }
685    
686 nigel 87 /* Read just a single-byte hex-defined char */
687 nigel 77
688     c = 0;
689     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
690     {
691     int cc; /* Some compilers don't like ++ */
692     cc = *(++ptr); /* in initializers */
693 ph10 97 #ifndef EBCDIC /* ASCII coding */
694 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
695     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
696 ph10 97 #else /* EBCDIC coding */
697 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
698     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
699     #endif
700     }
701     break;
702    
703 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
704     This coding is ASCII-specific, but then the whole concept of \cx is
705     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
706 nigel 77
707     case 'c':
708     c = *(++ptr);
709     if (c == 0)
710     {
711     *errorcodeptr = ERR2;
712 ph10 213 break;
713 nigel 77 }
714    
715 ph10 97 #ifndef EBCDIC /* ASCII coding */
716 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
717     c ^= 0x40;
718 ph10 97 #else /* EBCDIC coding */
719 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
720     c ^= 0xC0;
721     #endif
722     break;
723    
724     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
725     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
726     for Perl compatibility, it is a literal. This code looks a bit odd, but
727     there used to be some cases other than the default, and there may be again
728     in future, so I haven't "optimized" it. */
729    
730     default:
731     if ((options & PCRE_EXTRA) != 0) switch(c)
732     {
733     default:
734     *errorcodeptr = ERR3;
735     break;
736     }
737     break;
738     }
739     }
740    
741     *ptrptr = ptr;
742     return c;
743     }
744    
745    
746    
747     #ifdef SUPPORT_UCP
748     /*************************************************
749     * Handle \P and \p *
750     *************************************************/
751    
752     /* This function is called after \P or \p has been encountered, provided that
753     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
754     pointing at the P or p. On exit, it is pointing at the final character of the
755     escape sequence.
756    
757     Argument:
758     ptrptr points to the pattern position pointer
759     negptr points to a boolean that is set TRUE for negation else FALSE
760 nigel 87 dptr points to an int that is set to the detailed property value
761 nigel 77 errorcodeptr points to the error code variable
762    
763 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
764 nigel 77 */
765    
766     static int
767 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
768 nigel 77 {
769     int c, i, bot, top;
770     const uschar *ptr = *ptrptr;
771 nigel 87 char name[32];
772 nigel 77
773     c = *(++ptr);
774     if (c == 0) goto ERROR_RETURN;
775    
776     *negptr = FALSE;
777    
778 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
779     negation. */
780 nigel 77
781     if (c == '{')
782     {
783     if (ptr[1] == '^')
784     {
785     *negptr = TRUE;
786     ptr++;
787     }
788 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
789 nigel 77 {
790     c = *(++ptr);
791     if (c == 0) goto ERROR_RETURN;
792     if (c == '}') break;
793     name[i] = c;
794     }
795 nigel 87 if (c !='}') goto ERROR_RETURN;
796 nigel 77 name[i] = 0;
797     }
798    
799     /* Otherwise there is just one following character */
800    
801     else
802     {
803     name[0] = c;
804     name[1] = 0;
805     }
806    
807     *ptrptr = ptr;
808    
809     /* Search for a recognized property name using binary chop */
810    
811     bot = 0;
812     top = _pcre_utt_size;
813    
814     while (bot < top)
815     {
816 nigel 87 i = (bot + top) >> 1;
817 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
818 nigel 87 if (c == 0)
819     {
820     *dptr = _pcre_utt[i].value;
821     return _pcre_utt[i].type;
822     }
823 nigel 77 if (c > 0) bot = i + 1; else top = i;
824     }
825    
826     *errorcodeptr = ERR47;
827     *ptrptr = ptr;
828     return -1;
829    
830     ERROR_RETURN:
831     *errorcodeptr = ERR46;
832     *ptrptr = ptr;
833     return -1;
834     }
835     #endif
836    
837    
838    
839    
840     /*************************************************
841     * Check for counted repeat *
842     *************************************************/
843    
844     /* This function is called when a '{' is encountered in a place where it might
845     start a quantifier. It looks ahead to see if it really is a quantifier or not.
846     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
847     where the ddds are digits.
848    
849     Arguments:
850     p pointer to the first char after '{'
851    
852     Returns: TRUE or FALSE
853     */
854    
855     static BOOL
856     is_counted_repeat(const uschar *p)
857     {
858     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
859     while ((digitab[*p] & ctype_digit) != 0) p++;
860     if (*p == '}') return TRUE;
861    
862     if (*p++ != ',') return FALSE;
863     if (*p == '}') return TRUE;
864    
865     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
866     while ((digitab[*p] & ctype_digit) != 0) p++;
867    
868     return (*p == '}');
869     }
870    
871    
872    
873     /*************************************************
874     * Read repeat counts *
875     *************************************************/
876    
877     /* Read an item of the form {n,m} and return the values. This is called only
878     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
879     so the syntax is guaranteed to be correct, but we need to check the values.
880    
881     Arguments:
882     p pointer to first char after '{'
883     minp pointer to int for min
884     maxp pointer to int for max
885     returned as -1 if no max
886     errorcodeptr points to error code variable
887    
888     Returns: pointer to '}' on success;
889     current ptr on error, with errorcodeptr set non-zero
890     */
891    
892     static const uschar *
893     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
894     {
895     int min = 0;
896     int max = -1;
897    
898 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
899     an integer overflow. */
900    
901 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
902 nigel 81 if (min < 0 || min > 65535)
903     {
904     *errorcodeptr = ERR5;
905     return p;
906     }
907 nigel 77
908 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
909     Also, max must not be less than min. */
910    
911 nigel 77 if (*p == '}') max = min; else
912     {
913     if (*(++p) != '}')
914     {
915     max = 0;
916     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
917 nigel 81 if (max < 0 || max > 65535)
918     {
919     *errorcodeptr = ERR5;
920     return p;
921     }
922 nigel 77 if (max < min)
923     {
924     *errorcodeptr = ERR4;
925     return p;
926     }
927     }
928     }
929    
930 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
931     '}'. */
932 nigel 77
933 nigel 81 *minp = min;
934     *maxp = max;
935 nigel 77 return p;
936     }
937    
938    
939    
940     /*************************************************
941 nigel 93 * Find forward referenced subpattern *
942 nigel 91 *************************************************/
943    
944 nigel 93 /* This function scans along a pattern's text looking for capturing
945     subpatterns, and counting them. If it finds a named pattern that matches the
946     name it is given, it returns its number. Alternatively, if the name is NULL, it
947     returns when it reaches a given numbered subpattern. This is used for forward
948     references to subpatterns. We know that if (?P< is encountered, the name will
949     be terminated by '>' because that is checked in the first pass.
950 nigel 91
951     Arguments:
952 nigel 93 ptr current position in the pattern
953     count current count of capturing parens so far encountered
954     name name to seek, or NULL if seeking a numbered subpattern
955     lorn name length, or subpattern number if name is NULL
956     xmode TRUE if we are in /x mode
957 nigel 91
958     Returns: the number of the named subpattern, or -1 if not found
959     */
960    
961     static int
962 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
963     BOOL xmode)
964 nigel 91 {
965     const uschar *thisname;
966 nigel 93
967 nigel 91 for (; *ptr != 0; ptr++)
968     {
969 nigel 93 int term;
970    
971     /* Skip over backslashed characters and also entire \Q...\E */
972    
973     if (*ptr == '\\')
974     {
975     if (*(++ptr) == 0) return -1;
976     if (*ptr == 'Q') for (;;)
977     {
978     while (*(++ptr) != 0 && *ptr != '\\');
979     if (*ptr == 0) return -1;
980     if (*(++ptr) == 'E') break;
981     }
982     continue;
983     }
984    
985     /* Skip over character classes */
986    
987     if (*ptr == '[')
988     {
989     while (*(++ptr) != ']')
990     {
991 ph10 220 if (*ptr == 0) return -1;
992 nigel 93 if (*ptr == '\\')
993     {
994     if (*(++ptr) == 0) return -1;
995     if (*ptr == 'Q') for (;;)
996     {
997     while (*(++ptr) != 0 && *ptr != '\\');
998     if (*ptr == 0) return -1;
999     if (*(++ptr) == 'E') break;
1000     }
1001     continue;
1002     }
1003     }
1004     continue;
1005     }
1006    
1007     /* Skip comments in /x mode */
1008    
1009     if (xmode && *ptr == '#')
1010     {
1011     while (*(++ptr) != 0 && *ptr != '\n');
1012     if (*ptr == 0) return -1;
1013     continue;
1014     }
1015    
1016     /* An opening parens must now be a real metacharacter */
1017    
1018 nigel 91 if (*ptr != '(') continue;
1019 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1020 nigel 93 {
1021     count++;
1022     if (name == NULL && count == lorn) return count;
1023     continue;
1024     }
1025    
1026     ptr += 2;
1027     if (*ptr == 'P') ptr++; /* Allow optional P */
1028    
1029     /* We have to disambiguate (?<! and (?<= from (?<name> */
1030    
1031     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1032     *ptr != '\'')
1033     continue;
1034    
1035 nigel 91 count++;
1036 nigel 93
1037     if (name == NULL && count == lorn) return count;
1038     term = *ptr++;
1039     if (term == '<') term = '>';
1040 nigel 91 thisname = ptr;
1041 nigel 93 while (*ptr != term) ptr++;
1042     if (name != NULL && lorn == ptr - thisname &&
1043     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1044 nigel 91 return count;
1045     }
1046 nigel 93
1047 nigel 91 return -1;
1048     }
1049    
1050    
1051    
1052     /*************************************************
1053 nigel 77 * Find first significant op code *
1054     *************************************************/
1055    
1056     /* This is called by several functions that scan a compiled expression looking
1057     for a fixed first character, or an anchoring op code etc. It skips over things
1058     that do not influence this. For some calls, a change of option is important.
1059     For some calls, it makes sense to skip negative forward and all backward
1060     assertions, and also the \b assertion; for others it does not.
1061    
1062     Arguments:
1063     code pointer to the start of the group
1064     options pointer to external options
1065     optbit the option bit whose changing is significant, or
1066     zero if none are
1067     skipassert TRUE if certain assertions are to be skipped
1068    
1069     Returns: pointer to the first significant opcode
1070     */
1071    
1072     static const uschar*
1073     first_significant_code(const uschar *code, int *options, int optbit,
1074     BOOL skipassert)
1075     {
1076     for (;;)
1077     {
1078     switch ((int)*code)
1079     {
1080     case OP_OPT:
1081     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1082     *options = (int)code[1];
1083     code += 2;
1084     break;
1085    
1086     case OP_ASSERT_NOT:
1087     case OP_ASSERTBACK:
1088     case OP_ASSERTBACK_NOT:
1089     if (!skipassert) return code;
1090     do code += GET(code, 1); while (*code == OP_ALT);
1091     code += _pcre_OP_lengths[*code];
1092     break;
1093    
1094     case OP_WORD_BOUNDARY:
1095     case OP_NOT_WORD_BOUNDARY:
1096     if (!skipassert) return code;
1097     /* Fall through */
1098    
1099     case OP_CALLOUT:
1100     case OP_CREF:
1101 nigel 93 case OP_RREF:
1102     case OP_DEF:
1103 nigel 77 code += _pcre_OP_lengths[*code];
1104     break;
1105    
1106     default:
1107     return code;
1108     }
1109     }
1110     /* Control never reaches here */
1111     }
1112    
1113    
1114    
1115    
1116     /*************************************************
1117     * Find the fixed length of a pattern *
1118     *************************************************/
1119    
1120     /* Scan a pattern and compute the fixed length of subject that will match it,
1121     if the length is fixed. This is needed for dealing with backward assertions.
1122     In UTF8 mode, the result is in characters rather than bytes.
1123    
1124     Arguments:
1125     code points to the start of the pattern (the bracket)
1126     options the compiling options
1127    
1128     Returns: the fixed length, or -1 if there is no fixed length,
1129     or -2 if \C was encountered
1130     */
1131    
1132     static int
1133     find_fixedlength(uschar *code, int options)
1134     {
1135     int length = -1;
1136    
1137     register int branchlength = 0;
1138     register uschar *cc = code + 1 + LINK_SIZE;
1139    
1140     /* Scan along the opcodes for this branch. If we get to the end of the
1141     branch, check the length against that of the other branches. */
1142    
1143     for (;;)
1144     {
1145     int d;
1146     register int op = *cc;
1147     switch (op)
1148     {
1149 nigel 93 case OP_CBRA:
1150 nigel 77 case OP_BRA:
1151     case OP_ONCE:
1152     case OP_COND:
1153 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1154 nigel 77 if (d < 0) return d;
1155     branchlength += d;
1156     do cc += GET(cc, 1); while (*cc == OP_ALT);
1157     cc += 1 + LINK_SIZE;
1158     break;
1159    
1160     /* Reached end of a branch; if it's a ket it is the end of a nested
1161     call. If it's ALT it is an alternation in a nested call. If it is
1162     END it's the end of the outer call. All can be handled by the same code. */
1163    
1164     case OP_ALT:
1165     case OP_KET:
1166     case OP_KETRMAX:
1167     case OP_KETRMIN:
1168     case OP_END:
1169     if (length < 0) length = branchlength;
1170     else if (length != branchlength) return -1;
1171     if (*cc != OP_ALT) return length;
1172     cc += 1 + LINK_SIZE;
1173     branchlength = 0;
1174     break;
1175    
1176     /* Skip over assertive subpatterns */
1177    
1178     case OP_ASSERT:
1179     case OP_ASSERT_NOT:
1180     case OP_ASSERTBACK:
1181     case OP_ASSERTBACK_NOT:
1182     do cc += GET(cc, 1); while (*cc == OP_ALT);
1183     /* Fall through */
1184    
1185     /* Skip over things that don't match chars */
1186    
1187     case OP_REVERSE:
1188     case OP_CREF:
1189 nigel 93 case OP_RREF:
1190     case OP_DEF:
1191 nigel 77 case OP_OPT:
1192     case OP_CALLOUT:
1193     case OP_SOD:
1194     case OP_SOM:
1195     case OP_EOD:
1196     case OP_EODN:
1197     case OP_CIRC:
1198     case OP_DOLL:
1199     case OP_NOT_WORD_BOUNDARY:
1200     case OP_WORD_BOUNDARY:
1201     cc += _pcre_OP_lengths[*cc];
1202     break;
1203    
1204     /* Handle literal characters */
1205    
1206     case OP_CHAR:
1207     case OP_CHARNC:
1208 nigel 91 case OP_NOT:
1209 nigel 77 branchlength++;
1210     cc += 2;
1211     #ifdef SUPPORT_UTF8
1212     if ((options & PCRE_UTF8) != 0)
1213     {
1214     while ((*cc & 0xc0) == 0x80) cc++;
1215     }
1216     #endif
1217     break;
1218    
1219     /* Handle exact repetitions. The count is already in characters, but we
1220     need to skip over a multibyte character in UTF8 mode. */
1221    
1222     case OP_EXACT:
1223     branchlength += GET2(cc,1);
1224     cc += 4;
1225     #ifdef SUPPORT_UTF8
1226     if ((options & PCRE_UTF8) != 0)
1227     {
1228     while((*cc & 0x80) == 0x80) cc++;
1229     }
1230     #endif
1231     break;
1232    
1233     case OP_TYPEEXACT:
1234     branchlength += GET2(cc,1);
1235 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1236 nigel 77 cc += 4;
1237     break;
1238    
1239     /* Handle single-char matchers */
1240    
1241     case OP_PROP:
1242     case OP_NOTPROP:
1243 nigel 87 cc += 2;
1244 nigel 77 /* Fall through */
1245    
1246     case OP_NOT_DIGIT:
1247     case OP_DIGIT:
1248     case OP_NOT_WHITESPACE:
1249     case OP_WHITESPACE:
1250     case OP_NOT_WORDCHAR:
1251     case OP_WORDCHAR:
1252     case OP_ANY:
1253     branchlength++;
1254     cc++;
1255     break;
1256    
1257     /* The single-byte matcher isn't allowed */
1258    
1259     case OP_ANYBYTE:
1260     return -2;
1261    
1262     /* Check a class for variable quantification */
1263    
1264     #ifdef SUPPORT_UTF8
1265     case OP_XCLASS:
1266     cc += GET(cc, 1) - 33;
1267     /* Fall through */
1268     #endif
1269    
1270     case OP_CLASS:
1271     case OP_NCLASS:
1272     cc += 33;
1273    
1274     switch (*cc)
1275     {
1276     case OP_CRSTAR:
1277     case OP_CRMINSTAR:
1278     case OP_CRQUERY:
1279     case OP_CRMINQUERY:
1280     return -1;
1281    
1282     case OP_CRRANGE:
1283     case OP_CRMINRANGE:
1284     if (GET2(cc,1) != GET2(cc,3)) return -1;
1285     branchlength += GET2(cc,1);
1286     cc += 5;
1287     break;
1288    
1289     default:
1290     branchlength++;
1291     }
1292     break;
1293    
1294     /* Anything else is variable length */
1295    
1296     default:
1297     return -1;
1298     }
1299     }
1300     /* Control never gets here */
1301     }
1302    
1303    
1304    
1305    
1306     /*************************************************
1307     * Scan compiled regex for numbered bracket *
1308     *************************************************/
1309    
1310     /* This little function scans through a compiled pattern until it finds a
1311     capturing bracket with the given number.
1312    
1313     Arguments:
1314     code points to start of expression
1315     utf8 TRUE in UTF-8 mode
1316     number the required bracket number
1317    
1318     Returns: pointer to the opcode for the bracket, or NULL if not found
1319     */
1320    
1321     static const uschar *
1322     find_bracket(const uschar *code, BOOL utf8, int number)
1323     {
1324     for (;;)
1325     {
1326     register int c = *code;
1327     if (c == OP_END) return NULL;
1328 nigel 91
1329     /* XCLASS is used for classes that cannot be represented just by a bit
1330     map. This includes negated single high-valued characters. The length in
1331     the table is zero; the actual length is stored in the compiled code. */
1332    
1333     if (c == OP_XCLASS) code += GET(code, 1);
1334    
1335 nigel 93 /* Handle capturing bracket */
1336 nigel 91
1337 nigel 93 else if (c == OP_CBRA)
1338 nigel 77 {
1339 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1340 nigel 77 if (n == number) return (uschar *)code;
1341 nigel 93 code += _pcre_OP_lengths[c];
1342 nigel 77 }
1343 nigel 91
1344 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1345     repeated character types, we have to test for \p and \P, which have an extra
1346 ph10 218 two bytes of parameters. */
1347 nigel 91
1348 nigel 77 else
1349     {
1350 ph10 218 switch(c)
1351     {
1352     case OP_TYPESTAR:
1353     case OP_TYPEMINSTAR:
1354     case OP_TYPEPLUS:
1355     case OP_TYPEMINPLUS:
1356     case OP_TYPEQUERY:
1357     case OP_TYPEMINQUERY:
1358     case OP_TYPEPOSSTAR:
1359     case OP_TYPEPOSPLUS:
1360     case OP_TYPEPOSQUERY:
1361     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1362 ph10 220 break;
1363 ph10 221
1364     case OP_TYPEUPTO:
1365     case OP_TYPEMINUPTO:
1366     case OP_TYPEEXACT:
1367     case OP_TYPEPOSUPTO:
1368     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1369     break;
1370 ph10 220 }
1371    
1372 ph10 218 /* Add in the fixed length from the table */
1373 ph10 220
1374 nigel 77 code += _pcre_OP_lengths[c];
1375 ph10 220
1376 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1377     a multi-byte character. The length in the table is a minimum, so we have to
1378     arrange to skip the extra bytes. */
1379 ph10 220
1380 ph10 107 #ifdef SUPPORT_UTF8
1381 nigel 77 if (utf8) switch(c)
1382     {
1383     case OP_CHAR:
1384     case OP_CHARNC:
1385     case OP_EXACT:
1386     case OP_UPTO:
1387     case OP_MINUPTO:
1388 nigel 93 case OP_POSUPTO:
1389 nigel 77 case OP_STAR:
1390     case OP_MINSTAR:
1391 nigel 93 case OP_POSSTAR:
1392 nigel 77 case OP_PLUS:
1393     case OP_MINPLUS:
1394 nigel 93 case OP_POSPLUS:
1395 nigel 77 case OP_QUERY:
1396     case OP_MINQUERY:
1397 nigel 93 case OP_POSQUERY:
1398     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1399 nigel 77 break;
1400     }
1401 ph10 111 #endif
1402 nigel 77 }
1403     }
1404     }
1405    
1406    
1407    
1408     /*************************************************
1409     * Scan compiled regex for recursion reference *
1410     *************************************************/
1411    
1412     /* This little function scans through a compiled pattern until it finds an
1413     instance of OP_RECURSE.
1414    
1415     Arguments:
1416     code points to start of expression
1417     utf8 TRUE in UTF-8 mode
1418    
1419     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1420     */
1421    
1422     static const uschar *
1423     find_recurse(const uschar *code, BOOL utf8)
1424     {
1425     for (;;)
1426     {
1427     register int c = *code;
1428     if (c == OP_END) return NULL;
1429 nigel 91 if (c == OP_RECURSE) return code;
1430 ph10 220
1431 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1432     map. This includes negated single high-valued characters. The length in
1433     the table is zero; the actual length is stored in the compiled code. */
1434    
1435     if (c == OP_XCLASS) code += GET(code, 1);
1436    
1437 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1438     repeated character types, we have to test for \p and \P, which have an extra
1439 ph10 218 two bytes of parameters. */
1440 nigel 91
1441 nigel 77 else
1442     {
1443 ph10 218 switch(c)
1444     {
1445     case OP_TYPESTAR:
1446     case OP_TYPEMINSTAR:
1447     case OP_TYPEPLUS:
1448     case OP_TYPEMINPLUS:
1449     case OP_TYPEQUERY:
1450     case OP_TYPEMINQUERY:
1451     case OP_TYPEPOSSTAR:
1452     case OP_TYPEPOSPLUS:
1453     case OP_TYPEPOSQUERY:
1454     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1455 ph10 220 break;
1456 ph10 221
1457     case OP_TYPEPOSUPTO:
1458     case OP_TYPEUPTO:
1459     case OP_TYPEMINUPTO:
1460     case OP_TYPEEXACT:
1461     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1462     break;
1463 ph10 220 }
1464    
1465 ph10 218 /* Add in the fixed length from the table */
1466    
1467 nigel 77 code += _pcre_OP_lengths[c];
1468 ph10 220
1469 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1470     by a multi-byte character. The length in the table is a minimum, so we have
1471     to arrange to skip the extra bytes. */
1472 ph10 220
1473 ph10 107 #ifdef SUPPORT_UTF8
1474 nigel 77 if (utf8) switch(c)
1475     {
1476     case OP_CHAR:
1477     case OP_CHARNC:
1478     case OP_EXACT:
1479     case OP_UPTO:
1480     case OP_MINUPTO:
1481 nigel 93 case OP_POSUPTO:
1482 nigel 77 case OP_STAR:
1483     case OP_MINSTAR:
1484 nigel 93 case OP_POSSTAR:
1485 nigel 77 case OP_PLUS:
1486     case OP_MINPLUS:
1487 nigel 93 case OP_POSPLUS:
1488 nigel 77 case OP_QUERY:
1489     case OP_MINQUERY:
1490 nigel 93 case OP_POSQUERY:
1491     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1492 nigel 77 break;
1493     }
1494 ph10 111 #endif
1495 nigel 77 }
1496     }
1497     }
1498    
1499    
1500    
1501     /*************************************************
1502     * Scan compiled branch for non-emptiness *
1503     *************************************************/
1504    
1505     /* This function scans through a branch of a compiled pattern to see whether it
1506 nigel 93 can match the empty string or not. It is called from could_be_empty()
1507     below and from compile_branch() when checking for an unlimited repeat of a
1508     group that can match nothing. Note that first_significant_code() skips over
1509     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1510     struck an inner bracket whose current branch will already have been scanned.
1511 nigel 77
1512     Arguments:
1513     code points to start of search
1514     endcode points to where to stop
1515     utf8 TRUE if in UTF8 mode
1516    
1517     Returns: TRUE if what is matched could be empty
1518     */
1519    
1520     static BOOL
1521     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1522     {
1523     register int c;
1524 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1525 nigel 77 code < endcode;
1526     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1527     {
1528     const uschar *ccode;
1529    
1530     c = *code;
1531 ph10 172
1532 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1533 nigel 77
1534 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1535     {
1536 ph10 172 code += _pcre_OP_lengths[c];
1537 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1538     c = *code;
1539     continue;
1540     }
1541    
1542     /* For other groups, scan the branches. */
1543 ph10 172
1544 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1545 nigel 77 {
1546     BOOL empty_branch;
1547     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1548    
1549     /* Scan a closed bracket */
1550    
1551     empty_branch = FALSE;
1552     do
1553     {
1554     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1555     empty_branch = TRUE;
1556     code += GET(code, 1);
1557     }
1558     while (*code == OP_ALT);
1559     if (!empty_branch) return FALSE; /* All branches are non-empty */
1560 ph10 172 c = *code;
1561 nigel 93 continue;
1562 nigel 77 }
1563    
1564 nigel 93 /* Handle the other opcodes */
1565    
1566     switch (c)
1567 nigel 77 {
1568 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1569     cannot be represented just by a bit map. This includes negated single
1570     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1571 ph10 220 actual length is stored in the compiled code, so we must update "code"
1572 ph10 216 here. */
1573 nigel 77
1574     #ifdef SUPPORT_UTF8
1575     case OP_XCLASS:
1576 ph10 216 ccode = code += GET(code, 1);
1577 nigel 77 goto CHECK_CLASS_REPEAT;
1578     #endif
1579    
1580     case OP_CLASS:
1581     case OP_NCLASS:
1582     ccode = code + 33;
1583    
1584     #ifdef SUPPORT_UTF8
1585     CHECK_CLASS_REPEAT:
1586     #endif
1587    
1588     switch (*ccode)
1589     {
1590     case OP_CRSTAR: /* These could be empty; continue */
1591     case OP_CRMINSTAR:
1592     case OP_CRQUERY:
1593     case OP_CRMINQUERY:
1594     break;
1595    
1596     default: /* Non-repeat => class must match */
1597     case OP_CRPLUS: /* These repeats aren't empty */
1598     case OP_CRMINPLUS:
1599     return FALSE;
1600    
1601     case OP_CRRANGE:
1602     case OP_CRMINRANGE:
1603     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1604     break;
1605     }
1606     break;
1607    
1608     /* Opcodes that must match a character */
1609    
1610     case OP_PROP:
1611     case OP_NOTPROP:
1612     case OP_EXTUNI:
1613     case OP_NOT_DIGIT:
1614     case OP_DIGIT:
1615     case OP_NOT_WHITESPACE:
1616     case OP_WHITESPACE:
1617     case OP_NOT_WORDCHAR:
1618     case OP_WORDCHAR:
1619     case OP_ANY:
1620     case OP_ANYBYTE:
1621     case OP_CHAR:
1622     case OP_CHARNC:
1623     case OP_NOT:
1624     case OP_PLUS:
1625     case OP_MINPLUS:
1626 nigel 93 case OP_POSPLUS:
1627 nigel 77 case OP_EXACT:
1628     case OP_NOTPLUS:
1629     case OP_NOTMINPLUS:
1630 nigel 93 case OP_NOTPOSPLUS:
1631 nigel 77 case OP_NOTEXACT:
1632     case OP_TYPEPLUS:
1633     case OP_TYPEMINPLUS:
1634 nigel 93 case OP_TYPEPOSPLUS:
1635 nigel 77 case OP_TYPEEXACT:
1636     return FALSE;
1637 ph10 227
1638     /* These are going to continue, as they may be empty, but we have to
1639     fudge the length for the \p and \P cases. */
1640    
1641 ph10 224 case OP_TYPESTAR:
1642     case OP_TYPEMINSTAR:
1643     case OP_TYPEPOSSTAR:
1644     case OP_TYPEQUERY:
1645     case OP_TYPEMINQUERY:
1646     case OP_TYPEPOSQUERY:
1647     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1648 ph10 227 break;
1649    
1650 ph10 224 /* Same for these */
1651 ph10 227
1652 ph10 224 case OP_TYPEUPTO:
1653     case OP_TYPEMINUPTO:
1654     case OP_TYPEPOSUPTO:
1655     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1656     break;
1657 nigel 77
1658     /* End of branch */
1659    
1660     case OP_KET:
1661     case OP_KETRMAX:
1662     case OP_KETRMIN:
1663     case OP_ALT:
1664     return TRUE;
1665    
1666 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1667     MINUPTO, and POSUPTO may be followed by a multibyte character */
1668 nigel 77
1669     #ifdef SUPPORT_UTF8
1670     case OP_STAR:
1671     case OP_MINSTAR:
1672 nigel 93 case OP_POSSTAR:
1673 nigel 77 case OP_QUERY:
1674     case OP_MINQUERY:
1675 nigel 93 case OP_POSQUERY:
1676 nigel 77 case OP_UPTO:
1677     case OP_MINUPTO:
1678 nigel 93 case OP_POSUPTO:
1679 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1680     break;
1681     #endif
1682     }
1683     }
1684    
1685     return TRUE;
1686     }
1687    
1688    
1689    
1690     /*************************************************
1691     * Scan compiled regex for non-emptiness *
1692     *************************************************/
1693    
1694     /* This function is called to check for left recursive calls. We want to check
1695     the current branch of the current pattern to see if it could match the empty
1696     string. If it could, we must look outwards for branches at other levels,
1697     stopping when we pass beyond the bracket which is the subject of the recursion.
1698    
1699     Arguments:
1700     code points to start of the recursion
1701     endcode points to where to stop (current RECURSE item)
1702     bcptr points to the chain of current (unclosed) branch starts
1703     utf8 TRUE if in UTF-8 mode
1704    
1705     Returns: TRUE if what is matched could be empty
1706     */
1707    
1708     static BOOL
1709     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1710     BOOL utf8)
1711     {
1712     while (bcptr != NULL && bcptr->current >= code)
1713     {
1714     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1715     bcptr = bcptr->outer;
1716     }
1717     return TRUE;
1718     }
1719    
1720    
1721    
1722     /*************************************************
1723     * Check for POSIX class syntax *
1724     *************************************************/
1725    
1726     /* This function is called when the sequence "[:" or "[." or "[=" is
1727     encountered in a character class. It checks whether this is followed by an
1728     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1729     ".]" or "=]".
1730    
1731     Argument:
1732     ptr pointer to the initial [
1733     endptr where to return the end pointer
1734     cd pointer to compile data
1735    
1736     Returns: TRUE or FALSE
1737     */
1738    
1739     static BOOL
1740     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1741     {
1742     int terminator; /* Don't combine these lines; the Solaris cc */
1743     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1744     if (*(++ptr) == '^') ptr++;
1745     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1746     if (*ptr == terminator && ptr[1] == ']')
1747     {
1748     *endptr = ptr;
1749     return TRUE;
1750     }
1751     return FALSE;
1752     }
1753    
1754    
1755    
1756    
1757     /*************************************************
1758     * Check POSIX class name *
1759     *************************************************/
1760    
1761     /* This function is called to check the name given in a POSIX-style class entry
1762     such as [:alnum:].
1763    
1764     Arguments:
1765     ptr points to the first letter
1766     len the length of the name
1767    
1768     Returns: a value representing the name, or -1 if unknown
1769     */
1770    
1771     static int
1772     check_posix_name(const uschar *ptr, int len)
1773     {
1774 ph10 240 const char *pn = posix_names;
1775 nigel 77 register int yield = 0;
1776     while (posix_name_lengths[yield] != 0)
1777     {
1778     if (len == posix_name_lengths[yield] &&
1779 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1780 ph10 243 pn += posix_name_lengths[yield] + 1;
1781 nigel 77 yield++;
1782     }
1783     return -1;
1784     }
1785    
1786    
1787     /*************************************************
1788     * Adjust OP_RECURSE items in repeated group *
1789     *************************************************/
1790    
1791     /* OP_RECURSE items contain an offset from the start of the regex to the group
1792     that is referenced. This means that groups can be replicated for fixed
1793     repetition simply by copying (because the recursion is allowed to refer to
1794     earlier groups that are outside the current group). However, when a group is
1795     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1796     it, after it has been compiled. This means that any OP_RECURSE items within it
1797     that refer to the group itself or any contained groups have to have their
1798 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1799     the partially compiled regex must be temporarily terminated with OP_END.
1800 nigel 77
1801 nigel 93 This function has been extended with the possibility of forward references for
1802     recursions and subroutine calls. It must also check the list of such references
1803     for the group we are dealing with. If it finds that one of the recursions in
1804     the current group is on this list, it adjusts the offset in the list, not the
1805     value in the reference (which is a group number).
1806    
1807 nigel 77 Arguments:
1808     group points to the start of the group
1809     adjust the amount by which the group is to be moved
1810     utf8 TRUE in UTF-8 mode
1811     cd contains pointers to tables etc.
1812 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1813 nigel 77
1814     Returns: nothing
1815     */
1816    
1817     static void
1818 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1819     uschar *save_hwm)
1820 nigel 77 {
1821     uschar *ptr = group;
1822 ph10 224
1823 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1824     {
1825 nigel 93 int offset;
1826     uschar *hc;
1827    
1828     /* See if this recursion is on the forward reference list. If so, adjust the
1829     reference. */
1830    
1831     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1832     {
1833     offset = GET(hc, 0);
1834     if (cd->start_code + offset == ptr + 1)
1835     {
1836     PUT(hc, 0, offset + adjust);
1837     break;
1838     }
1839     }
1840    
1841     /* Otherwise, adjust the recursion offset if it's after the start of this
1842     group. */
1843    
1844     if (hc >= cd->hwm)
1845     {
1846     offset = GET(ptr, 1);
1847     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1848     }
1849    
1850 nigel 77 ptr += 1 + LINK_SIZE;
1851     }
1852     }
1853    
1854    
1855    
1856     /*************************************************
1857     * Insert an automatic callout point *
1858     *************************************************/
1859    
1860     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1861     callout points before each pattern item.
1862    
1863     Arguments:
1864     code current code pointer
1865     ptr current pattern pointer
1866     cd pointers to tables etc
1867    
1868     Returns: new code pointer
1869     */
1870    
1871     static uschar *
1872     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1873     {
1874     *code++ = OP_CALLOUT;
1875     *code++ = 255;
1876     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1877     PUT(code, LINK_SIZE, 0); /* Default length */
1878     return code + 2*LINK_SIZE;
1879     }
1880    
1881    
1882    
1883     /*************************************************
1884     * Complete a callout item *
1885     *************************************************/
1886    
1887     /* A callout item contains the length of the next item in the pattern, which
1888     we can't fill in till after we have reached the relevant point. This is used
1889     for both automatic and manual callouts.
1890    
1891     Arguments:
1892     previous_callout points to previous callout item
1893     ptr current pattern pointer
1894     cd pointers to tables etc
1895    
1896     Returns: nothing
1897     */
1898    
1899     static void
1900     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1901     {
1902     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1903     PUT(previous_callout, 2 + LINK_SIZE, length);
1904     }
1905    
1906    
1907    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A returned range begins at the first character (from *cptr upwards) that has
an other case, and is extended for as long as the other cases of successive
characters are themselves consecutive.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Skip characters that have no other case. */

while (ch <= d)
  {
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = first_other;
expected = first_other + 1;

/* Extend the range while the other cases stay contiguous. */

ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  ch++;
  expected++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1953    
1954    
1955 nigel 93
1956 nigel 77 /*************************************************
1957 nigel 93 * Check if auto-possessifying is possible *
1958     *************************************************/
1959    
1960     /* This function is called for unlimited repeats of certain items, to see
1961     whether the next thing could possibly match the repeated item. If not, it makes
1962     sense to automatically possessify the repeated item.
1963    
1964     Arguments:
1965     op_code the repeated op code
1966     this data for this item, depends on the opcode
1967     utf8 TRUE in UTF-8 mode
1968     utf8_char used for utf8 character bytes, NULL if not relevant
1969     ptr next character in pattern
1970     options options bits
1971     cd contains pointers to tables etc.
1972    
1973     Returns: TRUE if possessifying is wanted
1974     */
1975    
1976     static BOOL
1977     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1978     const uschar *ptr, int options, compile_data *cd)
1979     {
1980     int next;
1981    
1982     /* Skip whitespace and comments in extended mode */
1983    
1984     if ((options & PCRE_EXTENDED) != 0)
1985     {
1986     for (;;)
1987     {
1988     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1989     if (*ptr == '#')
1990     {
1991     while (*(++ptr) != 0)
1992     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1993     }
1994     else break;
1995     }
1996     }
1997    
1998     /* If the next item is one that we can handle, get its value. A non-negative
1999     value is a character, a negative value is an escape value. */
2000    
2001     if (*ptr == '\\')
2002     {
2003     int temperrorcode = 0;
2004     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2005     if (temperrorcode != 0) return FALSE;
2006     ptr++; /* Point after the escape sequence */
2007     }
2008    
2009     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2010     {
2011     #ifdef SUPPORT_UTF8
2012     if (utf8) { GETCHARINC(next, ptr); } else
2013     #endif
2014     next = *ptr++;
2015     }
2016    
2017     else return FALSE;
2018    
2019     /* Skip whitespace and comments in extended mode */
2020    
2021     if ((options & PCRE_EXTENDED) != 0)
2022     {
2023     for (;;)
2024     {
2025     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2026     if (*ptr == '#')
2027     {
2028     while (*(++ptr) != 0)
2029     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2030     }
2031     else break;
2032     }
2033     }
2034    
2035     /* If the next thing is itself optional, we have to give up. */
2036    
2037     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2038     return FALSE;
2039    
2040     /* Now compare the next item with the previous opcode. If the previous is a
2041     positive single character match, "item" either contains the character or, if
2042     "item" is greater than 127 in utf8 mode, the character's bytes are in
2043     utf8_char. */
2044    
2045    
2046     /* Handle cases when the next item is a character. */
2047    
2048     if (next >= 0) switch(op_code)
2049     {
2050     case OP_CHAR:
2051     #ifdef SUPPORT_UTF8
2052     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2053     #endif
2054     return item != next;
2055    
2056     /* For CHARNC (caseless character) we must check the other case. If we have
2057     Unicode property support, we can use it to test the other case of
2058     high-valued characters. */
2059    
2060     case OP_CHARNC:
2061     #ifdef SUPPORT_UTF8
2062     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2063     #endif
2064     if (item == next) return FALSE;
2065     #ifdef SUPPORT_UTF8
2066     if (utf8)
2067     {
2068     unsigned int othercase;
2069     if (next < 128) othercase = cd->fcc[next]; else
2070     #ifdef SUPPORT_UCP
2071     othercase = _pcre_ucp_othercase((unsigned int)next);
2072     #else
2073     othercase = NOTACHAR;
2074     #endif
2075     return (unsigned int)item != othercase;
2076     }
2077     else
2078     #endif /* SUPPORT_UTF8 */
2079     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2080    
2081     /* For OP_NOT, "item" must be a single-byte character. */
2082    
2083     case OP_NOT:
2084     if (next < 0) return FALSE; /* Not a character */
2085     if (item == next) return TRUE;
2086     if ((options & PCRE_CASELESS) == 0) return FALSE;
2087     #ifdef SUPPORT_UTF8
2088     if (utf8)
2089     {
2090     unsigned int othercase;
2091     if (next < 128) othercase = cd->fcc[next]; else
2092     #ifdef SUPPORT_UCP
2093     othercase = _pcre_ucp_othercase(next);
2094     #else
2095     othercase = NOTACHAR;
2096     #endif
2097     return (unsigned int)item == othercase;
2098     }
2099     else
2100     #endif /* SUPPORT_UTF8 */
2101     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2102    
2103     case OP_DIGIT:
2104     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2105    
2106     case OP_NOT_DIGIT:
2107     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2108    
2109     case OP_WHITESPACE:
2110     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2111    
2112     case OP_NOT_WHITESPACE:
2113     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2114    
2115     case OP_WORDCHAR:
2116     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2117    
2118     case OP_NOT_WORDCHAR:
2119     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2120    
2121 ph10 180 case OP_HSPACE:
2122     case OP_NOT_HSPACE:
2123     switch(next)
2124     {
2125     case 0x09:
2126     case 0x20:
2127     case 0xa0:
2128     case 0x1680:
2129     case 0x180e:
2130     case 0x2000:
2131     case 0x2001:
2132     case 0x2002:
2133     case 0x2003:
2134     case 0x2004:
2135     case 0x2005:
2136     case 0x2006:
2137     case 0x2007:
2138     case 0x2008:
2139     case 0x2009:
2140     case 0x200A:
2141     case 0x202f:
2142     case 0x205f:
2143     case 0x3000:
2144     return op_code != OP_HSPACE;
2145     default:
2146     return op_code == OP_HSPACE;
2147     }
2148    
2149     case OP_VSPACE:
2150     case OP_NOT_VSPACE:
2151     switch(next)
2152     {
2153     case 0x0a:
2154     case 0x0b:
2155     case 0x0c:
2156     case 0x0d:
2157     case 0x85:
2158     case 0x2028:
2159     case 0x2029:
2160     return op_code != OP_VSPACE;
2161     default:
2162     return op_code == OP_VSPACE;
2163     }
2164    
2165 nigel 93 default:
2166     return FALSE;
2167     }
2168    
2169    
2170     /* Handle the case when the next item is \d, \s, etc. */
2171    
2172     switch(op_code)
2173     {
2174     case OP_CHAR:
2175     case OP_CHARNC:
2176     #ifdef SUPPORT_UTF8
2177     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2178     #endif
2179     switch(-next)
2180     {
2181     case ESC_d:
2182     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2183    
2184     case ESC_D:
2185     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2186    
2187     case ESC_s:
2188     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2189    
2190     case ESC_S:
2191     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2192    
2193     case ESC_w:
2194     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2195    
2196     case ESC_W:
2197     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2198 ph10 182
2199 ph10 180 case ESC_h:
2200     case ESC_H:
2201     switch(item)
2202     {
2203     case 0x09:
2204     case 0x20:
2205     case 0xa0:
2206     case 0x1680:
2207     case 0x180e:
2208     case 0x2000:
2209     case 0x2001:
2210     case 0x2002:
2211     case 0x2003:
2212     case 0x2004:
2213     case 0x2005:
2214     case 0x2006:
2215     case 0x2007:
2216     case 0x2008:
2217     case 0x2009:
2218     case 0x200A:
2219     case 0x202f:
2220     case 0x205f:
2221     case 0x3000:
2222     return -next != ESC_h;
2223     default:
2224     return -next == ESC_h;
2225 ph10 182 }
2226    
2227 ph10 180 case ESC_v:
2228     case ESC_V:
2229     switch(item)
2230     {
2231     case 0x0a:
2232     case 0x0b:
2233     case 0x0c:
2234     case 0x0d:
2235     case 0x85:
2236     case 0x2028:
2237     case 0x2029:
2238     return -next != ESC_v;
2239     default:
2240     return -next == ESC_v;
2241 ph10 182 }
2242 nigel 93
2243     default:
2244     return FALSE;
2245     }
2246    
2247     case OP_DIGIT:
2248 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2249     next == -ESC_h || next == -ESC_v;
2250 nigel 93
2251     case OP_NOT_DIGIT:
2252     return next == -ESC_d;
2253    
2254     case OP_WHITESPACE:
2255     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2256    
2257     case OP_NOT_WHITESPACE:
2258 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2259 nigel 93
2260 ph10 180 case OP_HSPACE:
2261     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2262    
2263     case OP_NOT_HSPACE:
2264     return next == -ESC_h;
2265 ph10 182
2266 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2267 ph10 182 case OP_VSPACE:
2268 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2269    
2270     case OP_NOT_VSPACE:
2271 ph10 182 return next == -ESC_v;
2272 ph10 180
2273 nigel 93 case OP_WORDCHAR:
2274 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2275 nigel 93
2276     case OP_NOT_WORDCHAR:
2277     return next == -ESC_w || next == -ESC_d;
2278 ph10 182
2279 nigel 93 default:
2280     return FALSE;
2281     }
2282    
2283     /* Control does not reach here */
2284     }
2285    
2286    
2287    
2288     /*************************************************
2289 nigel 77 * Compile one branch *
2290     *************************************************/
2291    
2292 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2293 nigel 77 changed during the branch, the pointer is used to change the external options
2294 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2295     to find out the amount of memory needed, as well as during the real compile
2296     phase. The value of lengthptr distinguishes the two phases.
2297 nigel 77
2298     Arguments:
2299     optionsptr pointer to the option bits
2300     codeptr points to the pointer to the current code point
2301     ptrptr points to the current pattern pointer
2302     errorcodeptr points to error code variable
2303     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2304     reqbyteptr set to the last literal character required, else < 0
2305     bcptr points to current branch chain
2306     cd contains pointers to tables etc.
2307 nigel 93 lengthptr NULL during the real compile phase
2308     points to length accumulator during pre-compile phase
2309 nigel 77
2310     Returns: TRUE on success
2311     FALSE, with *errorcodeptr set non-zero on error
2312     */
2313    
2314     static BOOL
2315 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2316     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2317     compile_data *cd, int *lengthptr)
2318 nigel 77 {
2319     int repeat_type, op_type;
2320     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2321     int bravalue = 0;
2322     int greedy_default, greedy_non_default;
2323     int firstbyte, reqbyte;
2324     int zeroreqbyte, zerofirstbyte;
2325     int req_caseopt, reqvary, tempreqvary;
2326     int options = *optionsptr;
2327     int after_manual_callout = 0;
2328 nigel 93 int length_prevgroup = 0;
2329 nigel 77 register int c;
2330     register uschar *code = *codeptr;
2331 nigel 93 uschar *last_code = code;
2332     uschar *orig_code = code;
2333 nigel 77 uschar *tempcode;
2334     BOOL inescq = FALSE;
2335     BOOL groupsetfirstbyte = FALSE;
2336     const uschar *ptr = *ptrptr;
2337     const uschar *tempptr;
2338     uschar *previous = NULL;
2339     uschar *previous_callout = NULL;
2340 nigel 93 uschar *save_hwm = NULL;
2341 nigel 77 uschar classbits[32];
2342    
2343     #ifdef SUPPORT_UTF8
2344     BOOL class_utf8;
2345     BOOL utf8 = (options & PCRE_UTF8) != 0;
2346     uschar *class_utf8data;
2347     uschar utf8_char[6];
2348     #else
2349     BOOL utf8 = FALSE;
2350 nigel 93 uschar *utf8_char = NULL;
2351 nigel 77 #endif
2352    
2353 nigel 93 #ifdef DEBUG
2354     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2355     #endif
2356    
2357 nigel 77 /* Set up the default and non-default settings for greediness */
2358    
2359     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2360     greedy_non_default = greedy_default ^ 1;
2361    
2362     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2363     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2364     matches a non-fixed char first char; reqbyte just remains unset if we never
2365     find one.
2366    
2367     When we hit a repeat whose minimum is zero, we may have to adjust these values
2368     to take the zero repeat into account. This is implemented by setting them to
2369     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2370     item types that can be repeated set these backoff variables appropriately. */
2371    
2372     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2373    
2374     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2375     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2376     value > 255. It is added into the firstbyte or reqbyte variables to record the
2377     case status of the value. This is used only for ASCII characters. */
2378    
2379     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2380    
2381     /* Switch on next character until the end of the branch */
2382    
2383     for (;; ptr++)
2384     {
2385     BOOL negate_class;
2386 ph10 264 BOOL should_flip_negation;
2387 nigel 77 BOOL possessive_quantifier;
2388     BOOL is_quantifier;
2389 nigel 93 BOOL is_recurse;
2390 ph10 180 BOOL reset_bracount;
2391 nigel 77 int class_charcount;
2392     int class_lastchar;
2393     int newoptions;
2394     int recno;
2395 ph10 172 int refsign;
2396 nigel 77 int skipbytes;
2397     int subreqbyte;
2398     int subfirstbyte;
2399 nigel 93 int terminator;
2400 nigel 77 int mclength;
2401     uschar mcbuffer[8];
2402    
2403 nigel 93 /* Get next byte in the pattern */
2404 nigel 77
2405     c = *ptr;
2406    
2407 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2408     previous cycle of this loop. */
2409    
2410     if (lengthptr != NULL)
2411     {
2412     #ifdef DEBUG
2413     if (code > cd->hwm) cd->hwm = code; /* High water info */
2414     #endif
2415     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2416     {
2417     *errorcodeptr = ERR52;
2418     goto FAILED;
2419     }
2420    
2421     /* There is at least one situation where code goes backwards: this is the
2422     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2423     the class is simply eliminated. However, it is created first, so we have to
2424     allow memory for it. Therefore, don't ever reduce the length at this point.
2425     */
2426    
2427     if (code < last_code) code = last_code;
2428 ph10 202
2429     /* Paranoid check for integer overflow */
2430    
2431     if (OFLOW_MAX - *lengthptr < code - last_code)
2432     {
2433     *errorcodeptr = ERR20;
2434     goto FAILED;
2435     }
2436    
2437 nigel 93 *lengthptr += code - last_code;
2438     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2439    
2440     /* If "previous" is set and it is not at the start of the work space, move
2441     it back to there, in order to avoid filling up the work space. Otherwise,
2442     if "previous" is NULL, reset the current code pointer to the start. */
2443    
2444     if (previous != NULL)
2445     {
2446     if (previous > orig_code)
2447     {
2448     memmove(orig_code, previous, code - previous);
2449     code -= previous - orig_code;
2450     previous = orig_code;
2451     }
2452     }
2453     else code = orig_code;
2454    
2455     /* Remember where this code item starts so we can pick up the length
2456     next time round. */
2457    
2458     last_code = code;
2459     }
2460    
2461     /* In the real compile phase, just check the workspace used by the forward
2462     reference list. */
2463    
2464     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2465     {
2466     *errorcodeptr = ERR52;
2467     goto FAILED;
2468     }
2469    
2470 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2471    
2472     if (inescq && c != 0)
2473     {
2474     if (c == '\\' && ptr[1] == 'E')
2475     {
2476     inescq = FALSE;
2477     ptr++;
2478     continue;
2479     }
2480     else
2481     {
2482     if (previous_callout != NULL)
2483     {
2484 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2485     complete_callout(previous_callout, ptr, cd);
2486 nigel 77 previous_callout = NULL;
2487     }
2488     if ((options & PCRE_AUTO_CALLOUT) != 0)
2489     {
2490     previous_callout = code;
2491     code = auto_callout(code, ptr, cd);
2492     }
2493     goto NORMAL_CHAR;
2494     }
2495     }
2496    
2497     /* Fill in length of a previous callout, except when the next thing is
2498     a quantifier. */
2499    
2500     is_quantifier = c == '*' || c == '+' || c == '?' ||
2501     (c == '{' && is_counted_repeat(ptr+1));
2502    
2503     if (!is_quantifier && previous_callout != NULL &&
2504     after_manual_callout-- <= 0)
2505     {
2506 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2507     complete_callout(previous_callout, ptr, cd);
2508 nigel 77 previous_callout = NULL;
2509     }
2510    
2511     /* In extended mode, skip white space and comments */
2512    
2513     if ((options & PCRE_EXTENDED) != 0)
2514     {
2515     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2516     if (c == '#')
2517     {
2518 nigel 93 while (*(++ptr) != 0)
2519 nigel 91 {
2520 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2521 nigel 91 }
2522 nigel 93 if (*ptr != 0) continue;
2523    
2524 nigel 91 /* Else fall through to handle end of string */
2525     c = 0;
2526 nigel 77 }
2527     }
2528    
2529     /* No auto callout for quantifiers. */
2530    
2531     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2532     {
2533     previous_callout = code;
2534     code = auto_callout(code, ptr, cd);
2535     }
2536    
2537     switch(c)
2538     {
2539 nigel 93 /* ===================================================================*/
2540     case 0: /* The branch terminates at string end */
2541     case '|': /* or | or ) */
2542 nigel 77 case ')':
2543     *firstbyteptr = firstbyte;
2544     *reqbyteptr = reqbyte;
2545     *codeptr = code;
2546     *ptrptr = ptr;
2547 nigel 93 if (lengthptr != NULL)
2548     {
2549 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2550     {
2551     *errorcodeptr = ERR20;
2552     goto FAILED;
2553     }
2554 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2555     DPRINTF((">> end branch\n"));
2556     }
2557 nigel 77 return TRUE;
2558    
2559 nigel 93
2560     /* ===================================================================*/
2561 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2562     the setting of any following char as a first character. */
2563    
2564     case '^':
2565     if ((options & PCRE_MULTILINE) != 0)
2566     {
2567     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2568     }
2569     previous = NULL;
2570     *code++ = OP_CIRC;
2571     break;
2572    
2573     case '$':
2574     previous = NULL;
2575     *code++ = OP_DOLL;
2576     break;
2577    
2578     /* There can never be a first char if '.' is first, whatever happens about
2579     repeats. The value of reqbyte doesn't change either. */
2580    
2581     case '.':
2582     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2583     zerofirstbyte = firstbyte;
2584     zeroreqbyte = reqbyte;
2585     previous = code;
2586     *code++ = OP_ANY;
2587     break;
2588    
2589 nigel 93
2590     /* ===================================================================*/
2591 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2592     32-byte bitmap of the permitted characters, except in the special case
2593     where there is only one such character. For negated classes, we build the
2594     map as usual, then invert it at the end. However, we use a different opcode
2595     so that data characters > 255 can be handled correctly.
2596 nigel 77
2597     If the class contains characters outside the 0-255 range, a different
2598     opcode is compiled. It may optionally have a bit map for characters < 256,
2599     but those above are are explicitly listed afterwards. A flag byte tells
2600     whether the bitmap is present, and whether this is a negated class or not.
2601     */
2602    
2603     case '[':
2604     previous = code;
2605    
2606     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2607     they are encountered at the top level, so we'll do that too. */
2608    
2609     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2610     check_posix_syntax(ptr, &tempptr, cd))
2611     {
2612     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2613     goto FAILED;
2614     }
2615    
2616 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2617 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2618 ph10 205 skip them too. This makes for compatibility with Perl. */
2619 ph10 208
2620 ph10 205 negate_class = FALSE;
2621     for (;;)
2622 nigel 77 {
2623     c = *(++ptr);
2624 ph10 205 if (c == '\\')
2625     {
2626 ph10 208 if (ptr[1] == 'E') ptr++;
2627 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2628 ph10 208 else break;
2629 ph10 205 }
2630     else if (!negate_class && c == '^')
2631     negate_class = TRUE;
2632     else break;
2633 ph10 208 }
2634 nigel 77
2635 ph10 264 /* If a class contains a negative special such as \S, we need to flip the
2636     negation flag at the end, so that support for characters > 255 works
2637     correctly (they are all included in the class). */
2638    
2639     should_flip_negation = FALSE;
2640    
2641 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2642 nigel 93 of just a single character (as long as it's < 256). However, For higher
2643     valued UTF-8 characters, we don't yet do any optimization. */
2644 nigel 77
2645     class_charcount = 0;
2646     class_lastchar = -1;
2647    
2648 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2649     temporary bit of memory, in case the class contains only 1 character (less
2650     than 256), because in that case the compiled code doesn't use the bit map.
2651     */
2652    
2653     memset(classbits, 0, 32 * sizeof(uschar));
2654    
2655 nigel 77 #ifdef SUPPORT_UTF8
2656     class_utf8 = FALSE; /* No chars >= 256 */
2657 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2658 nigel 77 #endif
2659    
2660     /* Process characters until ] is reached. By writing this as a "do" it
2661 nigel 93 means that an initial ] is taken as a data character. At the start of the
2662     loop, c contains the first byte of the character. */
2663 nigel 77
2664 nigel 93 if (c != 0) do
2665 nigel 77 {
2666 nigel 93 const uschar *oldptr;
2667    
2668 nigel 77 #ifdef SUPPORT_UTF8
2669     if (utf8 && c > 127)
2670     { /* Braces are required because the */
2671     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2672     }
2673     #endif
2674    
2675     /* Inside \Q...\E everything is literal except \E */
2676    
2677     if (inescq)
2678     {
2679 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2680 nigel 77 {
2681 nigel 93 inescq = FALSE; /* Reset literal state */
2682     ptr++; /* Skip the 'E' */
2683     continue; /* Carry on with next */
2684 nigel 77 }
2685 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2686 nigel 77 }
2687    
2688     /* Handle POSIX class names. Perl allows a negation extension of the
2689     form [:^name:]. A square bracket that doesn't match the syntax is
2690     treated as a literal. We also recognize the POSIX constructions
2691     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2692     5.6 and 5.8 do. */
2693    
2694     if (c == '[' &&
2695     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2696     check_posix_syntax(ptr, &tempptr, cd))
2697     {
2698     BOOL local_negate = FALSE;
2699 nigel 87 int posix_class, taboffset, tabopt;
2700 nigel 77 register const uschar *cbits = cd->cbits;
2701 nigel 87 uschar pbits[32];
2702 nigel 77
2703     if (ptr[1] != ':')
2704     {
2705     *errorcodeptr = ERR31;
2706     goto FAILED;
2707     }
2708    
2709     ptr += 2;
2710     if (*ptr == '^')
2711     {
2712     local_negate = TRUE;
2713 ph10 265 should_flip_negation = TRUE; /* Note negative special */
2714 nigel 77 ptr++;
2715     }
2716    
2717     posix_class = check_posix_name(ptr, tempptr - ptr);
2718     if (posix_class < 0)
2719     {
2720     *errorcodeptr = ERR30;
2721     goto FAILED;
2722     }
2723    
2724     /* If matching is caseless, upper and lower are converted to
2725     alpha. This relies on the fact that the class table starts with
2726     alpha, lower, upper as the first 3 entries. */
2727    
2728     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2729     posix_class = 0;
2730    
2731 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2732     because we may be adding and subtracting from it, and we don't want to
2733     subtract bits that may be in the main map already. At the end we or the
2734     result into the bit map that is being built. */
2735 nigel 77
2736     posix_class *= 3;
2737 nigel 87
2738     /* Copy in the first table (always present) */
2739    
2740     memcpy(pbits, cbits + posix_class_maps[posix_class],
2741     32 * sizeof(uschar));
2742    
2743     /* If there is a second table, add or remove it as required. */
2744    
2745     taboffset = posix_class_maps[posix_class + 1];
2746     tabopt = posix_class_maps[posix_class + 2];
2747    
2748     if (taboffset >= 0)
2749 nigel 77 {
2750 nigel 87 if (tabopt >= 0)
2751     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2752 nigel 77 else
2753 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2754 nigel 77 }
2755    
2756 nigel 87 /* Now see if we need to remove any special characters. An option
2757     value of 1 removes vertical space and 2 removes underscore. */
2758    
2759     if (tabopt < 0) tabopt = -tabopt;
2760     if (tabopt == 1) pbits[1] &= ~0x3c;
2761     else if (tabopt == 2) pbits[11] &= 0x7f;
2762    
2763     /* Add the POSIX table or its complement into the main table that is
2764     being built and we are done. */
2765    
2766     if (local_negate)
2767     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2768     else
2769     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2770    
2771 nigel 77 ptr = tempptr + 1;
2772     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2773     continue; /* End of POSIX syntax handling */
2774     }
2775    
2776     /* Backslash may introduce a single character, or it may introduce one
2777 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2778     case. Inside a class (and only there) it is treated as backspace.
2779     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2780 ph10 205 to 'or' into the one we are building. We assume they have more than one
2781 nigel 77 character in them, so set class_charcount bigger than one. */
2782    
2783     if (c == '\\')
2784     {
2785 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2786     if (*errorcodeptr != 0) goto FAILED;
2787 nigel 77
2788     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2789     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2790 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2791 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2792     {
2793     if (ptr[1] == '\\' && ptr[2] == 'E')
2794     {
2795     ptr += 2; /* avoid empty string */
2796     }
2797     else inescq = TRUE;
2798     continue;
2799     }
2800 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2801 nigel 77
2802     if (c < 0)
2803     {
2804     register const uschar *cbits = cd->cbits;
2805     class_charcount += 2; /* Greater than 1 is what matters */
2806 nigel 93
2807     /* Save time by not doing this in the pre-compile phase. */
2808    
2809     if (lengthptr == NULL) switch (-c)
2810 nigel 77 {
2811     case ESC_d:
2812     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2813     continue;
2814    
2815     case ESC_D:
2816 ph10 264 should_flip_negation = TRUE;
2817 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2818     continue;
2819    
2820     case ESC_w:
2821     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2822     continue;
2823    
2824     case ESC_W:
2825 ph10 264 should_flip_negation = TRUE;
2826 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2827     continue;
2828    
2829     case ESC_s:
2830     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2831     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2832     continue;
2833    
2834     case ESC_S:
2835 ph10 264 should_flip_negation = TRUE;
2836 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2837     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2838     continue;
2839    
2840 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2841     continue;
2842 ph10 180
2843 nigel 93 default: /* Not recognized; fall through */
2844     break; /* Need "default" setting to stop compiler warning. */
2845     }
2846    
2847     /* In the pre-compile phase, just do the recognition. */
2848    
2849     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2850     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2851 ph10 180
2852 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2853     they use extra memory. */
2854 ph10 180
2855 ph10 178 if (-c == ESC_h)
2856     {
2857     SETBIT(classbits, 0x09); /* HT */
2858     SETBIT(classbits, 0x20); /* SPACE */
2859 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2860 ph10 178 #ifdef SUPPORT_UTF8
2861     if (utf8)
2862 ph10 180 {
2863 ph10 178 class_utf8 = TRUE;
2864     *class_utf8data++ = XCL_SINGLE;
2865 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2866 ph10 178 *class_utf8data++ = XCL_SINGLE;
2867 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2868     *class_utf8data++ = XCL_RANGE;
2869     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2870     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2871 ph10 178 *class_utf8data++ = XCL_SINGLE;
2872 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2873 ph10 178 *class_utf8data++ = XCL_SINGLE;
2874 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2875 ph10 178 *class_utf8data++ = XCL_SINGLE;
2876 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2877     }
2878     #endif
2879     continue;
2880     }
2881 nigel 93
2882 ph10 178 if (-c == ESC_H)
2883     {
2884     for (c = 0; c < 32; c++)
2885     {
2886     int x = 0xff;
2887     switch (c)
2888 ph10 180 {
2889 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2890     case 0x20/8: x ^= 1 << (0x20%8); break;
2891     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2892     default: break;
2893     }
2894     classbits[c] |= x;
2895 ph10 180 }
2896    
2897 ph10 178 #ifdef SUPPORT_UTF8
2898     if (utf8)
2899 ph10 180 {
2900 ph10 178 class_utf8 = TRUE;
2901 ph10 180 *class_utf8data++ = XCL_RANGE;
2902     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2903     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2904     *class_utf8data++ = XCL_RANGE;
2905     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2906     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2907     *class_utf8data++ = XCL_RANGE;
2908     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2909     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2910     *class_utf8data++ = XCL_RANGE;
2911     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2912     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2913     *class_utf8data++ = XCL_RANGE;
2914     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2915     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2916     *class_utf8data++ = XCL_RANGE;
2917     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2918     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2919     *class_utf8data++ = XCL_RANGE;
2920     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2921     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2922     }
2923     #endif
2924     continue;
2925     }
2926 ph10 178
2927     if (-c == ESC_v)
2928     {
2929     SETBIT(classbits, 0x0a); /* LF */
2930     SETBIT(classbits, 0x0b); /* VT */
2931 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2932     SETBIT(classbits, 0x0d); /* CR */
2933     SETBIT(classbits, 0x85); /* NEL */
2934 ph10 178 #ifdef SUPPORT_UTF8
2935     if (utf8)
2936 ph10 180 {
2937 ph10 178 class_utf8 = TRUE;
2938 ph10 180 *class_utf8data++ = XCL_RANGE;
2939     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2940     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2941     }
2942     #endif
2943     continue;
2944     }
2945 ph10 178
2946     if (-c == ESC_V)
2947     {
2948     for (c = 0; c < 32; c++)
2949     {
2950     int x = 0xff;
2951     switch (c)
2952 ph10 180 {
2953 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2954     x ^= 1 << (0x0b%8);
2955     x ^= 1 << (0x0c%8);
2956 ph10 180 x ^= 1 << (0x0d%8);
2957 ph10 178 break;
2958     case 0x85/8: x ^= 1 << (0x85%8); break;
2959     default: break;
2960     }
2961     classbits[c] |= x;
2962 ph10 180 }
2963    
2964 ph10 178 #ifdef SUPPORT_UTF8
2965     if (utf8)
2966 ph10 180 {
2967 ph10 178 class_utf8 = TRUE;
2968 ph10 180 *class_utf8data++ = XCL_RANGE;
2969     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2970     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2971     *class_utf8data++ = XCL_RANGE;
2972     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2973     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2974     }
2975     #endif
2976     continue;
2977     }
2978 ph10 178
2979 nigel 93 /* We need to deal with \P and \p in both phases. */
2980    
2981 nigel 77 #ifdef SUPPORT_UCP
2982 nigel 93 if (-c == ESC_p || -c == ESC_P)
2983     {
2984     BOOL negated;
2985     int pdata;
2986     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2987     if (ptype < 0) goto FAILED;
2988     class_utf8 = TRUE;
2989     *class_utf8data++ = ((-c == ESC_p) != negated)?
2990     XCL_PROP : XCL_NOTPROP;
2991     *class_utf8data++ = ptype;
2992     *class_utf8data++ = pdata;
2993     class_charcount -= 2; /* Not a < 256 character */
2994 nigel 77 continue;
2995 nigel 93 }
2996 nigel 77 #endif
2997 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2998     strict mode. By default, for compatibility with Perl, they are
2999     treated as literals. */
3000 nigel 77
3001 nigel 93 if ((options & PCRE_EXTRA) != 0)
3002     {
3003     *errorcodeptr = ERR7;
3004     goto FAILED;
3005     }
3006 nigel 77
3007 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3008     c = *ptr; /* Get the final character and fall through */
3009 nigel 77 }
3010    
3011     /* Fall through if we have a single character (c >= 0). This may be
3012 nigel 93 greater than 256 in UTF-8 mode. */
3013 nigel 77
3014     } /* End of backslash handling */
3015    
3016     /* A single character may be followed by '-' to form a range. However,
3017     Perl does not permit ']' to be the end of the range. A '-' character
3018 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3019     entirely. The code for handling \Q and \E is messy. */
3020 nigel 77
3021 nigel 93 CHECK_RANGE:
3022     while (ptr[1] == '\\' && ptr[2] == 'E')
3023 nigel 77 {
3024 nigel 93 inescq = FALSE;
3025     ptr += 2;
3026     }
3027    
3028     oldptr = ptr;
3029 ph10 231
3030 ph10 230 /* Remember \r or \n */
3031 ph10 231
3032     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3033    
3034 ph10 230 /* Check for range */
3035 nigel 93
3036     if (!inescq && ptr[1] == '-')
3037     {
3038 nigel 77 int d;
3039     ptr += 2;
3040 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3041 nigel 77
3042 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3043     mode. */
3044    
3045     while (*ptr == '\\' && ptr[1] == 'Q')
3046     {
3047     ptr += 2;
3048     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3049     inescq = TRUE;
3050     break;
3051     }
3052    
3053     if (*ptr == 0 || (!inescq && *ptr == ']'))
3054     {
3055     ptr = oldptr;
3056     goto LONE_SINGLE_CHARACTER;
3057     }
3058    
3059 nigel 77 #ifdef SUPPORT_UTF8
3060     if (utf8)
3061     { /* Braces are required because the */
3062     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3063     }
3064     else
3065     #endif
3066     d = *ptr; /* Not UTF-8 mode */
3067    
3068     /* The second part of a range can be a single-character escape, but
3069     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3070     in such circumstances. */
3071    
3072 nigel 93 if (!inescq && d == '\\')
3073 nigel 77 {
3074 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3075     if (*errorcodeptr != 0) goto FAILED;
3076 nigel 77
3077 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
3078     special means the '-' was literal */
3079 nigel 77
3080     if (d < 0)
3081     {
3082     if (d == -ESC_b) d = '\b';
3083 nigel 93 else if (d == -ESC_X) d = 'X';
3084     else if (d == -ESC_R) d = 'R'; else
3085 nigel 77 {
3086 nigel 93 ptr = oldptr;
3087 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3088     }
3089     }
3090     }
3091    
3092 nigel 93 /* Check that the two values are in the correct order. Optimize
3093     one-character ranges */
3094 nigel 77
3095 nigel 93 if (d < c)
3096     {
3097     *errorcodeptr = ERR8;
3098     goto FAILED;
3099     }
3100    
3101 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3102    
3103 ph10 230 /* Remember \r or \n */
3104 ph10 231
3105     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3106    
3107 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3108     matching, we have to use an XCLASS with extra data items. Caseless
3109     matching for characters > 127 is available only if UCP support is
3110     available. */
3111    
3112     #ifdef SUPPORT_UTF8
3113     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3114     {
3115     class_utf8 = TRUE;
3116    
3117     /* With UCP support, we can find the other case equivalents of
3118     the relevant characters. There may be several ranges. Optimize how
3119     they fit with the basic range. */
3120    
3121     #ifdef SUPPORT_UCP
3122     if ((options & PCRE_CASELESS) != 0)
3123     {
3124 nigel 93 unsigned int occ, ocd;
3125     unsigned int cc = c;
3126     unsigned int origd = d;
3127 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3128     {
3129 ph10 180 if (occ >= (unsigned int)c &&
3130     ocd <= (unsigned int)d)
3131 ph10 176 continue; /* Skip embedded ranges */
3132 nigel 77
3133 ph10 180 if (occ < (unsigned int)c &&
3134 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3135 nigel 77 { /* if there is overlap, */
3136     c = occ; /* noting that if occ < c */
3137     continue; /* we can't have ocd > d */
3138     } /* because a subrange is */
3139 ph10 180 if (ocd > (unsigned int)d &&
3140 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3141 nigel 77 { /* the basic range. */
3142     d = ocd;
3143     continue;
3144     }
3145    
3146     if (occ == ocd)
3147     {
3148     *class_utf8data++ = XCL_SINGLE;
3149     }
3150     else
3151     {
3152     *class_utf8data++ = XCL_RANGE;
3153     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3154     }
3155     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3156     }
3157     }
3158     #endif /* SUPPORT_UCP */
3159    
3160     /* Now record the original range, possibly modified for UCP caseless
3161     overlapping ranges. */
3162    
3163     *class_utf8data++ = XCL_RANGE;
3164     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3165     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3166    
3167     /* With UCP support, we are done. Without UCP support, there is no
3168     caseless matching for UTF-8 characters > 127; we can use the bit map
3169     for the smaller ones. */
3170    
3171     #ifdef SUPPORT_UCP
3172     continue; /* With next character in the class */
3173     #else
3174     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3175    
3176     /* Adjust upper limit and fall through to set up the map */
3177    
3178     d = 127;
3179    
3180     #endif /* SUPPORT_UCP */
3181     }
3182     #endif /* SUPPORT_UTF8 */
3183    
3184     /* We use the bit map for all cases when not in UTF-8 mode; else
3185     ranges that lie entirely within 0-127 when there is UCP support; else
3186     for partial ranges without UCP support. */
3187    
3188 nigel 93 class_charcount += d - c + 1;
3189     class_lastchar = d;
3190    
3191     /* We can save a bit of time by skipping this in the pre-compile. */
3192    
3193     if (lengthptr == NULL) for (; c <= d; c++)
3194 nigel 77 {
3195     classbits[c/8] |= (1 << (c&7));
3196     if ((options & PCRE_CASELESS) != 0)
3197     {
3198     int uc = cd->fcc[c]; /* flip case */
3199     classbits[uc/8] |= (1 << (uc&7));
3200     }
3201     }
3202    
3203     continue; /* Go get the next char in the class */
3204     }
3205    
3206     /* Handle a lone single character - we can get here for a normal
3207     non-escape char, or after \ that introduces a single character or for an
3208     apparent range that isn't. */
3209    
3210     LONE_SINGLE_CHARACTER:
3211 ph10 231
3212 nigel 77 /* Handle a character that cannot go in the bit map */
3213    
3214     #ifdef SUPPORT_UTF8
3215     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3216     {
3217     class_utf8 = TRUE;
3218     *class_utf8data++ = XCL_SINGLE;
3219     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3220    
3221     #ifdef SUPPORT_UCP
3222     if ((options & PCRE_CASELESS) != 0)
3223     {
3224 nigel 93 unsigned int othercase;
3225     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3226 nigel 77 {
3227     *class_utf8data++ = XCL_SINGLE;
3228     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3229     }
3230     }
3231     #endif /* SUPPORT_UCP */
3232    
3233     }
3234     else
3235     #endif /* SUPPORT_UTF8 */
3236    
3237     /* Handle a single-byte character */
3238     {
3239     classbits[c/8] |= (1 << (c&7));
3240     if ((options & PCRE_CASELESS) != 0)
3241     {
3242     c = cd->fcc[c]; /* flip case */
3243     classbits[c/8] |= (1 << (c&7));
3244     }
3245     class_charcount++;
3246     class_lastchar = c;
3247     }
3248     }
3249    
3250 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3251 nigel 77
3252 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3253 nigel 77
3254 nigel 93 if (c == 0) /* Missing terminating ']' */
3255     {
3256     *errorcodeptr = ERR6;
3257     goto FAILED;
3258     }
3259 ph10 231
3260    
3261 ph10 230 /* This code has been disabled because it would mean that \s counts as
3262     an explicit \r or \n reference, and that's not really what is wanted. Now
3263     we set the flag only if there is a literal "\r" or "\n" in the class. */
3264 ph10 227
3265 ph10 230 #if 0
3266 ph10 226 /* Remember whether \r or \n are in this class */
3267 ph10 227
3268 ph10 226 if (negate_class)
3269     {
3270 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3271 ph10 226 }
3272     else
3273     {
3274 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3275 ph10 227 }
3276 ph10 230 #endif
3277 ph10 227
3278 ph10 231
3279 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3280 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3281     use of \p or \P, in other words, no use of any XCLASS features, we can
3282     optimize.
3283    
3284 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3285     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3286     operate on single-bytes only. This is an historical hangover. Maybe one day
3287     we can tidy these opcodes to handle multi-byte characters.
3288 nigel 77
3289     The optimization throws away the bit map. We turn the item into a
3290     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3291     that OP_NOT does not support multibyte characters. In the positive case, it
3292     can cause firstbyte to be set. Otherwise, there can be no first char if
3293     this item is first, whatever repeat count may follow. In the case of
3294     reqbyte, save the previous value for reinstating. */
3295    
3296     #ifdef SUPPORT_UTF8
3297 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3298 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3299 nigel 77 #else
3300     if (class_charcount == 1)
3301     #endif
3302     {
3303     zeroreqbyte = reqbyte;
3304    
3305     /* The OP_NOT opcode works on one-byte characters only. */
3306    
3307     if (negate_class)
3308     {
3309     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3310     zerofirstbyte = firstbyte;
3311     *code++ = OP_NOT;
3312     *code++ = class_lastchar;
3313     break;
3314     }
3315    
3316     /* For a single, positive character, get the value into mcbuffer, and
3317     then we can handle this with the normal one-character code. */
3318    
3319     #ifdef SUPPORT_UTF8
3320     if (utf8 && class_lastchar > 127)
3321     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3322     else
3323     #endif
3324     {
3325     mcbuffer[0] = class_lastchar;
3326     mclength = 1;
3327     }
3328     goto ONE_CHAR;
3329     } /* End of 1-char optimization */
3330    
3331     /* The general case - not the one-char optimization. If this is the first
3332     thing in the branch, there can be no first char setting, whatever the
3333     repeat count. Any reqbyte setting must remain unchanged after any kind of
3334     repeat. */
3335    
3336     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3337     zerofirstbyte = firstbyte;
3338     zeroreqbyte = reqbyte;
3339    
3340     /* If there are characters with values > 255, we have to compile an
3341 ph10 264 extended class, with its own opcode, unless there was a negated special
3342     such as \S in the class, because in that case all characters > 255 are in
3343     the class, so any that were explicitly given as well can be ignored. If
3344     (when there are explicit characters > 255 that must be listed) there are no
3345     characters < 256, we can omit the bitmap in the actual compiled code. */
3346 nigel 77
3347     #ifdef SUPPORT_UTF8
3348 ph10 264 if (class_utf8 && !should_flip_negation)
3349 nigel 77 {
3350     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3351     *code++ = OP_XCLASS;
3352     code += LINK_SIZE;
3353     *code = negate_class? XCL_NOT : 0;
3354    
3355 nigel 93 /* If the map is required, move up the extra data to make room for it;
3356     otherwise just move the code pointer to the end of the extra data. */
3357 nigel 77
3358     if (class_charcount > 0)
3359     {
3360     *code++ |= XCL_MAP;
3361 nigel 93 memmove(code + 32, code, class_utf8data - code);
3362 nigel 77 memcpy(code, classbits, 32);
3363 nigel 93 code = class_utf8data + 32;
3364 nigel 77 }
3365 nigel 93 else code = class_utf8data;
3366 nigel 77
3367     /* Now fill in the complete length of the item */
3368    
3369     PUT(previous, 1, code - previous);
3370     break; /* End of class handling */
3371     }
3372     #endif
3373    
3374 ph10 264 /* If there are no characters > 255, set the opcode to OP_CLASS or
3375     OP_NCLASS, depending on whether the whole class was negated and whether
3376     there were negative specials such as \S in the class. Then copy the 32-byte
3377     map into the code vector, negating it if necessary. */
3378    
3379     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3380 nigel 77 if (negate_class)
3381     {
3382 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3383     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3384 nigel 77 }
3385     else
3386     {
3387     memcpy(code, classbits, 32);
3388     }
3389     code += 32;
3390     break;
3391    
3392 nigel 93
3393     /* ===================================================================*/
3394 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3395     has been tested above. */
3396    
3397     case '{':
3398     if (!is_quantifier) goto NORMAL_CHAR;
3399     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3400     if (*errorcodeptr != 0) goto FAILED;
3401     goto REPEAT;
3402    
3403     case '*':
3404     repeat_min = 0;
3405     repeat_max = -1;
3406     goto REPEAT;
3407    
3408     case '+':
3409     repeat_min = 1;
3410     repeat_max = -1;
3411     goto REPEAT;
3412    
3413     case '?':
3414     repeat_min = 0;
3415     repeat_max = 1;
3416    
3417     REPEAT:
3418     if (previous == NULL)
3419     {
3420     *errorcodeptr = ERR9;
3421     goto FAILED;
3422     }
3423    
3424     if (repeat_min == 0)
3425     {
3426     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3427     reqbyte = zeroreqbyte; /* Ditto */
3428     }
3429    
3430     /* Remember whether this is a variable length repeat */
3431    
3432     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3433    
3434     op_type = 0; /* Default single-char op codes */
3435     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3436    
3437     /* Save start of previous item, in case we have to move it up to make space
3438     for an inserted OP_ONCE for the additional '+' extension. */
3439    
3440     tempcode = previous;
3441    
3442     /* If the next character is '+', we have a possessive quantifier. This
3443     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3444     If the next character is '?' this is a minimizing repeat, by default,
3445     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3446     repeat type to the non-default. */
3447    
3448     if (ptr[1] == '+')
3449     {
3450     repeat_type = 0; /* Force greedy */
3451     possessive_quantifier = TRUE;
3452     ptr++;
3453     }
3454     else if (ptr[1] == '?')
3455     {
3456     repeat_type = greedy_non_default;
3457     ptr++;
3458     }
3459     else repeat_type = greedy_default;
3460    
3461     /* If previous was a character match, abolish the item and generate a
3462     repeat item instead. If a char item has a minimum of more than one, ensure
3463     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3464     the first thing in a branch because the x will have gone into firstbyte
3465     instead. */
3466    
3467     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3468     {
3469     /* Deal with UTF-8 characters that take up more than one byte. It's
3470     easier to write this out separately than try to macrify it. Use c to
3471     hold the length of the character in bytes, plus 0x80 to flag that it's a
3472     length rather than a small character. */
3473    
3474     #ifdef SUPPORT_UTF8
3475     if (utf8 && (code[-1] & 0x80) != 0)
3476     {
3477     uschar *lastchar = code - 1;
3478     while((*lastchar & 0xc0) == 0x80) lastchar--;
3479     c = code - lastchar; /* Length of UTF-8 character */
3480     memcpy(utf8_char, lastchar, c); /* Save the char */
3481     c |= 0x80; /* Flag c as a length */
3482     }
3483     else
3484     #endif
3485    
3486     /* Handle the case of a single byte - either with no UTF8 support, or
3487     with UTF-8 disabled, or for a UTF-8 character < 128. */
3488    
3489     {
3490     c = code[-1];
3491     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3492     }
3493    
3494 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3495     the line is something that cannot possibly match this character. If so,
3496     automatically possessifying this item gains some performance in the case
3497     where the match fails. */
3498    
3499     if (!possessive_quantifier &&
3500     repeat_max < 0 &&
3501     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3502     options, cd))
3503     {
3504     repeat_type = 0; /* Force greedy */
3505     possessive_quantifier = TRUE;
3506     }
3507    
3508 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3509     }
3510    
3511     /* If previous was a single negated character ([^a] or similar), we use
3512     one of the special opcodes, replacing it. The code is shared with single-
3513     character repeats by setting op_type to add a suitable offset into
3514 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3515     currently used only for single-byte chars. */
3516 nigel 77
3517     else if (*previous == OP_NOT)
3518     {
3519     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3520     c = previous[1];
3521 nigel 93 if (!possessive_quantifier &&
3522     repeat_max < 0 &&
3523     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3524     {
3525     repeat_type = 0; /* Force greedy */
3526     possessive_quantifier = TRUE;
3527     }
3528 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3529     }
3530    
3531     /* If previous was a character type match (\d or similar), abolish it and
3532     create a suitable repeat item. The code is shared with single-character
3533     repeats by setting op_type to add a suitable offset into repeat_type. Note
3534     that the Unicode property types will be present only when SUPPORT_UCP is
3535     defined, but we don't wrap the little bits of code here because it just
3536     makes it horribly messy. */
3537    
3538     else if (*previous < OP_EODN)
3539     {
3540     uschar *oldcode;
3541 nigel 87 int prop_type, prop_value;
3542 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3543     c = *previous;
3544    
3545 nigel 93 if (!possessive_quantifier &&
3546     repeat_max < 0 &&
3547     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3548     {
3549     repeat_type = 0; /* Force greedy */
3550     possessive_quantifier = TRUE;
3551     }
3552    
3553 nigel 77 OUTPUT_SINGLE_REPEAT:
3554 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3555     {
3556     prop_type = previous[1];
3557     prop_value = previous[2];
3558     }
3559     else prop_type = prop_value = -1;
3560 nigel 77
3561     oldcode = code;
3562     code = previous; /* Usually overwrite previous item */
3563    
3564     /* If the maximum is zero then the minimum must also be zero; Perl allows
3565     this case, so we do too - by simply omitting the item altogether. */
3566    
3567     if (repeat_max == 0) goto END_REPEAT;
3568    
3569     /* All real repeats make it impossible to handle partial matching (maybe
3570     one day we will be able to remove this restriction). */
3571    
3572 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3573 nigel 77
3574     /* Combine the op_type with the repeat_type */
3575    
3576     repeat_type += op_type;
3577    
3578     /* A minimum of zero is handled either as the special case * or ?, or as
3579     an UPTO, with the maximum given. */
3580    
3581     if (repeat_min == 0)
3582     {
3583     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3584     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3585     else
3586     {
3587     *code++ = OP_UPTO + repeat_type;
3588     PUT2INC(code, 0, repeat_max);
3589     }
3590     }
3591    
3592     /* A repeat minimum of 1 is optimized into some special cases. If the
3593 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3594 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3595     one less than the maximum. */
3596    
3597     else if (repeat_min == 1)
3598     {
3599     if (repeat_max == -1)
3600     *code++ = OP_PLUS + repeat_type;
3601     else
3602     {
3603     code = oldcode; /* leave previous item in place */
3604     if (repeat_max == 1) goto END_REPEAT;
3605     *code++ = OP_UPTO + repeat_type;
3606     PUT2INC(code, 0, repeat_max - 1);
3607     }
3608     }
3609    
3610     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3611     handled as an EXACT followed by an UPTO. */
3612    
3613     else
3614     {
3615     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3616     PUT2INC(code, 0, repeat_min);
3617    
3618     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3619     we have to insert the character for the previous code. For a repeated
3620 nigel 87 Unicode property match, there are two extra bytes that define the
3621 nigel 77 required property. In UTF-8 mode, long characters have their length in
3622     c, with the 0x80 bit as a flag. */
3623    
3624     if (repeat_max < 0)
3625     {
3626     #ifdef SUPPORT_UTF8
3627     if (utf8 && c >= 128)
3628     {
3629     memcpy(code, utf8_char, c & 7);
3630     code += c & 7;
3631     }
3632     else
3633     #endif
3634     {
3635     *code++ = c;
3636 nigel 87 if (prop_type >= 0)
3637     {
3638     *code++ = prop_type;
3639     *code++ = prop_value;
3640     }
3641 nigel 77 }
3642     *code++ = OP_STAR + repeat_type;
3643     }
3644    
3645     /* Else insert an UPTO if the max is greater than the min, again
3646 nigel 93 preceded by the character, for the previously inserted code. If the
3647     UPTO is just for 1 instance, we can use QUERY instead. */
3648 nigel 77
3649     else if (repeat_max != repeat_min)
3650     {
3651     #ifdef SUPPORT_UTF8
3652     if (utf8 && c >= 128)
3653     {
3654     memcpy(code, utf8_char, c & 7);
3655     code += c & 7;
3656     }
3657     else
3658     #endif
3659     *code++ = c;
3660 nigel 87 if (prop_type >= 0)
3661     {
3662     *code++ = prop_type;
3663     *code++ = prop_value;
3664     }
3665 nigel 77 repeat_max -= repeat_min;
3666 nigel 93
3667     if (repeat_max == 1)
3668     {
3669     *code++ = OP_QUERY + repeat_type;
3670     }
3671     else
3672     {
3673     *code++ = OP_UPTO + repeat_type;
3674     PUT2INC(code, 0, repeat_max);
3675     }
3676 nigel 77 }
3677     }
3678    
3679     /* The character or character type itself comes last in all cases. */
3680    
3681     #ifdef SUPPORT_UTF8
3682     if (utf8 && c >= 128)
3683     {
3684     memcpy(code, utf8_char, c & 7);
3685     code += c & 7;
3686     }
3687     else
3688     #endif
3689     *code++ = c;
3690    
3691 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3692     define the required property. */
3693 nigel 77
3694     #ifdef SUPPORT_UCP
3695 nigel 87 if (prop_type >= 0)
3696     {
3697     *code++ = prop_type;
3698     *code++ = prop_value;
3699     }
3700 nigel 77 #endif
3701     }
3702    
3703     /* If previous was a character class or a back reference, we put the repeat
3704     stuff after it, but just skip the item if the repeat was {0,0}. */
3705    
3706     else if (*previous == OP_CLASS ||
3707     *previous == OP_NCLASS ||
3708     #ifdef SUPPORT_UTF8
3709     *previous == OP_XCLASS ||
3710     #endif
3711     *previous == OP_REF)
3712     {
3713     if (repeat_max == 0)
3714     {
3715     code = previous;
3716     goto END_REPEAT;
3717     }
3718    
3719     /* All real repeats make it impossible to handle partial matching (maybe
3720     one day we will be able to remove this restriction). */
3721    
3722 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3723 nigel 77
3724     if (repeat_min == 0 && repeat_max == -1)
3725     *code++ = OP_CRSTAR + repeat_type;
3726     else if (repeat_min == 1 && repeat_max == -1)
3727     *code++ = OP_CRPLUS + repeat_type;
3728     else if (repeat_min == 0 && repeat_max == 1)
3729     *code++ = OP_CRQUERY + repeat_type;
3730     else
3731     {
3732     *code++ = OP_CRRANGE + repeat_type;
3733     PUT2INC(code, 0, repeat_min);
3734     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3735     PUT2INC(code, 0, repeat_max);
3736     }
3737     }
3738    
3739     /* If previous was a bracket group, we may have to replicate it in certain
3740     cases. */
3741    
3742 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3743     *previous == OP_ONCE || *previous == OP_COND)
3744 nigel 77 {
3745     register int i;
3746     int ketoffset = 0;
3747     int len = code - previous;
3748     uschar *bralink = NULL;
3749    
3750 nigel 93 /* Repeating a DEFINE group is pointless */
3751    
3752     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3753     {
3754     *errorcodeptr = ERR55;
3755     goto FAILED;
3756     }
3757    
3758 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3759     by scanning through from the start, and compute the offset back to it
3760     from the current code pointer. There may be an OP_OPT setting following
3761     the final KET, so we can't find the end just by going back from the code
3762     pointer. */
3763    
3764     if (repeat_max == -1)
3765     {
3766     register uschar *ket = previous;
3767     do ket += GET(ket, 1); while (*ket != OP_KET);
3768     ketoffset = code - ket;
3769     }
3770    
3771     /* The case of a zero minimum is special because of the need to stick
3772     OP_BRAZERO in front of it, and because the group appears once in the
3773     data, whereas in other cases it appears the minimum number of times. For
3774     this reason, it is simplest to treat this case separately, as otherwise
3775     the code gets far too messy. There are several special subcases when the
3776     minimum is zero. */
3777    
3778     if (repeat_min == 0)
3779     {
3780     /* If the maximum is also zero, we just omit the group from the output
3781     altogether. */
3782    
3783     if (repeat_max == 0)
3784     {
3785     code = previous;
3786     goto END_REPEAT;
3787     }
3788    
3789     /* If the maximum is 1 or unlimited, we just have to stick in the
3790     BRAZERO and do no more at this point. However, we do need to adjust
3791     any OP_RECURSE calls inside the group that refer to the group itself or
3792 nigel 93 any internal or forward referenced group, because the offset is from
3793     the start of the whole regex. Temporarily terminate the pattern while
3794     doing this. */
3795 nigel 77
3796     if (repeat_max <= 1)
3797     {
3798     *code = OP_END;
3799 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3800 nigel 77 memmove(previous+1, previous, len);
3801     code++;
3802     *previous++ = OP_BRAZERO + repeat_type;
3803     }
3804    
3805     /* If the maximum is greater than 1 and limited, we have to replicate
3806     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3807     The first one has to be handled carefully because it's the original
3808     copy, which has to be moved up. The remainder can be handled by code
3809     that is common with the non-zero minimum case below. We have to
3810     adjust the value of repeat_max, since one less copy is required. Once
3811     again, we may have to adjust any OP_RECURSE calls inside the group. */
3812    
3813     else
3814     {
3815     int offset;
3816     *code = OP_END;
3817 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3818 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3819     code += 2 + LINK_SIZE;
3820     *previous++ = OP_BRAZERO + repeat_type;
3821     *previous++ = OP_BRA;
3822    
3823     /* We chain together the bracket offset fields that have to be
3824     filled in later when the ends of the brackets are reached. */
3825    
3826     offset = (bralink == NULL)? 0 : previous - bralink;
3827     bralink = previous;
3828     PUTINC(previous, 0, offset);
3829     }
3830    
3831     repeat_max--;
3832     }
3833    
3834     /* If the minimum is greater than zero, replicate the group as many
3835     times as necessary, and adjust the maximum to the number of subsequent
3836     copies that we need. If we set a first char from the group, and didn't
3837 nigel 93 set a required char, copy the latter from the former. If there are any
3838     forward reference subroutine calls in the group, there will be entries on
3839     the workspace list; replicate these with an appropriate increment. */
3840 nigel 77
3841     else
3842     {
3843     if (repeat_min > 1)
3844     {
3845 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3846 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3847     potential integer overflow. */
3848 nigel 93
3849     if (lengthptr != NULL)
3850 ph10 202 {
3851     int delta = (repeat_min - 1)*length_prevgroup;
3852     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3853     (double)INT_MAX ||
3854     OFLOW_MAX - *lengthptr < delta)
3855     {
3856     *errorcodeptr = ERR20;
3857     goto FAILED;
3858     }
3859     *lengthptr += delta;
3860     }
3861 nigel 93
3862     /* This is compiling for real */
3863    
3864     else
3865 nigel 77 {
3866 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3867     for (i = 1; i < repeat_min; i++)
3868     {
3869     uschar *hc;
3870     uschar *this_hwm = cd->hwm;
3871     memcpy(code, previous, len);
3872     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3873     {
3874     PUT(cd->hwm, 0, GET(hc, 0) + len);
3875     cd->hwm += LINK_SIZE;
3876     }
3877     save_hwm = this_hwm;
3878     code += len;
3879     }
3880 nigel 77 }
3881     }
3882 nigel 93
3883 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3884     }
3885    
3886     /* This code is common to both the zero and non-zero minimum cases. If
3887     the maximum is limited, it replicates the group in a nested fashion,
3888     remembering the bracket starts on a stack. In the case of a zero minimum,
3889     the first one was set up above. In all cases the repeat_max now specifies
3890 nigel 93 the number of additional copies needed. Again, we must remember to
3891     replicate entries on the forward reference list. */
3892 nigel 77
3893     if (repeat_max >= 0)
3894     {
3895 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3896     just adjust the length as if we had. For each repetition we must add 1
3897     to the length for BRAZERO and for all but the last repetition we must
3898 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3899     paranoid checks to avoid integer overflow. */
3900 nigel 93
3901     if (lengthptr != NULL && repeat_max > 0)
3902 ph10 202 {
3903     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3904     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3905     if ((double)repeat_max *
3906     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3907     > (double)INT_MAX ||
3908     OFLOW_MAX - *lengthptr < delta)
3909     {
3910     *errorcodeptr = ERR20;
3911     goto FAILED;
3912     }
3913     *lengthptr += delta;
3914     }
3915 nigel 93
3916     /* This is compiling for real */
3917    
3918     else for (i = repeat_max - 1; i >= 0; i--)
3919 nigel 77 {
3920 nigel 93 uschar *hc;
3921     uschar *this_hwm = cd->hwm;
3922    
3923 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3924    
3925     /* All but the final copy start a new nesting, maintaining the
3926     chain of brackets outstanding. */
3927    
3928     if (i != 0)
3929     {
3930     int offset;
3931     *code++ = OP_BRA;
3932     offset = (bralink == NULL)? 0 : code - bralink;
3933     bralink = code;
3934     PUTINC(code, 0, offset);
3935     }
3936    
3937     memcpy(code, previous, len);
3938 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3939     {
3940     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3941     cd->hwm += LINK_SIZE;
3942     }
3943     save_hwm = this_hwm;
3944 nigel 77 code += len;
3945     }
3946    
3947     /* Now chain through the pending brackets, and fill in their length
3948     fields (which are holding the chain links pro tem). */
3949    
3950     while (bralink != NULL)
3951     {
3952     int oldlinkoffset;
3953     int offset = code - bralink + 1;
3954     uschar *bra = code - offset;
3955     oldlinkoffset = GET(bra, 1);
3956     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3957     *code++ = OP_KET;
3958     PUTINC(code, 0, offset);
3959     PUT(bra, 1, offset);
3960     }
3961     }
3962    
3963     /* If the maximum is unlimited, set a repeater in the final copy. We
3964     can't just offset backwards from the current code point, because we
3965     don't know if there's been an options resetting after the ket. The
3966 nigel 93 correct offset was computed above.
3967 nigel 77
3968 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3969     this group is a non-atomic one that could match an empty string. If so,
3970     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3971     that runtime checking can be done. [This check is also applied to
3972     atomic groups at runtime, but in a different way.] */
3973    
3974     else
3975     {
3976     uschar *ketcode = code - ketoffset;
3977     uschar *bracode = ketcode - GET(ketcode, 1);
3978     *ketcode = OP_KETRMAX + repeat_type;
3979     if (lengthptr == NULL && *bracode != OP_ONCE)
3980     {
3981     uschar *scode = bracode;
3982     do
3983     {
3984     if (could_be_empty_branch(scode, ketcode, utf8))
3985     {
3986     *bracode += OP_SBRA - OP_BRA;
3987     break;
3988     }
3989     scode += GET(scode, 1);
3990     }
3991     while (*scode == OP_ALT);
3992     }
3993     }
3994 nigel 77 }
3995    
3996     /* Else there's some kind of shambles */
3997    
3998     else
3999     {
4000     *errorcodeptr = ERR11;
4001     goto FAILED;
4002     }
4003    
4004 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4005     tests above succeeded, possessive_quantifier is TRUE. For some of the
4006     simpler opcodes, there is a special alternative opcode for this. For
4007     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4008     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4009     but the special opcodes can optimize it a bit. The repeated item starts at
4010     tempcode, not at previous, which might be the first part of a string whose
4011     (former) last char we repeated.
4012 nigel 77
4013 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4014     an 'upto' may follow. We skip over an 'exact' item, and then test the
4015     length of what remains before proceeding. */
4016    
4017 nigel 77 if (possessive_quantifier)
4018     {
4019 nigel 93 int len;
4020     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4021     *tempcode == OP_NOTEXACT)
4022     tempcode += _pcre_OP_lengths[*tempcode];
4023     len = code - tempcode;
4024     if (len > 0) switch (*tempcode)
4025     {
4026     case OP_STAR: *tempcode = OP_POSSTAR; break;
4027     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4028     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4029     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4030    
4031     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4032     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4033     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4034     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4035    
4036     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4037     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4038     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4039     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4040    
4041     default:
4042     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4043     code += 1 + LINK_SIZE;
4044     len += 1 + LINK_SIZE;
4045     tempcode[0] = OP_ONCE;
4046     *code++ = OP_KET;
4047     PUTINC(code, 0, len);
4048     PUT(tempcode, 1, len);
4049     break;
4050     }
4051 nigel 77 }
4052    
4053     /* In all cases we no longer have a previous item. We also set the
4054     "follows varying string" flag for subsequently encountered reqbytes if
4055     it isn't already set and we have just passed a varying length item. */
4056    
4057     END_REPEAT:
4058     previous = NULL;
4059     cd->req_varyopt |= reqvary;
4060     break;
4061    
4062    
4063 nigel 93 /* ===================================================================*/
4064     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4065     lookbehind or option setting or condition or all the other extended
4066 ph10 210 parenthesis forms. */
4067 nigel 77
4068     case '(':
4069     newoptions = options;
4070     skipbytes = 0;
4071 nigel 93 bravalue = OP_CBRA;
4072     save_hwm = cd->hwm;
4073 ph10 180 reset_bracount = FALSE;
4074 ph10 211
4075 ph10 210 /* First deal with various "verbs" that can be introduced by '*'. */
4076 ph10 211
4077 ph10 210 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4078     {
4079 ph10 211 int i, namelen;
4080 ph10 243 const char *vn = verbnames;
4081 ph10 210 const uschar *name = ++ptr;
4082     previous = NULL;
4083     while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4084     if (*ptr == ':')
4085     {
4086     *errorcodeptr = ERR59; /* Not supported */
4087 ph10 211 goto FAILED;
4088     }
4089 ph10 210 if (*ptr != ')')
4090     {
4091     *errorcodeptr = ERR60;
4092     goto FAILED;
4093     }
4094 ph10 211 namelen = ptr - name;
4095 ph10 210 for (i = 0; i < verbcount; i++)
4096 ph10