/* Source listing: /[pcre]/code/trunk/pcre_compile.c (ViewVC, with annotations)
   Revision 268, committed Thu Nov 15 10:28:09 2007 UTC by ph10
   File MIME type: text/plain; file size: 194795 byte(s)
   Log message: Fix (?&) non-diagnosis bug and missing length check for (?&a)
   etc. */

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is a byte array
holding the bitmap and "b" is the bit number. Both arguments and the whole
expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 1) or SETBIT(base + 32, n), index and shift correctly. Note
that "b" is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written; XSTRING(s) goes through one extra level so that a
macro argument is expanded first and it is the macro's value that is turned
into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244     "unrecognized character after (?\0"
245     "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304     "subpattern name expected after (?&";
305 nigel 77
306    
307     /* Table to identify digits and hex digits. This is used when compiling
308     patterns. Note that the tables in chartables are dependent on the locale, and
309     may mark arbitrary characters as digits - but the PCRE compiling code expects
310     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
311     a private table here. It costs 256 bytes, but it is a lot faster than doing
312     character value tests (at least in some simple cases I timed), and in some
313     applications one wants PCRE to compile efficiently as well as match
314     efficiently.
315    
316     For convenience, we use the same bit definitions as in chartables:
317    
318     0x04 decimal digit
319     0x08 hexadecimal digit
320    
321     Then we can use ctype_digit and ctype_xdigit in the code. */
322    
#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */

/* Indexed by character code: 0x0c (decimal + hex digit) for '0'-'9', 0x08
(hex digit only) for 'a'-'f' and 'A'-'F', zero for everything else. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else           /* This is the "abnormal" case, for EBCDIC systems */

/* Same bit meanings as the ASCII table, indexed by EBCDIC character code. The
rightmost column of each comment is the hexadecimal code of the row's first
entry. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
429    
430    
431     /* Definition to allow mutual recursion */
432    
433     static BOOL
434 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
435 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
436 nigel 77
437    
438    
439     /*************************************************
440 ph10 240 * Find an error text *
441     *************************************************/
442    
443 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
444     some of the text is of unknown length, we can't use a table of offsets.
445     Instead, just count through the strings. This is not a performance issue
446 ph10 240 because it happens only when there has been a compilation error.
447    
448     Argument: the error number
449     Returns: pointer to the error string
450     */
451    
452     static const char *
453     find_error_text(int n)
454     {
455     const char *s = error_texts;
456 ph10 243 for (; n > 0; n--) while (*s++ != 0);
457 ph10 240 return s;
458     }
459    
460    
461     /*************************************************
462 nigel 77 * Handle escapes *
463     *************************************************/
464    
465     /* This function is called when a \ has been encountered. It either returns a
466     positive value for a simple escape such as \n, or a negative value which
467 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
468     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
469     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
470     ptr is pointing at the \. On exit, it is on the final character of the escape
471     sequence.
472 nigel 77
473     Arguments:
474     ptrptr points to the pattern position pointer
475     errorcodeptr points to the errorcode variable
476     bracount number of previous extracting brackets
477     options the options bits
478     isclass TRUE if inside a character class
479    
480     Returns: zero or positive => a data character
481     negative => a special escape sequence
482 ph10 213 on error, errorcodeptr is set
483 nigel 77 */
484    
485     static int
486     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
487     int options, BOOL isclass)
488     {
489 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
490     const uschar *ptr = *ptrptr + 1;
491 nigel 77 int c, i;
492    
493 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
494     ptr--; /* Set pointer back to the last byte */
495    
496 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
497    
498     if (c == 0) *errorcodeptr = ERR1;
499    
500     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
501     a table. A non-zero result is something that can be returned immediately.
502     Otherwise further processing may be required. */
503    
504 ph10 97 #ifndef EBCDIC /* ASCII coding */
505 nigel 77 else if (c < '0' || c > 'z') {} /* Not alphameric */
506     else if ((i = escapes[c - '0']) != 0) c = i;
507    
508 ph10 97 #else /* EBCDIC coding */
509 nigel 77 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
510     else if ((i = escapes[c - 0x48]) != 0) c = i;
511     #endif
512    
513     /* Escapes that need further processing, or are illegal. */
514    
515     else
516     {
517     const uschar *oldptr;
518 nigel 93 BOOL braced, negated;
519    
520 nigel 77 switch (c)
521     {
522     /* A number of Perl escapes are not handled by PCRE. We give an explicit
523     error. */
524    
525     case 'l':
526     case 'L':
527     case 'N':
528     case 'u':
529     case 'U':
530     *errorcodeptr = ERR37;
531     break;
532    
533 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
534     is an absolute backreference. If negative, it is a relative backreference.
535 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
536     reference to a named group. This is part of Perl's movement towards a
537     unified syntax for back references. As this is synonymous with \k{name}, we
538 ph10 171 fudge it up by pretending it really was \k. */
539 nigel 93
540     case 'g':
541     if (ptr[1] == '{')
542     {
543 ph10 171 const uschar *p;
544     for (p = ptr+2; *p != 0 && *p != '}'; p++)
545     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
546 ph10 172 if (*p != 0 && *p != '}')
547 ph10 171 {
548     c = -ESC_k;
549     break;
550 ph10 172 }
551 nigel 93 braced = TRUE;
552     ptr++;
553     }
554     else braced = FALSE;
555    
556     if (ptr[1] == '-')
557     {
558     negated = TRUE;
559     ptr++;
560     }
561     else negated = FALSE;
562    
563     c = 0;
564     while ((digitab[ptr[1]] & ctype_digit) != 0)
565     c = c * 10 + *(++ptr) - '0';
566 ph10 220
567 ph10 213 if (c < 0)
568     {
569     *errorcodeptr = ERR61;
570     break;
571 ph10 220 }
572 nigel 93
573     if (c == 0 || (braced && *(++ptr) != '}'))
574     {
575     *errorcodeptr = ERR57;
576 ph10 213 break;
577 nigel 93 }
578    
579     if (negated)
580     {
581     if (c > bracount)
582     {
583     *errorcodeptr = ERR15;
584 ph10 213 break;
585 nigel 93 }
586     c = bracount - (c - 1);
587     }
588    
589     c = -(ESC_REF + c);
590     break;
591    
592 nigel 77 /* The handling of escape sequences consisting of a string of digits
593     starting with one that is not zero is not straightforward. By experiment,
594     the way Perl works seems to be as follows:
595    
596     Outside a character class, the digits are read as a decimal number. If the
597     number is less than 10, or if there are that many previous extracting
598     left brackets, then it is a back reference. Otherwise, up to three octal
599     digits are read to form an escaped byte. Thus \123 is likely to be octal
600     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
601     value is greater than 377, the least significant 8 bits are taken. Inside a
602     character class, \ followed by a digit is always an octal number. */
603    
604     case '1': case '2': case '3': case '4': case '5':
605     case '6': case '7': case '8': case '9':
606    
607     if (!isclass)
608     {
609     oldptr = ptr;
610     c -= '0';
611     while ((digitab[ptr[1]] & ctype_digit) != 0)
612     c = c * 10 + *(++ptr) - '0';
613 ph10 213 if (c < 0)
614     {
615     *errorcodeptr = ERR61;
616 ph10 220 break;
617     }
618 nigel 77 if (c < 10 || c <= bracount)
619     {
620     c = -(ESC_REF + c);
621     break;
622     }
623     ptr = oldptr; /* Put the pointer back and fall through */
624     }
625    
626     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
627     generates a binary zero byte and treats the digit as a following literal.
628     Thus we have to pull back the pointer by one. */
629    
630     if ((c = *ptr) >= '8')
631     {
632     ptr--;
633     c = 0;
634     break;
635     }
636    
637     /* \0 always starts an octal number, but we may drop through to here with a
638 nigel 91 larger first octal digit. The original code used just to take the least
639     significant 8 bits of octal numbers (I think this is what early Perls used
640     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
641     than 3 octal digits. */
642 nigel 77
643     case '0':
644     c -= '0';
645     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
646     c = c * 8 + *(++ptr) - '0';
647 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
648 nigel 77 break;
649    
650 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
651     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
652     treated as a data character. */
653 nigel 77
654     case 'x':
655 nigel 87 if (ptr[1] == '{')
656 nigel 77 {
657     const uschar *pt = ptr + 2;
658 nigel 87 int count = 0;
659    
660 nigel 77 c = 0;
661     while ((digitab[*pt] & ctype_xdigit) != 0)
662     {
663 nigel 87 register int cc = *pt++;
664     if (c == 0 && cc == '0') continue; /* Leading zeroes */
665 nigel 77 count++;
666 nigel 87
667 ph10 97 #ifndef EBCDIC /* ASCII coding */
668 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
669 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
670 ph10 97 #else /* EBCDIC coding */
671 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
672 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
673 nigel 77 #endif
674     }
675 nigel 87
676 nigel 77 if (*pt == '}')
677     {
678 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
679 nigel 77 ptr = pt;
680     break;
681     }
682 nigel 87
683 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
684     recognize this construct; fall through to the normal \x handling. */
685     }
686    
687 nigel 87 /* Read just a single-byte hex-defined char */
688 nigel 77
689     c = 0;
690     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
691     {
692     int cc; /* Some compilers don't like ++ */
693     cc = *(++ptr); /* in initializers */
694 ph10 97 #ifndef EBCDIC /* ASCII coding */
695 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
696     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
697 ph10 97 #else /* EBCDIC coding */
698 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
699     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
700     #endif
701     }
702     break;
703    
704 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
705     This coding is ASCII-specific, but then the whole concept of \cx is
706     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
707 nigel 77
708     case 'c':
709     c = *(++ptr);
710     if (c == 0)
711     {
712     *errorcodeptr = ERR2;
713 ph10 213 break;
714 nigel 77 }
715    
716 ph10 97 #ifndef EBCDIC /* ASCII coding */
717 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
718     c ^= 0x40;
719 ph10 97 #else /* EBCDIC coding */
720 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
721     c ^= 0xC0;
722     #endif
723     break;
724    
725     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
726     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
727     for Perl compatibility, it is a literal. This code looks a bit odd, but
728     there used to be some cases other than the default, and there may be again
729     in future, so I haven't "optimized" it. */
730    
731     default:
732     if ((options & PCRE_EXTRA) != 0) switch(c)
733     {
734     default:
735     *errorcodeptr = ERR3;
736     break;
737     }
738     break;
739     }
740     }
741    
742     *ptrptr = ptr;
743     return c;
744     }
745    
746    
747    
748     #ifdef SUPPORT_UCP
749     /*************************************************
750     * Handle \P and \p *
751     *************************************************/
752    
753     /* This function is called after \P or \p has been encountered, provided that
754     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
755     pointing at the P or p. On exit, it is pointing at the final character of the
756     escape sequence.
757    
758     Argument:
759     ptrptr points to the pattern position pointer
760     negptr points to a boolean that is set TRUE for negation else FALSE
761 nigel 87 dptr points to an int that is set to the detailed property value
762 nigel 77 errorcodeptr points to the error code variable
763    
764 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
765 nigel 77 */
766    
767     static int
768 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
769 nigel 77 {
770     int c, i, bot, top;
771     const uschar *ptr = *ptrptr;
772 nigel 87 char name[32];
773 nigel 77
774     c = *(++ptr);
775     if (c == 0) goto ERROR_RETURN;
776    
777     *negptr = FALSE;
778    
779 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
780     negation. */
781 nigel 77
782     if (c == '{')
783     {
784     if (ptr[1] == '^')
785     {
786     *negptr = TRUE;
787     ptr++;
788     }
789 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
790 nigel 77 {
791     c = *(++ptr);
792     if (c == 0) goto ERROR_RETURN;
793     if (c == '}') break;
794     name[i] = c;
795     }
796 nigel 87 if (c !='}') goto ERROR_RETURN;
797 nigel 77 name[i] = 0;
798     }
799    
800     /* Otherwise there is just one following character */
801    
802     else
803     {
804     name[0] = c;
805     name[1] = 0;
806     }
807    
808     *ptrptr = ptr;
809    
810     /* Search for a recognized property name using binary chop */
811    
812     bot = 0;
813     top = _pcre_utt_size;
814    
815     while (bot < top)
816     {
817 nigel 87 i = (bot + top) >> 1;
818 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
819 nigel 87 if (c == 0)
820     {
821     *dptr = _pcre_utt[i].value;
822     return _pcre_utt[i].type;
823     }
824 nigel 77 if (c > 0) bot = i + 1; else top = i;
825     }
826    
827     *errorcodeptr = ERR47;
828     *ptrptr = ptr;
829     return -1;
830    
831     ERROR_RETURN:
832     *errorcodeptr = ERR46;
833     *ptrptr = ptr;
834     return -1;
835     }
836     #endif
837    
838    
839    
840    
841     /*************************************************
842     * Check for counted repeat *
843     *************************************************/
844    
845     /* This function is called when a '{' is encountered in a place where it might
846     start a quantifier. It looks ahead to see if it really is a quantifier or not.
847     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
848     where the ddds are digits.
849    
850     Arguments:
851     p pointer to the first char after '{'
852    
853     Returns: TRUE or FALSE
854     */
855    
856     static BOOL
857     is_counted_repeat(const uschar *p)
858     {
859     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
860     while ((digitab[*p] & ctype_digit) != 0) p++;
861     if (*p == '}') return TRUE;
862    
863     if (*p++ != ',') return FALSE;
864     if (*p == '}') return TRUE;
865    
866     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
867     while ((digitab[*p] & ctype_digit) != 0) p++;
868    
869     return (*p == '}');
870     }
871    
872    
873    
874     /*************************************************
875     * Read repeat counts *
876     *************************************************/
877    
878     /* Read an item of the form {n,m} and return the values. This is called only
879     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
880     so the syntax is guaranteed to be correct, but we need to check the values.
881    
882     Arguments:
883     p pointer to first char after '{'
884     minp pointer to int for min
885     maxp pointer to int for max
886     returned as -1 if no max
887     errorcodeptr points to error code variable
888    
889     Returns: pointer to '}' on success;
890     current ptr on error, with errorcodeptr set non-zero
891     */
892    
893     static const uschar *
894     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
895     {
896     int min = 0;
897     int max = -1;
898    
899 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
900     an integer overflow. */
901    
902 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
903 nigel 81 if (min < 0 || min > 65535)
904     {
905     *errorcodeptr = ERR5;
906     return p;
907     }
908 nigel 77
909 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
910     Also, max must not be less than min. */
911    
912 nigel 77 if (*p == '}') max = min; else
913     {
914     if (*(++p) != '}')
915     {
916     max = 0;
917     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
918 nigel 81 if (max < 0 || max > 65535)
919     {
920     *errorcodeptr = ERR5;
921     return p;
922     }
923 nigel 77 if (max < min)
924     {
925     *errorcodeptr = ERR4;
926     return p;
927     }
928     }
929     }
930    
931 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
932     '}'. */
933 nigel 77
934 nigel 81 *minp = min;
935     *maxp = max;
936 nigel 77 return p;
937     }
938    
939    
940    
941     /*************************************************
942 nigel 93 * Find forward referenced subpattern *
943 nigel 91 *************************************************/
944    
945 nigel 93 /* This function scans along a pattern's text looking for capturing
946     subpatterns, and counting them. If it finds a named pattern that matches the
947     name it is given, it returns its number. Alternatively, if the name is NULL, it
948     returns when it reaches a given numbered subpattern. This is used for forward
949     references to subpatterns. We know that if (?P< is encountered, the name will
950     be terminated by '>' because that is checked in the first pass.
951 nigel 91
952     Arguments:
953 nigel 93 ptr current position in the pattern
954     count current count of capturing parens so far encountered
955     name name to seek, or NULL if seeking a numbered subpattern
956     lorn name length, or subpattern number if name is NULL
957     xmode TRUE if we are in /x mode
958 nigel 91
959     Returns: the number of the named subpattern, or -1 if not found
960     */
961    
962     static int
963 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
964     BOOL xmode)
965 nigel 91 {
966     const uschar *thisname;
967 nigel 93
968 nigel 91 for (; *ptr != 0; ptr++)
969     {
970 nigel 93 int term;
971    
972     /* Skip over backslashed characters and also entire \Q...\E */
973    
974     if (*ptr == '\\')
975     {
976     if (*(++ptr) == 0) return -1;
977     if (*ptr == 'Q') for (;;)
978     {
979     while (*(++ptr) != 0 && *ptr != '\\');
980     if (*ptr == 0) return -1;
981     if (*(++ptr) == 'E') break;
982     }
983     continue;
984     }
985    
986     /* Skip over character classes */
987    
988     if (*ptr == '[')
989     {
990     while (*(++ptr) != ']')
991     {
992 ph10 220 if (*ptr == 0) return -1;
993 nigel 93 if (*ptr == '\\')
994     {
995     if (*(++ptr) == 0) return -1;
996     if (*ptr == 'Q') for (;;)
997     {
998     while (*(++ptr) != 0 && *ptr != '\\');
999     if (*ptr == 0) return -1;
1000     if (*(++ptr) == 'E') break;
1001     }
1002     continue;
1003     }
1004     }
1005     continue;
1006     }
1007    
1008     /* Skip comments in /x mode */
1009    
1010     if (xmode && *ptr == '#')
1011     {
1012     while (*(++ptr) != 0 && *ptr != '\n');
1013     if (*ptr == 0) return -1;
1014     continue;
1015     }
1016    
1017     /* An opening parens must now be a real metacharacter */
1018    
1019 nigel 91 if (*ptr != '(') continue;
1020 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1021 nigel 93 {
1022     count++;
1023     if (name == NULL && count == lorn) return count;
1024     continue;
1025     }
1026    
1027     ptr += 2;
1028     if (*ptr == 'P') ptr++; /* Allow optional P */
1029    
1030     /* We have to disambiguate (?<! and (?<= from (?<name> */
1031    
1032     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1033     *ptr != '\'')
1034     continue;
1035    
1036 nigel 91 count++;
1037 nigel 93
1038     if (name == NULL && count == lorn) return count;
1039     term = *ptr++;
1040     if (term == '<') term = '>';
1041 nigel 91 thisname = ptr;
1042 nigel 93 while (*ptr != term) ptr++;
1043     if (name != NULL && lorn == ptr - thisname &&
1044     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1045 nigel 91 return count;
1046     }
1047 nigel 93
1048 nigel 91 return -1;
1049     }
1050    
1051    
1052    
1053     /*************************************************
1054 nigel 77 * Find first significant op code *
1055     *************************************************/
1056    
1057     /* This is called by several functions that scan a compiled expression looking
1058     for a fixed first character, or an anchoring op code etc. It skips over things
1059     that do not influence this. For some calls, a change of option is important.
1060     For some calls, it makes sense to skip negative forward and all backward
1061     assertions, and also the \b assertion; for others it does not.
1062    
1063     Arguments:
1064     code pointer to the start of the group
1065     options pointer to external options
1066     optbit the option bit whose changing is significant, or
1067     zero if none are
1068     skipassert TRUE if certain assertions are to be skipped
1069    
1070     Returns: pointer to the first significant opcode
1071     */
1072    
1073     static const uschar*
1074     first_significant_code(const uschar *code, int *options, int optbit,
1075     BOOL skipassert)
1076     {
1077     for (;;)
1078     {
1079     switch ((int)*code)
1080     {
1081     case OP_OPT:
1082     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1083     *options = (int)code[1];
1084     code += 2;
1085     break;
1086    
1087     case OP_ASSERT_NOT:
1088     case OP_ASSERTBACK:
1089     case OP_ASSERTBACK_NOT:
1090     if (!skipassert) return code;
1091     do code += GET(code, 1); while (*code == OP_ALT);
1092     code += _pcre_OP_lengths[*code];
1093     break;
1094    
1095     case OP_WORD_BOUNDARY:
1096     case OP_NOT_WORD_BOUNDARY:
1097     if (!skipassert) return code;
1098     /* Fall through */
1099    
1100     case OP_CALLOUT:
1101     case OP_CREF:
1102 nigel 93 case OP_RREF:
1103     case OP_DEF:
1104 nigel 77 code += _pcre_OP_lengths[*code];
1105     break;
1106    
1107     default:
1108     return code;
1109     }
1110     }
1111     /* Control never reaches here */
1112     }
1113    
1114    
1115    
1116    
1117     /*************************************************
1118     * Find the fixed length of a pattern *
1119     *************************************************/
1120    
1121     /* Scan a pattern and compute the fixed length of subject that will match it,
1122     if the length is fixed. This is needed for dealing with backward assertions.
1123     In UTF8 mode, the result is in characters rather than bytes.
1124    
1125     Arguments:
1126     code points to the start of the pattern (the bracket)
1127     options the compiling options
1128    
1129     Returns: the fixed length, or -1 if there is no fixed length,
1130     or -2 if \C was encountered
1131     */
1132    
1133     static int
1134     find_fixedlength(uschar *code, int options)
1135     {
1136     int length = -1;
1137    
1138     register int branchlength = 0;
1139     register uschar *cc = code + 1 + LINK_SIZE;
1140    
1141     /* Scan along the opcodes for this branch. If we get to the end of the
1142     branch, check the length against that of the other branches. */
1143    
1144     for (;;)
1145     {
1146     int d;
1147     register int op = *cc;
1148     switch (op)
1149     {
1150 nigel 93 case OP_CBRA:
1151 nigel 77 case OP_BRA:
1152     case OP_ONCE:
1153     case OP_COND:
1154 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1155 nigel 77 if (d < 0) return d;
1156     branchlength += d;
1157     do cc += GET(cc, 1); while (*cc == OP_ALT);
1158     cc += 1 + LINK_SIZE;
1159     break;
1160    
1161     /* Reached end of a branch; if it's a ket it is the end of a nested
1162     call. If it's ALT it is an alternation in a nested call. If it is
1163     END it's the end of the outer call. All can be handled by the same code. */
1164    
1165     case OP_ALT:
1166     case OP_KET:
1167     case OP_KETRMAX:
1168     case OP_KETRMIN:
1169     case OP_END:
1170     if (length < 0) length = branchlength;
1171     else if (length != branchlength) return -1;
1172     if (*cc != OP_ALT) return length;
1173     cc += 1 + LINK_SIZE;
1174     branchlength = 0;
1175     break;
1176    
1177     /* Skip over assertive subpatterns */
1178    
1179     case OP_ASSERT:
1180     case OP_ASSERT_NOT:
1181     case OP_ASSERTBACK:
1182     case OP_ASSERTBACK_NOT:
1183     do cc += GET(cc, 1); while (*cc == OP_ALT);
1184     /* Fall through */
1185    
1186     /* Skip over things that don't match chars */
1187    
1188     case OP_REVERSE:
1189     case OP_CREF:
1190 nigel 93 case OP_RREF:
1191     case OP_DEF:
1192 nigel 77 case OP_OPT:
1193     case OP_CALLOUT:
1194     case OP_SOD:
1195     case OP_SOM:
1196     case OP_EOD:
1197     case OP_EODN:
1198     case OP_CIRC:
1199     case OP_DOLL:
1200     case OP_NOT_WORD_BOUNDARY:
1201     case OP_WORD_BOUNDARY:
1202     cc += _pcre_OP_lengths[*cc];
1203     break;
1204    
1205     /* Handle literal characters */
1206    
1207     case OP_CHAR:
1208     case OP_CHARNC:
1209 nigel 91 case OP_NOT:
1210 nigel 77 branchlength++;
1211     cc += 2;
1212     #ifdef SUPPORT_UTF8
1213     if ((options & PCRE_UTF8) != 0)
1214     {
1215     while ((*cc & 0xc0) == 0x80) cc++;
1216     }
1217     #endif
1218     break;
1219    
1220     /* Handle exact repetitions. The count is already in characters, but we
1221     need to skip over a multibyte character in UTF8 mode. */
1222    
1223     case OP_EXACT:
1224     branchlength += GET2(cc,1);
1225     cc += 4;
1226     #ifdef SUPPORT_UTF8
1227     if ((options & PCRE_UTF8) != 0)
1228     {
1229     while((*cc & 0x80) == 0x80) cc++;
1230     }
1231     #endif
1232     break;
1233    
1234     case OP_TYPEEXACT:
1235     branchlength += GET2(cc,1);
1236 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1237 nigel 77 cc += 4;
1238     break;
1239    
1240     /* Handle single-char matchers */
1241    
1242     case OP_PROP:
1243     case OP_NOTPROP:
1244 nigel 87 cc += 2;
1245 nigel 77 /* Fall through */
1246    
1247     case OP_NOT_DIGIT:
1248     case OP_DIGIT:
1249     case OP_NOT_WHITESPACE:
1250     case OP_WHITESPACE:
1251     case OP_NOT_WORDCHAR:
1252     case OP_WORDCHAR:
1253     case OP_ANY:
1254     branchlength++;
1255     cc++;
1256     break;
1257    
1258     /* The single-byte matcher isn't allowed */
1259    
1260     case OP_ANYBYTE:
1261     return -2;
1262    
1263     /* Check a class for variable quantification */
1264    
1265     #ifdef SUPPORT_UTF8
1266     case OP_XCLASS:
1267     cc += GET(cc, 1) - 33;
1268     /* Fall through */
1269     #endif
1270    
1271     case OP_CLASS:
1272     case OP_NCLASS:
1273     cc += 33;
1274    
1275     switch (*cc)
1276     {
1277     case OP_CRSTAR:
1278     case OP_CRMINSTAR:
1279     case OP_CRQUERY:
1280     case OP_CRMINQUERY:
1281     return -1;
1282    
1283     case OP_CRRANGE:
1284     case OP_CRMINRANGE:
1285     if (GET2(cc,1) != GET2(cc,3)) return -1;
1286     branchlength += GET2(cc,1);
1287     cc += 5;
1288     break;
1289    
1290     default:
1291     branchlength++;
1292     }
1293     break;
1294    
1295     /* Anything else is variable length */
1296    
1297     default:
1298     return -1;
1299     }
1300     }
1301     /* Control never gets here */
1302     }
1303    
1304    
1305    
1306    
1307     /*************************************************
1308     * Scan compiled regex for numbered bracket *
1309     *************************************************/
1310    
1311     /* This little function scans through a compiled pattern until it finds a
1312     capturing bracket with the given number.
1313    
1314     Arguments:
1315     code points to start of expression
1316     utf8 TRUE in UTF-8 mode
1317     number the required bracket number
1318    
1319     Returns: pointer to the opcode for the bracket, or NULL if not found
1320     */
1321    
1322     static const uschar *
1323     find_bracket(const uschar *code, BOOL utf8, int number)
1324     {
1325     for (;;)
1326     {
1327     register int c = *code;
1328     if (c == OP_END) return NULL;
1329 nigel 91
1330     /* XCLASS is used for classes that cannot be represented just by a bit
1331     map. This includes negated single high-valued characters. The length in
1332     the table is zero; the actual length is stored in the compiled code. */
1333    
1334     if (c == OP_XCLASS) code += GET(code, 1);
1335    
1336 nigel 93 /* Handle capturing bracket */
1337 nigel 91
1338 nigel 93 else if (c == OP_CBRA)
1339 nigel 77 {
1340 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1341 nigel 77 if (n == number) return (uschar *)code;
1342 nigel 93 code += _pcre_OP_lengths[c];
1343 nigel 77 }
1344 nigel 91
1345 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1346     repeated character types, we have to test for \p and \P, which have an extra
1347 ph10 218 two bytes of parameters. */
1348 nigel 91
1349 nigel 77 else
1350     {
1351 ph10 218 switch(c)
1352     {
1353     case OP_TYPESTAR:
1354     case OP_TYPEMINSTAR:
1355     case OP_TYPEPLUS:
1356     case OP_TYPEMINPLUS:
1357     case OP_TYPEQUERY:
1358     case OP_TYPEMINQUERY:
1359     case OP_TYPEPOSSTAR:
1360     case OP_TYPEPOSPLUS:
1361     case OP_TYPEPOSQUERY:
1362     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1363 ph10 220 break;
1364 ph10 221
1365     case OP_TYPEUPTO:
1366     case OP_TYPEMINUPTO:
1367     case OP_TYPEEXACT:
1368     case OP_TYPEPOSUPTO:
1369     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1370     break;
1371 ph10 220 }
1372    
1373 ph10 218 /* Add in the fixed length from the table */
1374 ph10 220
1375 nigel 77 code += _pcre_OP_lengths[c];
1376 ph10 220
1377 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1378     a multi-byte character. The length in the table is a minimum, so we have to
1379     arrange to skip the extra bytes. */
1380 ph10 220
1381 ph10 107 #ifdef SUPPORT_UTF8
1382 nigel 77 if (utf8) switch(c)
1383     {
1384     case OP_CHAR:
1385     case OP_CHARNC:
1386     case OP_EXACT:
1387     case OP_UPTO:
1388     case OP_MINUPTO:
1389 nigel 93 case OP_POSUPTO:
1390 nigel 77 case OP_STAR:
1391     case OP_MINSTAR:
1392 nigel 93 case OP_POSSTAR:
1393 nigel 77 case OP_PLUS:
1394     case OP_MINPLUS:
1395 nigel 93 case OP_POSPLUS:
1396 nigel 77 case OP_QUERY:
1397     case OP_MINQUERY:
1398 nigel 93 case OP_POSQUERY:
1399     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1400 nigel 77 break;
1401     }
1402 ph10 111 #endif
1403 nigel 77 }
1404     }
1405     }
1406    
1407    
1408    
1409     /*************************************************
1410     * Scan compiled regex for recursion reference *
1411     *************************************************/
1412    
1413     /* This little function scans through a compiled pattern until it finds an
1414     instance of OP_RECURSE.
1415    
1416     Arguments:
1417     code points to start of expression
1418     utf8 TRUE in UTF-8 mode
1419    
1420     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1421     */
1422    
1423     static const uschar *
1424     find_recurse(const uschar *code, BOOL utf8)
1425     {
1426     for (;;)
1427     {
1428     register int c = *code;
1429     if (c == OP_END) return NULL;
1430 nigel 91 if (c == OP_RECURSE) return code;
1431 ph10 220
1432 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1433     map. This includes negated single high-valued characters. The length in
1434     the table is zero; the actual length is stored in the compiled code. */
1435    
1436     if (c == OP_XCLASS) code += GET(code, 1);
1437    
1438 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1439     repeated character types, we have to test for \p and \P, which have an extra
1440 ph10 218 two bytes of parameters. */
1441 nigel 91
1442 nigel 77 else
1443     {
1444 ph10 218 switch(c)
1445     {
1446     case OP_TYPESTAR:
1447     case OP_TYPEMINSTAR:
1448     case OP_TYPEPLUS:
1449     case OP_TYPEMINPLUS:
1450     case OP_TYPEQUERY:
1451     case OP_TYPEMINQUERY:
1452     case OP_TYPEPOSSTAR:
1453     case OP_TYPEPOSPLUS:
1454     case OP_TYPEPOSQUERY:
1455     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1456 ph10 220 break;
1457 ph10 221
1458     case OP_TYPEPOSUPTO:
1459     case OP_TYPEUPTO:
1460     case OP_TYPEMINUPTO:
1461     case OP_TYPEEXACT:
1462     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1463     break;
1464 ph10 220 }
1465    
1466 ph10 218 /* Add in the fixed length from the table */
1467    
1468 nigel 77 code += _pcre_OP_lengths[c];
1469 ph10 220
1470 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1471     by a multi-byte character. The length in the table is a minimum, so we have
1472     to arrange to skip the extra bytes. */
1473 ph10 220
1474 ph10 107 #ifdef SUPPORT_UTF8
1475 nigel 77 if (utf8) switch(c)
1476     {
1477     case OP_CHAR:
1478     case OP_CHARNC:
1479     case OP_EXACT:
1480     case OP_UPTO:
1481     case OP_MINUPTO:
1482 nigel 93 case OP_POSUPTO:
1483 nigel 77 case OP_STAR:
1484     case OP_MINSTAR:
1485 nigel 93 case OP_POSSTAR:
1486 nigel 77 case OP_PLUS:
1487     case OP_MINPLUS:
1488 nigel 93 case OP_POSPLUS:
1489 nigel 77 case OP_QUERY:
1490     case OP_MINQUERY:
1491 nigel 93 case OP_POSQUERY:
1492     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1493 nigel 77 break;
1494     }
1495 ph10 111 #endif
1496 nigel 77 }
1497     }
1498     }
1499    
1500    
1501    
1502     /*************************************************
1503     * Scan compiled branch for non-emptiness *
1504     *************************************************/
1505    
1506     /* This function scans through a branch of a compiled pattern to see whether it
1507 nigel 93 can match the empty string or not. It is called from could_be_empty()
1508     below and from compile_branch() when checking for an unlimited repeat of a
1509     group that can match nothing. Note that first_significant_code() skips over
1510     assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1511     struck an inner bracket whose current branch will already have been scanned.
1512 nigel 77
1513     Arguments:
1514     code points to start of search
1515     endcode points to where to stop
1516     utf8 TRUE if in UTF8 mode
1517    
1518     Returns: TRUE if what is matched could be empty
1519     */
1520    
1521     static BOOL
1522     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1523     {
1524     register int c;
1525 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1526 nigel 77 code < endcode;
1527     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1528     {
1529     const uschar *ccode;
1530    
1531     c = *code;
1532 ph10 172
1533 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1534 nigel 77
1535 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1536     {
1537 ph10 172 code += _pcre_OP_lengths[c];
1538 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1539     c = *code;
1540     continue;
1541     }
1542    
1543     /* For other groups, scan the branches. */
1544 ph10 172
1545 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1546 nigel 77 {
1547     BOOL empty_branch;
1548     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1549    
1550     /* Scan a closed bracket */
1551    
1552     empty_branch = FALSE;
1553     do
1554     {
1555     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1556     empty_branch = TRUE;
1557     code += GET(code, 1);
1558     }
1559     while (*code == OP_ALT);
1560     if (!empty_branch) return FALSE; /* All branches are non-empty */
1561 ph10 172 c = *code;
1562 nigel 93 continue;
1563 nigel 77 }
1564    
1565 nigel 93 /* Handle the other opcodes */
1566    
1567     switch (c)
1568 nigel 77 {
1569 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1570     cannot be represented just by a bit map. This includes negated single
1571     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1572 ph10 220 actual length is stored in the compiled code, so we must update "code"
1573 ph10 216 here. */
1574 nigel 77
1575     #ifdef SUPPORT_UTF8
1576     case OP_XCLASS:
1577 ph10 216 ccode = code += GET(code, 1);
1578 nigel 77 goto CHECK_CLASS_REPEAT;
1579     #endif
1580    
1581     case OP_CLASS:
1582     case OP_NCLASS:
1583     ccode = code + 33;
1584    
1585     #ifdef SUPPORT_UTF8
1586     CHECK_CLASS_REPEAT:
1587     #endif
1588    
1589     switch (*ccode)
1590     {
1591     case OP_CRSTAR: /* These could be empty; continue */
1592     case OP_CRMINSTAR:
1593     case OP_CRQUERY:
1594     case OP_CRMINQUERY:
1595     break;
1596    
1597     default: /* Non-repeat => class must match */
1598     case OP_CRPLUS: /* These repeats aren't empty */
1599     case OP_CRMINPLUS:
1600     return FALSE;
1601    
1602     case OP_CRRANGE:
1603     case OP_CRMINRANGE:
1604     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1605     break;
1606     }
1607     break;
1608    
1609     /* Opcodes that must match a character */
1610    
1611     case OP_PROP:
1612     case OP_NOTPROP:
1613     case OP_EXTUNI:
1614     case OP_NOT_DIGIT:
1615     case OP_DIGIT:
1616     case OP_NOT_WHITESPACE:
1617     case OP_WHITESPACE:
1618     case OP_NOT_WORDCHAR:
1619     case OP_WORDCHAR:
1620     case OP_ANY:
1621     case OP_ANYBYTE:
1622     case OP_CHAR:
1623     case OP_CHARNC:
1624     case OP_NOT:
1625     case OP_PLUS:
1626     case OP_MINPLUS:
1627 nigel 93 case OP_POSPLUS:
1628 nigel 77 case OP_EXACT:
1629     case OP_NOTPLUS:
1630     case OP_NOTMINPLUS:
1631 nigel 93 case OP_NOTPOSPLUS:
1632 nigel 77 case OP_NOTEXACT:
1633     case OP_TYPEPLUS:
1634     case OP_TYPEMINPLUS:
1635 nigel 93 case OP_TYPEPOSPLUS:
1636 nigel 77 case OP_TYPEEXACT:
1637     return FALSE;
1638 ph10 227
1639     /* These are going to continue, as they may be empty, but we have to
1640     fudge the length for the \p and \P cases. */
1641    
1642 ph10 224 case OP_TYPESTAR:
1643     case OP_TYPEMINSTAR:
1644     case OP_TYPEPOSSTAR:
1645     case OP_TYPEQUERY:
1646     case OP_TYPEMINQUERY:
1647     case OP_TYPEPOSQUERY:
1648     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1649 ph10 227 break;
1650    
1651 ph10 224 /* Same for these */
1652 ph10 227
1653 ph10 224 case OP_TYPEUPTO:
1654     case OP_TYPEMINUPTO:
1655     case OP_TYPEPOSUPTO:
1656     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1657     break;
1658 nigel 77
1659     /* End of branch */
1660    
1661     case OP_KET:
1662     case OP_KETRMAX:
1663     case OP_KETRMIN:
1664     case OP_ALT:
1665     return TRUE;
1666    
1667 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1668     MINUPTO, and POSUPTO may be followed by a multibyte character */
1669 nigel 77
1670     #ifdef SUPPORT_UTF8
1671     case OP_STAR:
1672     case OP_MINSTAR:
1673 nigel 93 case OP_POSSTAR:
1674 nigel 77 case OP_QUERY:
1675     case OP_MINQUERY:
1676 nigel 93 case OP_POSQUERY:
1677 nigel 77 case OP_UPTO:
1678     case OP_MINUPTO:
1679 nigel 93 case OP_POSUPTO:
1680 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1681     break;
1682     #endif
1683     }
1684     }
1685    
1686     return TRUE;
1687     }
1688    
1689    
1690    
1691     /*************************************************
1692     * Scan compiled regex for non-emptiness *
1693     *************************************************/
1694    
1695     /* This function is called to check for left recursive calls. We want to check
1696     the current branch of the current pattern to see if it could match the empty
1697     string. If it could, we must look outwards for branches at other levels,
1698     stopping when we pass beyond the bracket which is the subject of the recursion.
1699    
1700     Arguments:
1701     code points to start of the recursion
1702     endcode points to where to stop (current RECURSE item)
1703     bcptr points to the chain of current (unclosed) branch starts
1704     utf8 TRUE if in UTF-8 mode
1705    
1706     Returns: TRUE if what is matched could be empty
1707     */
1708    
1709     static BOOL
1710     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1711     BOOL utf8)
1712     {
1713     while (bcptr != NULL && bcptr->current >= code)
1714     {
1715     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1716     bcptr = bcptr->outer;
1717     }
1718     return TRUE;
1719     }
1720    
1721    
1722    
1723     /*************************************************
1724     * Check for POSIX class syntax *
1725     *************************************************/
1726    
1727     /* This function is called when the sequence "[:" or "[." or "[=" is
1728     encountered in a character class. It checks whether this is followed by an
1729     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1730     ".]" or "=]".
1731    
1732     Argument:
1733     ptr pointer to the initial [
1734     endptr where to return the end pointer
1735     cd pointer to compile data
1736    
1737     Returns: TRUE or FALSE
1738     */
1739    
1740     static BOOL
1741     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1742     {
1743     int terminator; /* Don't combine these lines; the Solaris cc */
1744     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1745     if (*(++ptr) == '^') ptr++;
1746     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1747     if (*ptr == terminator && ptr[1] == ']')
1748     {
1749     *endptr = ptr;
1750     return TRUE;
1751     }
1752     return FALSE;
1753     }
1754    
1755    
1756    
1757    
1758     /*************************************************
1759     * Check POSIX class name *
1760     *************************************************/
1761    
1762     /* This function is called to check the name given in a POSIX-style class entry
1763     such as [:alnum:].
1764    
1765     Arguments:
1766     ptr points to the first letter
1767     len the length of the name
1768    
1769     Returns: a value representing the name, or -1 if unknown
1770     */
1771    
1772     static int
1773     check_posix_name(const uschar *ptr, int len)
1774     {
1775 ph10 240 const char *pn = posix_names;
1776 nigel 77 register int yield = 0;
1777     while (posix_name_lengths[yield] != 0)
1778     {
1779     if (len == posix_name_lengths[yield] &&
1780 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1781 ph10 243 pn += posix_name_lengths[yield] + 1;
1782 nigel 77 yield++;
1783     }
1784     return -1;
1785     }
1786    
1787    
1788     /*************************************************
1789     * Adjust OP_RECURSE items in repeated group *
1790     *************************************************/
1791    
1792     /* OP_RECURSE items contain an offset from the start of the regex to the group
1793     that is referenced. This means that groups can be replicated for fixed
1794     repetition simply by copying (because the recursion is allowed to refer to
1795     earlier groups that are outside the current group). However, when a group is
1796     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1797     it, after it has been compiled. This means that any OP_RECURSE items within it
1798     that refer to the group itself or any contained groups have to have their
offsets adjusted. That is one of the jobs of this function. Before it is called,
1800     the partially compiled regex must be temporarily terminated with OP_END.
1801 nigel 77
1802 nigel 93 This function has been extended with the possibility of forward references for
1803     recursions and subroutine calls. It must also check the list of such references
1804     for the group we are dealing with. If it finds that one of the recursions in
1805     the current group is on this list, it adjusts the offset in the list, not the
1806     value in the reference (which is a group number).
1807    
1808 nigel 77 Arguments:
1809     group points to the start of the group
1810     adjust the amount by which the group is to be moved
1811     utf8 TRUE in UTF-8 mode
1812     cd contains pointers to tables etc.
1813 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1814 nigel 77
1815     Returns: nothing
1816     */
1817    
1818     static void
1819 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1820     uschar *save_hwm)
1821 nigel 77 {
1822     uschar *ptr = group;
1823 ph10 224
1824 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1825     {
1826 nigel 93 int offset;
1827     uschar *hc;
1828    
1829     /* See if this recursion is on the forward reference list. If so, adjust the
1830     reference. */
1831    
1832     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1833     {
1834     offset = GET(hc, 0);
1835     if (cd->start_code + offset == ptr + 1)
1836     {
1837     PUT(hc, 0, offset + adjust);
1838     break;
1839     }
1840     }
1841    
1842     /* Otherwise, adjust the recursion offset if it's after the start of this
1843     group. */
1844    
1845     if (hc >= cd->hwm)
1846     {
1847     offset = GET(ptr, 1);
1848     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1849     }
1850    
1851 nigel 77 ptr += 1 + LINK_SIZE;
1852     }
1853     }
1854    
1855    
1856    
1857     /*************************************************
1858     * Insert an automatic callout point *
1859     *************************************************/
1860    
1861     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1862     callout points before each pattern item.
1863    
1864     Arguments:
1865     code current code pointer
1866     ptr current pattern pointer
1867     cd pointers to tables etc
1868    
1869     Returns: new code pointer
1870     */
1871    
1872     static uschar *
1873     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1874     {
1875     *code++ = OP_CALLOUT;
1876     *code++ = 255;
1877     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1878     PUT(code, LINK_SIZE, 0); /* Default length */
1879     return code + 2*LINK_SIZE;
1880     }
1881    
1882    
1883    
1884     /*************************************************
1885     * Complete a callout item *
1886     *************************************************/
1887    
1888     /* A callout item contains the length of the next item in the pattern, which
1889     we can't fill in till after we have reached the relevant point. This is used
1890     for both automatic and manual callouts.
1891    
1892     Arguments:
1893     previous_callout points to previous callout item
1894     ptr current pattern pointer
1895     cd pointers to tables etc
1896    
1897     Returns: nothing
1898     */
1899    
1900     static void
1901     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1902     {
1903     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1904     PUT(previous_callout, 2 + LINK_SIZE, length);
1905     }
1906    
1907    
1908    
1909     #ifdef SUPPORT_UCP
1910     /*************************************************
1911     * Get othercase range *
1912     *************************************************/
1913    
1914     /* This function is passed the start and end of a class range, in UTF-8 mode
1915     with UCP support. It searches up the characters, looking for internal ranges of
1916     characters in the "other" case. Each call returns the next one, updating the
1917     start address.
1918    
1919     Arguments:
1920     cptr points to starting character value; updated
1921     d end value
1922     ocptr where to put start of othercase range
1923     odptr where to put end of othercase range
1924    
1925     Yield: TRUE when range returned; FALSE when no more
1926     */
1927    
1928     static BOOL
1929 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1930     unsigned int *odptr)
1931 nigel 77 {
1932 nigel 93 unsigned int c, othercase, next;
1933 nigel 77
1934     for (c = *cptr; c <= d; c++)
1935 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1936 nigel 77
1937     if (c > d) return FALSE;
1938    
1939     *ocptr = othercase;
1940     next = othercase + 1;
1941    
1942     for (++c; c <= d; c++)
1943     {
1944 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1945 nigel 77 next++;
1946     }
1947    
1948     *odptr = next - 1;
1949     *cptr = c;
1950    
1951     return TRUE;
1952     }
1953     #endif /* SUPPORT_UCP */
1954    
1955    
1956 nigel 93
1957 nigel 77 /*************************************************
1958 nigel 93 * Check if auto-possessifying is possible *
1959     *************************************************/
1960    
1961     /* This function is called for unlimited repeats of certain items, to see
1962     whether the next thing could possibly match the repeated item. If not, it makes
1963     sense to automatically possessify the repeated item.
1964    
1965     Arguments:
1966     op_code the repeated op code
  item          data for this item, depends on the opcode
1968     utf8 TRUE in UTF-8 mode
1969     utf8_char used for utf8 character bytes, NULL if not relevant
1970     ptr next character in pattern
1971     options options bits
1972     cd contains pointers to tables etc.
1973    
1974     Returns: TRUE if possessifying is wanted
1975     */
1976    
1977     static BOOL
1978     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1979     const uschar *ptr, int options, compile_data *cd)
1980     {
1981     int next;
1982    
1983     /* Skip whitespace and comments in extended mode */
1984    
1985     if ((options & PCRE_EXTENDED) != 0)
1986     {
1987     for (;;)
1988     {
1989     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1990     if (*ptr == '#')
1991     {
1992     while (*(++ptr) != 0)
1993     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1994     }
1995     else break;
1996     }
1997     }
1998    
1999     /* If the next item is one that we can handle, get its value. A non-negative
2000     value is a character, a negative value is an escape value. */
2001    
2002     if (*ptr == '\\')
2003     {
2004     int temperrorcode = 0;
2005     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2006     if (temperrorcode != 0) return FALSE;
2007     ptr++; /* Point after the escape sequence */
2008     }
2009    
2010     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2011     {
2012     #ifdef SUPPORT_UTF8
2013     if (utf8) { GETCHARINC(next, ptr); } else
2014     #endif
2015     next = *ptr++;
2016     }
2017    
2018     else return FALSE;
2019    
2020     /* Skip whitespace and comments in extended mode */
2021    
2022     if ((options & PCRE_EXTENDED) != 0)
2023     {
2024     for (;;)
2025     {
2026     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2027     if (*ptr == '#')
2028     {
2029     while (*(++ptr) != 0)
2030     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2031     }
2032     else break;
2033     }
2034     }
2035    
2036     /* If the next thing is itself optional, we have to give up. */
2037    
2038     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2039     return FALSE;
2040    
2041     /* Now compare the next item with the previous opcode. If the previous is a
2042     positive single character match, "item" either contains the character or, if
2043     "item" is greater than 127 in utf8 mode, the character's bytes are in
2044     utf8_char. */
2045    
2046    
2047     /* Handle cases when the next item is a character. */
2048    
2049     if (next >= 0) switch(op_code)
2050     {
2051     case OP_CHAR:
2052     #ifdef SUPPORT_UTF8
2053     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2054     #endif
2055     return item != next;
2056    
2057     /* For CHARNC (caseless character) we must check the other case. If we have
2058     Unicode property support, we can use it to test the other case of
2059     high-valued characters. */
2060    
2061     case OP_CHARNC:
2062     #ifdef SUPPORT_UTF8
2063     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2064     #endif
2065     if (item == next) return FALSE;
2066     #ifdef SUPPORT_UTF8
2067     if (utf8)
2068     {
2069     unsigned int othercase;
2070     if (next < 128) othercase = cd->fcc[next]; else
2071     #ifdef SUPPORT_UCP
2072     othercase = _pcre_ucp_othercase((unsigned int)next);
2073     #else
2074     othercase = NOTACHAR;
2075     #endif
2076     return (unsigned int)item != othercase;
2077     }
2078     else
2079     #endif /* SUPPORT_UTF8 */
2080     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2081    
2082     /* For OP_NOT, "item" must be a single-byte character. */
2083    
2084     case OP_NOT:
2085     if (next < 0) return FALSE; /* Not a character */
2086     if (item == next) return TRUE;
2087     if ((options & PCRE_CASELESS) == 0) return FALSE;
2088     #ifdef SUPPORT_UTF8
2089     if (utf8)
2090     {
2091     unsigned int othercase;
2092     if (next < 128) othercase = cd->fcc[next]; else
2093     #ifdef SUPPORT_UCP
2094     othercase = _pcre_ucp_othercase(next);
2095     #else
2096     othercase = NOTACHAR;
2097     #endif
2098     return (unsigned int)item == othercase;
2099     }
2100     else
2101     #endif /* SUPPORT_UTF8 */
2102     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2103    
2104     case OP_DIGIT:
2105     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2106    
2107     case OP_NOT_DIGIT:
2108     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2109    
2110     case OP_WHITESPACE:
2111     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2112    
2113     case OP_NOT_WHITESPACE:
2114     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2115    
2116     case OP_WORDCHAR:
2117     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2118    
2119     case OP_NOT_WORDCHAR:
2120     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2121    
2122 ph10 180 case OP_HSPACE:
2123     case OP_NOT_HSPACE:
2124     switch(next)
2125     {
2126     case 0x09:
2127     case 0x20:
2128     case 0xa0:
2129     case 0x1680:
2130     case 0x180e:
2131     case 0x2000:
2132     case 0x2001:
2133     case 0x2002:
2134     case 0x2003:
2135     case 0x2004:
2136     case 0x2005:
2137     case 0x2006:
2138     case 0x2007:
2139     case 0x2008:
2140     case 0x2009:
2141     case 0x200A:
2142     case 0x202f:
2143     case 0x205f:
2144     case 0x3000:
2145     return op_code != OP_HSPACE;
2146     default:
2147     return op_code == OP_HSPACE;
2148     }
2149    
2150     case OP_VSPACE:
2151     case OP_NOT_VSPACE:
2152     switch(next)
2153     {
2154     case 0x0a:
2155     case 0x0b:
2156     case 0x0c:
2157     case 0x0d:
2158     case 0x85:
2159     case 0x2028:
2160     case 0x2029:
2161     return op_code != OP_VSPACE;
2162     default:
2163     return op_code == OP_VSPACE;
2164     }
2165    
2166 nigel 93 default:
2167     return FALSE;
2168     }
2169    
2170    
2171     /* Handle the case when the next item is \d, \s, etc. */
2172    
2173     switch(op_code)
2174     {
2175     case OP_CHAR:
2176     case OP_CHARNC:
2177     #ifdef SUPPORT_UTF8
2178     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2179     #endif
2180     switch(-next)
2181     {
2182     case ESC_d:
2183     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2184    
2185     case ESC_D:
2186     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2187    
2188     case ESC_s:
2189     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2190    
2191     case ESC_S:
2192     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2193    
2194     case ESC_w:
2195     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2196    
2197     case ESC_W:
2198     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2199 ph10 182
2200 ph10 180 case ESC_h:
2201     case ESC_H:
2202     switch(item)
2203     {
2204     case 0x09:
2205     case 0x20:
2206     case 0xa0:
2207     case 0x1680:
2208     case 0x180e:
2209     case 0x2000:
2210     case 0x2001:
2211     case 0x2002:
2212     case 0x2003:
2213     case 0x2004:
2214     case 0x2005:
2215     case 0x2006:
2216     case 0x2007:
2217     case 0x2008:
2218     case 0x2009:
2219     case 0x200A:
2220     case 0x202f:
2221     case 0x205f:
2222     case 0x3000:
2223     return -next != ESC_h;
2224     default:
2225     return -next == ESC_h;
2226 ph10 182 }
2227    
2228 ph10 180 case ESC_v:
2229     case ESC_V:
2230     switch(item)
2231     {
2232     case 0x0a:
2233     case 0x0b:
2234     case 0x0c:
2235     case 0x0d:
2236     case 0x85:
2237     case 0x2028:
2238     case 0x2029:
2239     return -next != ESC_v;
2240     default:
2241     return -next == ESC_v;
2242 ph10 182 }
2243 nigel 93
2244     default:
2245     return FALSE;
2246     }
2247    
2248     case OP_DIGIT:
2249 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2250     next == -ESC_h || next == -ESC_v;
2251 nigel 93
2252     case OP_NOT_DIGIT:
2253     return next == -ESC_d;
2254    
2255     case OP_WHITESPACE:
2256     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2257    
2258     case OP_NOT_WHITESPACE:
2259 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2260 nigel 93
2261 ph10 180 case OP_HSPACE:
2262     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2263    
2264     case OP_NOT_HSPACE:
2265     return next == -ESC_h;
2266 ph10 182
2267 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2268 ph10 182 case OP_VSPACE:
2269 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2270    
2271     case OP_NOT_VSPACE:
2272 ph10 182 return next == -ESC_v;
2273 ph10 180
2274 nigel 93 case OP_WORDCHAR:
2275 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2276 nigel 93
2277     case OP_NOT_WORDCHAR:
2278     return next == -ESC_w || next == -ESC_d;
2279 ph10 182
2280 nigel 93 default:
2281     return FALSE;
2282     }
2283    
2284     /* Control does not reach here */
2285     }
2286    
2287    
2288    
2289     /*************************************************
2290 nigel 77 * Compile one branch *
2291     *************************************************/
2292    
/* Scan the pattern, compiling it into a vector. If the options are
2294 nigel 77 changed during the branch, the pointer is used to change the external options
2295 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2296     to find out the amount of memory needed, as well as during the real compile
2297     phase. The value of lengthptr distinguishes the two phases.
2298 nigel 77
2299     Arguments:
2300     optionsptr pointer to the option bits
2301     codeptr points to the pointer to the current code point
2302     ptrptr points to the current pattern pointer
2303     errorcodeptr points to error code variable
2304     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2305     reqbyteptr set to the last literal character required, else < 0
2306     bcptr points to current branch chain
2307     cd contains pointers to tables etc.
2308 nigel 93 lengthptr NULL during the real compile phase
2309     points to length accumulator during pre-compile phase
2310 nigel 77
2311     Returns: TRUE on success
2312     FALSE, with *errorcodeptr set non-zero on error
2313     */
2314    
2315     static BOOL
2316 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2317     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2318     compile_data *cd, int *lengthptr)
2319 nigel 77 {
2320     int repeat_type, op_type;
2321     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2322     int bravalue = 0;
2323     int greedy_default, greedy_non_default;
2324     int firstbyte, reqbyte;
2325     int zeroreqbyte, zerofirstbyte;
2326     int req_caseopt, reqvary, tempreqvary;
2327     int options = *optionsptr;
2328     int after_manual_callout = 0;
2329 nigel 93 int length_prevgroup = 0;
2330 nigel 77 register int c;
2331     register uschar *code = *codeptr;
2332 nigel 93 uschar *last_code = code;
2333     uschar *orig_code = code;
2334 nigel 77 uschar *tempcode;
2335     BOOL inescq = FALSE;
2336     BOOL groupsetfirstbyte = FALSE;
2337     const uschar *ptr = *ptrptr;
2338     const uschar *tempptr;
2339     uschar *previous = NULL;
2340     uschar *previous_callout = NULL;
2341 nigel 93 uschar *save_hwm = NULL;
2342 nigel 77 uschar classbits[32];
2343    
2344     #ifdef SUPPORT_UTF8
2345     BOOL class_utf8;
2346     BOOL utf8 = (options & PCRE_UTF8) != 0;
2347     uschar *class_utf8data;
2348     uschar utf8_char[6];
2349     #else
2350     BOOL utf8 = FALSE;
2351 nigel 93 uschar *utf8_char = NULL;
2352 nigel 77 #endif
2353    
2354 nigel 93 #ifdef DEBUG
2355     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2356     #endif
2357    
2358 nigel 77 /* Set up the default and non-default settings for greediness */
2359    
2360     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2361     greedy_non_default = greedy_default ^ 1;
2362    
2363     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2364     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2365     matches a non-fixed char first char; reqbyte just remains unset if we never
2366     find one.
2367    
2368     When we hit a repeat whose minimum is zero, we may have to adjust these values
2369     to take the zero repeat into account. This is implemented by setting them to
2370     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2371     item types that can be repeated set these backoff variables appropriately. */
2372    
2373     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2374    
2375     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2376     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2377     value > 255. It is added into the firstbyte or reqbyte variables to record the
2378     case status of the value. This is used only for ASCII characters. */
2379    
2380     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2381    
2382     /* Switch on next character until the end of the branch */
2383    
2384     for (;; ptr++)
2385     {
2386     BOOL negate_class;
2387 ph10 264 BOOL should_flip_negation;
2388 nigel 77 BOOL possessive_quantifier;
2389     BOOL is_quantifier;
2390 nigel 93 BOOL is_recurse;
2391 ph10 180 BOOL reset_bracount;
2392 nigel 77 int class_charcount;
2393     int class_lastchar;
2394     int newoptions;
2395     int recno;
2396 ph10 172 int refsign;
2397 nigel 77 int skipbytes;
2398     int subreqbyte;
2399     int subfirstbyte;
2400 nigel 93 int terminator;
2401 nigel 77 int mclength;
2402     uschar mcbuffer[8];
2403    
2404 nigel 93 /* Get next byte in the pattern */
2405 nigel 77
2406     c = *ptr;
2407    
2408 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2409     previous cycle of this loop. */
2410    
2411     if (lengthptr != NULL)
2412     {
2413     #ifdef DEBUG
2414     if (code > cd->hwm) cd->hwm = code; /* High water info */
2415     #endif
2416     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2417     {
2418     *errorcodeptr = ERR52;
2419     goto FAILED;
2420     }
2421    
2422     /* There is at least one situation where code goes backwards: this is the
2423     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2424     the class is simply eliminated. However, it is created first, so we have to
2425     allow memory for it. Therefore, don't ever reduce the length at this point.
2426     */
2427    
2428     if (code < last_code) code = last_code;
2429 ph10 202
2430     /* Paranoid check for integer overflow */
2431    
2432     if (OFLOW_MAX - *lengthptr < code - last_code)
2433     {
2434     *errorcodeptr = ERR20;
2435     goto FAILED;
2436     }
2437    
2438 nigel 93 *lengthptr += code - last_code;
2439     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2440    
2441     /* If "previous" is set and it is not at the start of the work space, move
2442     it back to there, in order to avoid filling up the work space. Otherwise,
2443     if "previous" is NULL, reset the current code pointer to the start. */
2444    
2445     if (previous != NULL)
2446     {
2447     if (previous > orig_code)
2448     {
2449     memmove(orig_code, previous, code - previous);
2450     code -= previous - orig_code;
2451     previous = orig_code;
2452     }
2453     }
2454     else code = orig_code;
2455    
2456     /* Remember where this code item starts so we can pick up the length
2457     next time round. */
2458    
2459     last_code = code;
2460     }
2461    
2462     /* In the real compile phase, just check the workspace used by the forward
2463     reference list. */
2464    
2465     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2466     {
2467     *errorcodeptr = ERR52;
2468     goto FAILED;
2469     }
2470    
2471 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2472    
2473     if (inescq && c != 0)
2474     {
2475     if (c == '\\' && ptr[1] == 'E')
2476     {
2477     inescq = FALSE;
2478     ptr++;
2479     continue;
2480     }
2481     else
2482     {
2483     if (previous_callout != NULL)
2484     {
2485 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2486     complete_callout(previous_callout, ptr, cd);
2487 nigel 77 previous_callout = NULL;
2488     }
2489     if ((options & PCRE_AUTO_CALLOUT) != 0)
2490     {
2491     previous_callout = code;
2492     code = auto_callout(code, ptr, cd);
2493     }
2494     goto NORMAL_CHAR;
2495     }
2496     }
2497    
2498     /* Fill in length of a previous callout, except when the next thing is
2499     a quantifier. */
2500    
2501     is_quantifier = c == '*' || c == '+' || c == '?' ||
2502     (c == '{' && is_counted_repeat(ptr+1));
2503    
2504     if (!is_quantifier && previous_callout != NULL &&
2505     after_manual_callout-- <= 0)
2506     {
2507 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2508     complete_callout(previous_callout, ptr, cd);
2509 nigel 77 previous_callout = NULL;
2510     }
2511    
2512     /* In extended mode, skip white space and comments */
2513    
2514     if ((options & PCRE_EXTENDED) != 0)
2515     {
2516     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2517     if (c == '#')
2518     {
2519 nigel 93 while (*(++ptr) != 0)
2520 nigel 91 {
2521 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2522 nigel 91 }
2523 nigel 93 if (*ptr != 0) continue;
2524    
2525 nigel 91 /* Else fall through to handle end of string */
2526     c = 0;
2527 nigel 77 }
2528     }
2529    
2530     /* No auto callout for quantifiers. */
2531    
2532     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2533     {
2534     previous_callout = code;
2535     code = auto_callout(code, ptr, cd);
2536     }
2537    
2538     switch(c)
2539     {
2540 nigel 93 /* ===================================================================*/
2541     case 0: /* The branch terminates at string end */
2542     case '|': /* or | or ) */
2543 nigel 77 case ')':
2544     *firstbyteptr = firstbyte;
2545     *reqbyteptr = reqbyte;
2546     *codeptr = code;
2547     *ptrptr = ptr;
2548 nigel 93 if (lengthptr != NULL)
2549     {
2550 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2551     {
2552     *errorcodeptr = ERR20;
2553     goto FAILED;
2554     }
2555 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2556     DPRINTF((">> end branch\n"));
2557     }
2558 nigel 77 return TRUE;
2559    
2560 nigel 93
2561     /* ===================================================================*/
2562 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2563     the setting of any following char as a first character. */
2564    
2565     case '^':
2566     if ((options & PCRE_MULTILINE) != 0)
2567     {
2568     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2569     }
2570     previous = NULL;
2571     *code++ = OP_CIRC;
2572     break;
2573    
2574     case '$':
2575     previous = NULL;
2576     *code++ = OP_DOLL;
2577     break;
2578    
2579     /* There can never be a first char if '.' is first, whatever happens about
2580     repeats. The value of reqbyte doesn't change either. */
2581    
2582     case '.':
2583     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2584     zerofirstbyte = firstbyte;
2585     zeroreqbyte = reqbyte;
2586     previous = code;
2587     *code++ = OP_ANY;
2588     break;
2589    
2590 nigel 93
2591     /* ===================================================================*/
2592 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2593     32-byte bitmap of the permitted characters, except in the special case
2594     where there is only one such character. For negated classes, we build the
2595     map as usual, then invert it at the end. However, we use a different opcode
2596     so that data characters > 255 can be handled correctly.
2597 nigel 77
2598     If the class contains characters outside the 0-255 range, a different
2599     opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
2601     whether the bitmap is present, and whether this is a negated class or not.
2602     */
2603    
2604     case '[':
2605     previous = code;
2606    
2607     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2608     they are encountered at the top level, so we'll do that too. */
2609    
2610     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2611     check_posix_syntax(ptr, &tempptr, cd))
2612     {
2613     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2614     goto FAILED;
2615     }
2616    
2617 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2618 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2619 ph10 205 skip them too. This makes for compatibility with Perl. */
2620 ph10 208
2621 ph10 205 negate_class = FALSE;
2622     for (;;)
2623 nigel 77 {
2624     c = *(++ptr);
2625 ph10 205 if (c == '\\')
2626     {
2627 ph10 208 if (ptr[1] == 'E') ptr++;
2628 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2629 ph10 208 else break;
2630 ph10 205 }
2631     else if (!negate_class && c == '^')
2632     negate_class = TRUE;
2633     else break;
2634 ph10 208 }
2635 nigel 77
2636 ph10 264 /* If a class contains a negative special such as \S, we need to flip the
2637     negation flag at the end, so that support for characters > 255 works
2638     correctly (they are all included in the class). */
2639    
2640     should_flip_negation = FALSE;
2641    
2642 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2643 nigel 93 of just a single character (as long as it's < 256). However, For higher
2644     valued UTF-8 characters, we don't yet do any optimization. */
2645 nigel 77
2646     class_charcount = 0;
2647     class_lastchar = -1;
2648    
2649 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2650     temporary bit of memory, in case the class contains only 1 character (less
2651     than 256), because in that case the compiled code doesn't use the bit map.
2652     */
2653    
2654     memset(classbits, 0, 32 * sizeof(uschar));
2655    
2656 nigel 77 #ifdef SUPPORT_UTF8
2657     class_utf8 = FALSE; /* No chars >= 256 */
2658 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2659 nigel 77 #endif
2660    
2661     /* Process characters until ] is reached. By writing this as a "do" it
2662 nigel 93 means that an initial ] is taken as a data character. At the start of the
2663     loop, c contains the first byte of the character. */
2664 nigel 77
2665 nigel 93 if (c != 0) do
2666 nigel 77 {
2667 nigel 93 const uschar *oldptr;
2668    
2669 nigel 77 #ifdef SUPPORT_UTF8
2670     if (utf8 && c > 127)
2671     { /* Braces are required because the */
2672     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2673     }
2674     #endif
2675    
2676     /* Inside \Q...\E everything is literal except \E */
2677    
2678     if (inescq)
2679     {
2680 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2681 nigel 77 {
2682 nigel 93 inescq = FALSE; /* Reset literal state */
2683     ptr++; /* Skip the 'E' */
2684     continue; /* Carry on with next */
2685 nigel 77 }
2686 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2687 nigel 77 }
2688    
2689     /* Handle POSIX class names. Perl allows a negation extension of the
2690     form [:^name:]. A square bracket that doesn't match the syntax is
2691     treated as a literal. We also recognize the POSIX constructions
2692     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2693     5.6 and 5.8 do. */
2694    
2695     if (c == '[' &&
2696     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2697     check_posix_syntax(ptr, &tempptr, cd))
2698     {
2699     BOOL local_negate = FALSE;
2700 nigel 87 int posix_class, taboffset, tabopt;
2701 nigel 77 register const uschar *cbits = cd->cbits;
2702 nigel 87 uschar pbits[32];
2703 nigel 77
2704     if (ptr[1] != ':')
2705     {
2706     *errorcodeptr = ERR31;
2707     goto FAILED;
2708     }
2709    
2710     ptr += 2;
2711     if (*ptr == '^')
2712     {
2713     local_negate = TRUE;
2714 ph10 265 should_flip_negation = TRUE; /* Note negative special */
2715 nigel 77 ptr++;
2716     }
2717    
2718     posix_class = check_posix_name(ptr, tempptr - ptr);
2719     if (posix_class < 0)
2720     {
2721     *errorcodeptr = ERR30;
2722     goto FAILED;
2723     }
2724    
2725     /* If matching is caseless, upper and lower are converted to
2726     alpha. This relies on the fact that the class table starts with
2727     alpha, lower, upper as the first 3 entries. */
2728    
2729     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2730     posix_class = 0;
2731    
2732 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2733     because we may be adding and subtracting from it, and we don't want to
2734     subtract bits that may be in the main map already. At the end we or the
2735     result into the bit map that is being built. */
2736 nigel 77
2737     posix_class *= 3;
2738 nigel 87
2739     /* Copy in the first table (always present) */
2740    
2741     memcpy(pbits, cbits + posix_class_maps[posix_class],
2742     32 * sizeof(uschar));
2743    
2744     /* If there is a second table, add or remove it as required. */
2745    
2746     taboffset = posix_class_maps[posix_class + 1];
2747     tabopt = posix_class_maps[posix_class + 2];
2748    
2749     if (taboffset >= 0)
2750 nigel 77 {
2751 nigel 87 if (tabopt >= 0)
2752     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2753 nigel 77 else
2754 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2755 nigel 77 }
2756    
2757 nigel 87 /* Now see if we need to remove any special characters. An option
2758     value of 1 removes vertical space and 2 removes underscore. */
2759    
2760     if (tabopt < 0) tabopt = -tabopt;
2761     if (tabopt == 1) pbits[1] &= ~0x3c;
2762     else if (tabopt == 2) pbits[11] &= 0x7f;
2763    
2764     /* Add the POSIX table or its complement into the main table that is
2765     being built and we are done. */
2766    
2767     if (local_negate)
2768     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2769     else
2770     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2771    
2772 nigel 77 ptr = tempptr + 1;
2773     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2774     continue; /* End of POSIX syntax handling */
2775     }
2776    
2777     /* Backslash may introduce a single character, or it may introduce one
2778 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2779     case. Inside a class (and only there) it is treated as backspace.
2780     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2781 ph10 205 to 'or' into the one we are building. We assume they have more than one
2782 nigel 77 character in them, so set class_charcount bigger than one. */
2783    
2784     if (c == '\\')
2785     {
2786 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2787     if (*errorcodeptr != 0) goto FAILED;
2788 nigel 77
2789     if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2790     else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2791 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2792 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2793     {
2794     if (ptr[1] == '\\' && ptr[2] == 'E')
2795     {
2796     ptr += 2; /* avoid empty string */
2797     }
2798     else inescq = TRUE;
2799     continue;
2800     }
2801 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2802 nigel 77
2803     if (c < 0)
2804     {
2805     register const uschar *cbits = cd->cbits;
2806     class_charcount += 2; /* Greater than 1 is what matters */
2807 nigel 93
2808     /* Save time by not doing this in the pre-compile phase. */
2809    
2810     if (lengthptr == NULL) switch (-c)
2811 nigel 77 {
2812     case ESC_d:
2813     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2814     continue;
2815    
2816     case ESC_D:
2817 ph10 264 should_flip_negation = TRUE;
2818 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2819     continue;
2820    
2821     case ESC_w:
2822     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2823     continue;
2824    
2825     case ESC_W:
2826 ph10 264 should_flip_negation = TRUE;
2827 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2828     continue;
2829    
2830     case ESC_s:
2831     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2832     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2833     continue;
2834    
2835     case ESC_S:
2836 ph10 264 should_flip_negation = TRUE;
2837 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2838     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2839     continue;
2840    
2841 nigel 93 case ESC_E: /* Perl ignores an orphan \E */
2842     continue;
2843 ph10 180
2844 nigel 93 default: /* Not recognized; fall through */
2845     break; /* Need "default" setting to stop compiler warning. */
2846     }
2847    
2848     /* In the pre-compile phase, just do the recognition. */
2849    
2850     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2851     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2852 ph10 180
2853 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2854     they use extra memory. */
2855 ph10 180
2856 ph10 178 if (-c == ESC_h)
2857     {
2858     SETBIT(classbits, 0x09); /* HT */
2859     SETBIT(classbits, 0x20); /* SPACE */
2860 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2861 ph10 178 #ifdef SUPPORT_UTF8
2862     if (utf8)
2863 ph10 180 {
2864 ph10 178 class_utf8 = TRUE;
2865     *class_utf8data++ = XCL_SINGLE;
2866 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2867 ph10 178 *class_utf8data++ = XCL_SINGLE;
2868 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2869     *class_utf8data++ = XCL_RANGE;
2870     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2871     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2872 ph10 178 *class_utf8data++ = XCL_SINGLE;
2873 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2874 ph10 178 *class_utf8data++ = XCL_SINGLE;
2875 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2876 ph10 178 *class_utf8data++ = XCL_SINGLE;
2877 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2878     }
2879     #endif
2880     continue;
2881     }
2882 nigel 93
2883 ph10 178 if (-c == ESC_H)
2884     {
2885     for (c = 0; c < 32; c++)
2886     {
2887     int x = 0xff;
2888     switch (c)
2889 ph10 180 {
2890 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2891     case 0x20/8: x ^= 1 << (0x20%8); break;
2892     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2893     default: break;
2894     }
2895     classbits[c] |= x;
2896 ph10 180 }
2897    
2898 ph10 178 #ifdef SUPPORT_UTF8
2899     if (utf8)
2900 ph10 180 {
2901 ph10 178 class_utf8 = TRUE;
2902 ph10 180 *class_utf8data++ = XCL_RANGE;
2903     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2904     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2905     *class_utf8data++ = XCL_RANGE;
2906     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2907     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2908     *class_utf8data++ = XCL_RANGE;
2909     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2910     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2911     *class_utf8data++ = XCL_RANGE;
2912     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2913     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2914     *class_utf8data++ = XCL_RANGE;
2915     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2916     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2917     *class_utf8data++ = XCL_RANGE;
2918     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2919     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2920     *class_utf8data++ = XCL_RANGE;
2921     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2922     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2923     }
2924     #endif
2925     continue;
2926     }
2927 ph10 178
2928     if (-c == ESC_v)
2929     {
2930     SETBIT(classbits, 0x0a); /* LF */
2931     SETBIT(classbits, 0x0b); /* VT */
2932 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2933     SETBIT(classbits, 0x0d); /* CR */
2934     SETBIT(classbits, 0x85); /* NEL */
2935 ph10 178 #ifdef SUPPORT_UTF8
2936     if (utf8)
2937 ph10 180 {
2938 ph10 178 class_utf8 = TRUE;
2939 ph10 180 *class_utf8data++ = XCL_RANGE;
2940     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2941     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2942     }
2943     #endif
2944     continue;
2945     }
2946 ph10 178
2947     if (-c == ESC_V)
2948     {
2949     for (c = 0; c < 32; c++)
2950     {
2951     int x = 0xff;
2952     switch (c)
2953 ph10 180 {
2954 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2955     x ^= 1 << (0x0b%8);
2956     x ^= 1 << (0x0c%8);
2957 ph10 180 x ^= 1 << (0x0d%8);
2958 ph10 178 break;
2959     case 0x85/8: x ^= 1 << (0x85%8); break;
2960     default: break;
2961     }
2962     classbits[c] |= x;
2963 ph10 180 }
2964    
2965 ph10 178 #ifdef SUPPORT_UTF8
2966     if (utf8)
2967 ph10 180 {
2968 ph10 178 class_utf8 = TRUE;
2969 ph10 180 *class_utf8data++ = XCL_RANGE;
2970     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2971     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2972     *class_utf8data++ = XCL_RANGE;
2973     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2974     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2975     }
2976     #endif
2977     continue;
2978     }
2979 ph10 178
2980 nigel 93 /* We need to deal with \P and \p in both phases. */
2981    
2982 nigel 77 #ifdef SUPPORT_UCP
2983 nigel 93 if (-c == ESC_p || -c == ESC_P)
2984     {
2985     BOOL negated;
2986     int pdata;
2987     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2988     if (ptype < 0) goto FAILED;
2989     class_utf8 = TRUE;
2990     *class_utf8data++ = ((-c == ESC_p) != negated)?
2991     XCL_PROP : XCL_NOTPROP;
2992     *class_utf8data++ = ptype;
2993     *class_utf8data++ = pdata;
2994     class_charcount -= 2; /* Not a < 256 character */
2995 nigel 77 continue;
2996 nigel 93 }
2997 nigel 77 #endif
2998 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
2999     strict mode. By default, for compatibility with Perl, they are
3000     treated as literals. */
3001 nigel 77
3002 nigel 93 if ((options & PCRE_EXTRA) != 0)
3003     {
3004     *errorcodeptr = ERR7;
3005     goto FAILED;
3006     }
3007 nigel 77
3008 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3009     c = *ptr; /* Get the final character and fall through */
3010 nigel 77 }
3011    
3012     /* Fall through if we have a single character (c >= 0). This may be
3013 nigel 93 greater than 256 in UTF-8 mode. */
3014 nigel 77
3015     } /* End of backslash handling */
3016    
3017     /* A single character may be followed by '-' to form a range. However,
3018     Perl does not permit ']' to be the end of the range. A '-' character
3019 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3020     entirely. The code for handling \Q and \E is messy. */
3021 nigel 77
3022 nigel 93 CHECK_RANGE:
3023     while (ptr[1] == '\\' && ptr[2] == 'E')
3024 nigel 77 {
3025 nigel 93 inescq = FALSE;
3026     ptr += 2;
3027     }
3028    
3029     oldptr = ptr;
3030 ph10 231
3031 ph10 230 /* Remember \r or \n */
3032 ph10 231
3033     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3034    
3035 ph10 230 /* Check for range */
3036 nigel 93
3037     if (!inescq && ptr[1] == '-')
3038     {
3039 nigel 77 int d;
3040     ptr += 2;
3041 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3042 nigel 77
3043 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3044     mode. */
3045    
3046     while (*ptr == '\\' && ptr[1] == 'Q')
3047     {
3048     ptr += 2;
3049     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3050     inescq = TRUE;
3051     break;
3052     }
3053    
3054     if (*ptr == 0 || (!inescq && *ptr == ']'))
3055     {
3056     ptr = oldptr;
3057     goto LONE_SINGLE_CHARACTER;
3058     }
3059    
3060 nigel 77 #ifdef SUPPORT_UTF8
3061     if (utf8)
3062     { /* Braces are required because the */
3063     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3064     }
3065     else
3066     #endif
3067     d = *ptr; /* Not UTF-8 mode */
3068    
3069     /* The second part of a range can be a single-character escape, but
3070     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3071     in such circumstances. */
3072    
3073 nigel 93 if (!inescq && d == '\\')
3074 nigel 77 {
3075 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3076     if (*errorcodeptr != 0) goto FAILED;
3077 nigel 77
3078 nigel 93 /* \b is backspace; \X is literal X; \R is literal R; any other
3079     special means the '-' was literal */
3080 nigel 77
3081     if (d < 0)
3082     {
3083     if (d == -ESC_b) d = '\b';
3084 nigel 93 else if (d == -ESC_X) d = 'X';
3085     else if (d == -ESC_R) d = 'R'; else
3086 nigel 77 {
3087 nigel 93 ptr = oldptr;
3088 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3089     }
3090     }
3091     }
3092    
3093 nigel 93 /* Check that the two values are in the correct order. Optimize
3094     one-character ranges */
3095 nigel 77
3096 nigel 93 if (d < c)
3097     {
3098     *errorcodeptr = ERR8;
3099     goto FAILED;
3100     }
3101    
3102 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3103    
3104 ph10 230 /* Remember \r or \n */
3105 ph10 231
3106     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3107    
3108 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3109     matching, we have to use an XCLASS with extra data items. Caseless
3110     matching for characters > 127 is available only if UCP support is
3111     available. */
3112    
3113     #ifdef SUPPORT_UTF8
3114     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3115     {
3116     class_utf8 = TRUE;
3117    
3118     /* With UCP support, we can find the other case equivalents of
3119     the relevant characters. There may be several ranges. Optimize how
3120     they fit with the basic range. */
3121    
3122     #ifdef SUPPORT_UCP
3123     if ((options & PCRE_CASELESS) != 0)
3124     {
3125 nigel 93 unsigned int occ, ocd;
3126     unsigned int cc = c;
3127     unsigned int origd = d;
3128 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3129     {
3130 ph10 180 if (occ >= (unsigned int)c &&
3131     ocd <= (unsigned int)d)
3132 ph10 176 continue; /* Skip embedded ranges */
3133 nigel 77
3134 ph10 180 if (occ < (unsigned int)c &&
3135 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3136 nigel 77 { /* if there is overlap, */
3137     c = occ; /* noting that if occ < c */
3138     continue; /* we can't have ocd > d */
3139     } /* because a subrange is */
3140 ph10 180 if (ocd > (unsigned int)d &&
3141 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3142 nigel 77 { /* the basic range. */
3143     d = ocd;
3144     continue;
3145     }
3146    
3147     if (occ == ocd)
3148     {
3149     *class_utf8data++ = XCL_SINGLE;
3150     }
3151     else
3152     {
3153     *class_utf8data++ = XCL_RANGE;
3154     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3155     }
3156     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3157     }
3158     }
3159     #endif /* SUPPORT_UCP */
3160    
3161     /* Now record the original range, possibly modified for UCP caseless
3162     overlapping ranges. */
3163    
3164     *class_utf8data++ = XCL_RANGE;
3165     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3166     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3167    
3168     /* With UCP support, we are done. Without UCP support, there is no
3169     caseless matching for UTF-8 characters > 127; we can use the bit map
3170     for the smaller ones. */
3171    
3172     #ifdef SUPPORT_UCP
3173     continue; /* With next character in the class */
3174     #else
3175     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3176    
3177     /* Adjust upper limit and fall through to set up the map */
3178    
3179     d = 127;
3180    
3181     #endif /* SUPPORT_UCP */
3182     }
3183     #endif /* SUPPORT_UTF8 */
3184    
3185     /* We use the bit map for all cases when not in UTF-8 mode; else
3186     ranges that lie entirely within 0-127 when there is UCP support; else
3187     for partial ranges without UCP support. */
3188    
3189 nigel 93 class_charcount += d - c + 1;
3190     class_lastchar = d;
3191    
3192     /* We can save a bit of time by skipping this in the pre-compile. */
3193    
3194     if (lengthptr == NULL) for (; c <= d; c++)
3195 nigel 77 {
3196     classbits[c/8] |= (1 << (c&7));
3197     if ((options & PCRE_CASELESS) != 0)
3198     {
3199     int uc = cd->fcc[c]; /* flip case */
3200     classbits[uc/8] |= (1 << (uc&7));
3201     }
3202     }
3203    
3204     continue; /* Go get the next char in the class */
3205     }
3206    
3207     /* Handle a lone single character - we can get here for a normal
3208     non-escape char, or after \ that introduces a single character or for an
3209     apparent range that isn't. */
3210    
3211     LONE_SINGLE_CHARACTER:
3212 ph10 231
3213 nigel 77 /* Handle a character that cannot go in the bit map */
3214    
3215     #ifdef SUPPORT_UTF8
3216     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3217     {
3218     class_utf8 = TRUE;
3219     *class_utf8data++ = XCL_SINGLE;
3220     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3221    
3222     #ifdef SUPPORT_UCP
3223     if ((options & PCRE_CASELESS) != 0)
3224     {
3225 nigel 93 unsigned int othercase;
3226     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3227 nigel 77 {
3228     *class_utf8data++ = XCL_SINGLE;
3229     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3230     }
3231     }
3232     #endif /* SUPPORT_UCP */
3233    
3234     }
3235     else
3236     #endif /* SUPPORT_UTF8 */
3237    
3238     /* Handle a single-byte character */
3239     {
3240     classbits[c/8] |= (1 << (c&7));
3241     if ((options & PCRE_CASELESS) != 0)
3242     {
3243     c = cd->fcc[c]; /* flip case */
3244     classbits[c/8] |= (1 << (c&7));
3245     }
3246     class_charcount++;
3247     class_lastchar = c;
3248     }
3249     }
3250    
3251 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3252 nigel 77
3253 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3254 nigel 77
3255 nigel 93 if (c == 0) /* Missing terminating ']' */
3256     {
3257     *errorcodeptr = ERR6;
3258     goto FAILED;
3259     }
3260 ph10 231
3261    
3262 ph10 230 /* This code has been disabled because it would mean that \s counts as
3263     an explicit \r or \n reference, and that's not really what is wanted. Now
3264     we set the flag only if there is a literal "\r" or "\n" in the class. */
3265 ph10 227
3266 ph10 230 #if 0
3267 ph10 226 /* Remember whether \r or \n are in this class */
3268 ph10 227
3269 ph10 226 if (negate_class)
3270     {
3271 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3272 ph10 226 }
3273     else
3274     {
3275 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3276 ph10 227 }
3277 ph10 230 #endif
3278 ph10 227
3279 ph10 231
3280 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3281 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3282     use of \p or \P, in other words, no use of any XCLASS features, we can
3283     optimize.
3284    
3285 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3286     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3287     operate on single-bytes only. This is an historical hangover. Maybe one day
3288     we can tidy these opcodes to handle multi-byte characters.
3289 nigel 77
3290     The optimization throws away the bit map. We turn the item into a
3291     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3292     that OP_NOT does not support multibyte characters. In the positive case, it
3293     can cause firstbyte to be set. Otherwise, there can be no first char if
3294     this item is first, whatever repeat count may follow. In the case of
3295     reqbyte, save the previous value for reinstating. */
3296    
3297     #ifdef SUPPORT_UTF8
3298 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3299 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3300 nigel 77 #else
3301     if (class_charcount == 1)
3302     #endif
3303     {
3304     zeroreqbyte = reqbyte;
3305    
3306     /* The OP_NOT opcode works on one-byte characters only. */
3307    
3308     if (negate_class)
3309     {
3310     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3311     zerofirstbyte = firstbyte;
3312     *code++ = OP_NOT;
3313     *code++ = class_lastchar;
3314     break;
3315     }
3316    
3317     /* For a single, positive character, get the value into mcbuffer, and
3318     then we can handle this with the normal one-character code. */
3319    
3320     #ifdef SUPPORT_UTF8
3321     if (utf8 && class_lastchar > 127)
3322     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3323     else
3324     #endif
3325     {
3326     mcbuffer[0] = class_lastchar;
3327     mclength = 1;
3328     }
3329     goto ONE_CHAR;
3330     } /* End of 1-char optimization */
3331    
3332     /* The general case - not the one-char optimization. If this is the first
3333     thing in the branch, there can be no first char setting, whatever the
3334     repeat count. Any reqbyte setting must remain unchanged after any kind of
3335     repeat. */
3336    
3337     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3338     zerofirstbyte = firstbyte;
3339     zeroreqbyte = reqbyte;
3340    
3341     /* If there are characters with values > 255, we have to compile an
3342 ph10 264 extended class, with its own opcode, unless there was a negated special
3343     such as \S in the class, because in that case all characters > 255 are in
3344     the class, so any that were explicitly given as well can be ignored. If
3345     (when there are explicit characters > 255 that must be listed) there are no
3346     characters < 256, we can omit the bitmap in the actual compiled code. */
3347 nigel 77
3348     #ifdef SUPPORT_UTF8
3349 ph10 264 if (class_utf8 && !should_flip_negation)
3350 nigel 77 {
3351     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3352     *code++ = OP_XCLASS;
3353     code += LINK_SIZE;
3354     *code = negate_class? XCL_NOT : 0;
3355    
3356 nigel 93 /* If the map is required, move up the extra data to make room for it;
3357     otherwise just move the code pointer to the end of the extra data. */
3358 nigel 77
3359     if (class_charcount > 0)
3360     {
3361     *code++ |= XCL_MAP;
3362 nigel 93 memmove(code + 32, code, class_utf8data - code);
3363 nigel 77 memcpy(code, classbits, 32);
3364 nigel 93 code = class_utf8data + 32;
3365 nigel 77 }
3366 nigel 93 else code = class_utf8data;
3367 nigel 77
3368     /* Now fill in the complete length of the item */
3369    
3370     PUT(previous, 1, code - previous);
3371     break; /* End of class handling */
3372     }
3373     #endif
3374    
3375 ph10 264 /* If there are no characters > 255, set the opcode to OP_CLASS or
3376     OP_NCLASS, depending on whether the whole class was negated and whether
3377     there were negative specials such as \S in the class. Then copy the 32-byte
3378     map into the code vector, negating it if necessary. */
3379    
3380     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3381 nigel 77 if (negate_class)
3382     {
3383 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3384     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3385 nigel 77 }
3386     else
3387     {
3388     memcpy(code, classbits, 32);
3389     }
3390     code += 32;
3391     break;
3392    
3393 nigel 93
3394     /* ===================================================================*/
3395 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3396     has been tested above. */
3397    
3398     case '{':
3399     if (!is_quantifier) goto NORMAL_CHAR;
3400     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3401     if (*errorcodeptr != 0) goto FAILED;
3402     goto REPEAT;
3403    
3404     case '*':
3405     repeat_min = 0;
3406     repeat_max = -1;
3407     goto REPEAT;
3408    
3409     case '+':
3410     repeat_min = 1;
3411     repeat_max = -1;
3412     goto REPEAT;
3413    
3414     case '?':
3415     repeat_min = 0;
3416     repeat_max = 1;
3417    
3418     REPEAT:
3419     if (previous == NULL)
3420     {
3421     *errorcodeptr = ERR9;
3422     goto FAILED;
3423     }
3424    
3425     if (repeat_min == 0)
3426     {
3427     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3428     reqbyte = zeroreqbyte; /* Ditto */
3429     }
3430    
3431     /* Remember whether this is a variable length repeat */
3432    
3433     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3434    
3435     op_type = 0; /* Default single-char op codes */
3436     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3437    
3438     /* Save start of previous item, in case we have to move it up to make space
3439     for an inserted OP_ONCE for the additional '+' extension. */
3440    
3441     tempcode = previous;
3442    
3443     /* If the next character is '+', we have a possessive quantifier. This
3444     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3445     If the next character is '?' this is a minimizing repeat, by default,
3446     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3447     repeat type to the non-default. */
3448    
3449     if (ptr[1] == '+')
3450     {
3451     repeat_type = 0; /* Force greedy */
3452     possessive_quantifier = TRUE;
3453     ptr++;
3454     }
3455     else if (ptr[1] == '?')
3456     {
3457     repeat_type = greedy_non_default;
3458     ptr++;
3459     }
3460     else repeat_type = greedy_default;
3461    
3462     /* If previous was a character match, abolish the item and generate a
3463     repeat item instead. If a char item has a minimum of more than one, ensure
3464     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3465     the first thing in a branch because the x will have gone into firstbyte
3466     instead. */
3467    
3468     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3469     {
3470     /* Deal with UTF-8 characters that take up more than one byte. It's
3471     easier to write this out separately than try to macrify it. Use c to
3472     hold the length of the character in bytes, plus 0x80 to flag that it's a
3473     length rather than a small character. */
3474    
3475     #ifdef SUPPORT_UTF8
3476     if (utf8 && (code[-1] & 0x80) != 0)
3477     {
3478     uschar *lastchar = code - 1;
3479     while((*lastchar & 0xc0) == 0x80) lastchar--;
3480     c = code - lastchar; /* Length of UTF-8 character */
3481     memcpy(utf8_char, lastchar, c); /* Save the char */
3482     c |= 0x80; /* Flag c as a length */
3483     }
3484     else
3485     #endif
3486    
3487     /* Handle the case of a single byte - either with no UTF8 support, or
3488     with UTF-8 disabled, or for a UTF-8 character < 128. */
3489    
3490     {
3491     c = code[-1];
3492     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3493     }
3494    
3495 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3496     the line is something that cannot possibly match this character. If so,
3497     automatically possessifying this item gains some performance in the case
3498     where the match fails. */
3499    
3500     if (!possessive_quantifier &&
3501     repeat_max < 0 &&
3502     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3503     options, cd))
3504     {
3505     repeat_type = 0; /* Force greedy */
3506     possessive_quantifier = TRUE;
3507     }
3508    
3509 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3510     }
3511    
3512     /* If previous was a single negated character ([^a] or similar), we use
3513     one of the special opcodes, replacing it. The code is shared with single-
3514     character repeats by setting op_type to add a suitable offset into
3515 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3516     currently used only for single-byte chars. */
3517 nigel 77
3518     else if (*previous == OP_NOT)
3519     {
3520     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3521     c = previous[1];
3522 nigel 93 if (!possessive_quantifier &&
3523     repeat_max < 0 &&
3524     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3525     {
3526     repeat_type = 0; /* Force greedy */
3527     possessive_quantifier = TRUE;
3528     }
3529 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3530     }
3531    
3532     /* If previous was a character type match (\d or similar), abolish it and
3533     create a suitable repeat item. The code is shared with single-character
3534     repeats by setting op_type to add a suitable offset into repeat_type. Note
3535     that the Unicode property types will be present only when SUPPORT_UCP is
3536     defined, but we don't wrap the little bits of code here because it just
3537     makes it horribly messy. */
3538    
3539     else if (*previous < OP_EODN)
3540     {
3541     uschar *oldcode;
3542 nigel 87 int prop_type, prop_value;
3543 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3544     c = *previous;
3545    
3546 nigel 93 if (!possessive_quantifier &&
3547     repeat_max < 0 &&
3548     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3549     {
3550     repeat_type = 0; /* Force greedy */
3551     possessive_quantifier = TRUE;
3552     }
3553    
3554 nigel 77 OUTPUT_SINGLE_REPEAT:
3555 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3556     {
3557     prop_type = previous[1];
3558     prop_value = previous[2];
3559     }
3560     else prop_type = prop_value = -1;
3561 nigel 77
3562     oldcode = code;
3563     code = previous; /* Usually overwrite previous item */
3564    
3565     /* If the maximum is zero then the minimum must also be zero; Perl allows
3566     this case, so we do too - by simply omitting the item altogether. */
3567    
3568     if (repeat_max == 0) goto END_REPEAT;
3569    
3570     /* All real repeats make it impossible to handle partial matching (maybe
3571     one day we will be able to remove this restriction). */
3572    
3573 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3574 nigel 77
3575     /* Combine the op_type with the repeat_type */
3576    
3577     repeat_type += op_type;
3578    
3579     /* A minimum of zero is handled either as the special case * or ?, or as
3580     an UPTO, with the maximum given. */
3581    
3582     if (repeat_min == 0)
3583     {
3584     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3585     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3586     else
3587     {
3588     *code++ = OP_UPTO + repeat_type;
3589     PUT2INC(code, 0, repeat_max);
3590     }
3591     }
3592    
3593     /* A repeat minimum of 1 is optimized into some special cases. If the
3594 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3595 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3596     one less than the maximum. */
3597    
3598     else if (repeat_min == 1)
3599     {
3600     if (repeat_max == -1)
3601     *code++ = OP_PLUS + repeat_type;
3602     else
3603     {
3604     code = oldcode; /* leave previous item in place */
3605     if (repeat_max == 1) goto END_REPEAT;
3606     *code++ = OP_UPTO + repeat_type;
3607     PUT2INC(code, 0, repeat_max - 1);
3608     }
3609     }
3610    
3611     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3612     handled as an EXACT followed by an UPTO. */
3613    
3614     else
3615     {
3616     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3617     PUT2INC(code, 0, repeat_min);
3618    
3619     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3620     we have to insert the character for the previous code. For a repeated
3621 nigel 87 Unicode property match, there are two extra bytes that define the
3622 nigel 77 required property. In UTF-8 mode, long characters have their length in
3623     c, with the 0x80 bit as a flag. */
3624    
3625     if (repeat_max < 0)
3626     {
3627     #ifdef SUPPORT_UTF8
3628     if (utf8 && c >= 128)
3629     {
3630     memcpy(code, utf8_char, c & 7);
3631     code += c & 7;
3632     }
3633     else
3634     #endif
3635     {
3636     *code++ = c;
3637 nigel 87 if (prop_type >= 0)
3638     {
3639     *code++ = prop_type;
3640     *code++ = prop_value;
3641     }
3642 nigel 77 }
3643     *code++ = OP_STAR + repeat_type;
3644     }
3645    
3646     /* Else insert an UPTO if the max is greater than the min, again
3647 nigel 93 preceded by the character, for the previously inserted code. If the
3648     UPTO is just for 1 instance, we can use QUERY instead. */
3649 nigel 77
3650     else if (repeat_max != repeat_min)
3651     {
3652     #ifdef SUPPORT_UTF8
3653     if (utf8 && c >= 128)
3654     {
3655     memcpy(code, utf8_char, c & 7);
3656     code += c & 7;
3657     }
3658     else
3659     #endif
3660     *code++ = c;
3661 nigel 87 if (prop_type >= 0)
3662     {
3663     *code++ = prop_type;
3664     *code++ = prop_value;
3665     }
3666 nigel 77 repeat_max -= repeat_min;
3667 nigel 93
3668     if (repeat_max == 1)
3669     {
3670     *code++ = OP_QUERY + repeat_type;
3671     }
3672     else
3673     {
3674     *code++ = OP_UPTO + repeat_type;
3675     PUT2INC(code, 0, repeat_max);
3676     }
3677 nigel 77 }
3678     }
3679    
3680     /* The character or character type itself comes last in all cases. */
3681    
3682     #ifdef SUPPORT_UTF8
3683     if (utf8 && c >= 128)
3684     {
3685     memcpy(code, utf8_char, c & 7);
3686     code += c & 7;
3687     }
3688     else
3689     #endif
3690     *code++ = c;
3691    
3692 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3693     define the required property. */
3694 nigel 77
3695     #ifdef SUPPORT_UCP
3696 nigel 87 if (prop_type >= 0)
3697     {
3698     *code++ = prop_type;
3699     *code++ = prop_value;
3700     }
3701 nigel 77 #endif
3702     }
3703    
3704     /* If previous was a character class or a back reference, we put the repeat
3705     stuff after it, but just skip the item if the repeat was {0,0}. */
3706    
3707     else if (*previous == OP_CLASS ||
3708     *previous == OP_NCLASS ||
3709     #ifdef SUPPORT_UTF8
3710     *previous == OP_XCLASS ||
3711     #endif
3712     *previous == OP_REF)
3713     {
3714     if (repeat_max == 0)
3715     {
3716     code = previous;
3717     goto END_REPEAT;
3718     }
3719    
3720     /* All real repeats make it impossible to handle partial matching (maybe
3721     one day we will be able to remove this restriction). */
3722    
3723 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3724 nigel 77
3725     if (repeat_min == 0 && repeat_max == -1)
3726     *code++ = OP_CRSTAR + repeat_type;
3727     else if (repeat_min == 1 && repeat_max == -1)
3728     *code++ = OP_CRPLUS + repeat_type;
3729     else if (repeat_min == 0 && repeat_max == 1)
3730     *code++ = OP_CRQUERY + repeat_type;
3731     else
3732     {
3733     *code++ = OP_CRRANGE + repeat_type;
3734     PUT2INC(code, 0, repeat_min);
3735     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3736     PUT2INC(code, 0, repeat_max);
3737     }
3738     }
3739    
3740     /* If previous was a bracket group, we may have to replicate it in certain
3741     cases. */
3742    
3743 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3744     *previous == OP_ONCE || *previous == OP_COND)
3745 nigel 77 {
3746     register int i;
3747     int ketoffset = 0;
3748     int len = code - previous;
3749     uschar *bralink = NULL;
3750    
3751 nigel 93 /* Repeating a DEFINE group is pointless */
3752    
3753     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3754     {
3755     *errorcodeptr = ERR55;
3756     goto FAILED;
3757     }
3758    
3759 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3760     by scanning through from the start, and compute the offset back to it
3761     from the current code pointer. There may be an OP_OPT setting following
3762     the final KET, so we can't find the end just by going back from the code
3763     pointer. */
3764    
3765     if (repeat_max == -1)
3766     {
3767     register uschar *ket = previous;
3768     do ket += GET(ket, 1); while (*ket != OP_KET);
3769     ketoffset = code - ket;
3770     }
3771    
3772     /* The case of a zero minimum is special because of the need to stick
3773     OP_BRAZERO in front of it, and because the group appears once in the
3774     data, whereas in other cases it appears the minimum number of times. For
3775     this reason, it is simplest to treat this case separately, as otherwise
3776     the code gets far too messy. There are several special subcases when the
3777     minimum is zero. */
3778    
3779     if (repeat_min == 0)
3780     {
3781     /* If the maximum is also zero, we just omit the group from the output
3782     altogether. */
3783    
3784     if (repeat_max == 0)
3785     {
3786     code = previous;
3787     goto END_REPEAT;
3788     }
3789    
3790     /* If the maximum is 1 or unlimited, we just have to stick in the
3791     BRAZERO and do no more at this point. However, we do need to adjust
3792     any OP_RECURSE calls inside the group that refer to the group itself or
3793 nigel 93 any internal or forward referenced group, because the offset is from
3794     the start of the whole regex. Temporarily terminate the pattern while
3795     doing this. */
3796 nigel 77
3797     if (repeat_max <= 1)
3798     {
3799     *code = OP_END;
3800 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3801 nigel 77 memmove(previous+1, previous, len);
3802     code++;
3803     *previous++ = OP_BRAZERO + repeat_type;
3804     }
3805    
3806     /* If the maximum is greater than 1 and limited, we have to replicate
3807     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3808     The first one has to be handled carefully because it's the original
3809     copy, which has to be moved up. The remainder can be handled by code
3810     that is common with the non-zero minimum case below. We have to
3811     adjust the value of repeat_max, since one less copy is required. Once
3812     again, we may have to adjust any OP_RECURSE calls inside the group. */
3813    
3814     else
3815     {
3816     int offset;
3817     *code = OP_END;
3818 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3819 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3820     code += 2 + LINK_SIZE;
3821     *previous++ = OP_BRAZERO + repeat_type;
3822     *previous++ = OP_BRA;
3823    
3824     /* We chain together the bracket offset fields that have to be
3825     filled in later when the ends of the brackets are reached. */
3826    
3827     offset = (bralink == NULL)? 0 : previous - bralink;
3828     bralink = previous;
3829     PUTINC(previous, 0, offset);
3830     }
3831    
3832     repeat_max--;
3833     }
3834    
3835     /* If the minimum is greater than zero, replicate the group as many
3836     times as necessary, and adjust the maximum to the number of subsequent
3837     copies that we need. If we set a first char from the group, and didn't
3838 nigel 93 set a required char, copy the latter from the former. If there are any
3839     forward reference subroutine calls in the group, there will be entries on
3840     the workspace list; replicate these with an appropriate increment. */
3841 nigel 77
3842     else
3843     {
3844     if (repeat_min > 1)
3845     {
3846 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3847 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3848     potential integer overflow. */
3849 nigel 93
3850     if (lengthptr != NULL)
3851 ph10 202 {
3852     int delta = (repeat_min - 1)*length_prevgroup;
3853     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3854     (double)INT_MAX ||
3855     OFLOW_MAX - *lengthptr < delta)
3856     {
3857     *errorcodeptr = ERR20;
3858     goto FAILED;
3859     }
3860     *lengthptr += delta;
3861     }
3862 nigel 93
3863     /* This is compiling for real */
3864    
3865     else
3866 nigel 77 {
3867 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3868     for (i = 1; i < repeat_min; i++)
3869     {
3870     uschar *hc;
3871     uschar *this_hwm = cd->hwm;
3872     memcpy(code, previous, len);
3873     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3874     {
3875     PUT(cd->hwm, 0, GET(hc, 0) + len);
3876     cd->hwm += LINK_SIZE;
3877     }
3878     save_hwm = this_hwm;
3879     code += len;
3880     }
3881 nigel 77 }
3882     }
3883 nigel 93
3884 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3885     }
3886    
3887     /* This code is common to both the zero and non-zero minimum cases. If
3888     the maximum is limited, it replicates the group in a nested fashion,
3889     remembering the bracket starts on a stack. In the case of a zero minimum,
3890     the first one was set up above. In all cases the repeat_max now specifies
3891 nigel 93 the number of additional copies needed. Again, we must remember to
3892     replicate entries on the forward reference list. */
3893 nigel 77
3894     if (repeat_max >= 0)
3895     {
3896 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3897     just adjust the length as if we had. For each repetition we must add 1
3898     to the length for BRAZERO and for all but the last repetition we must
3899 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3900     paranoid checks to avoid integer overflow. */
3901 nigel 93
3902     if (lengthptr != NULL && repeat_max > 0)
3903 ph10 202 {
3904     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3905     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3906     if ((double)repeat_max *
3907     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3908     > (double)INT_MAX ||
3909     OFLOW_MAX - *lengthptr < delta)
3910     {
3911     *errorcodeptr = ERR20;
3912     goto FAILED;
3913     }
3914     *lengthptr += delta;
3915     }
3916 nigel 93
3917     /* This is compiling for real */
3918    
3919     else for (i = repeat_max - 1; i >= 0; i--)
3920 nigel 77 {
3921 nigel 93 uschar *hc;
3922     uschar *this_hwm = cd->hwm;
3923    
3924 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3925    
3926     /* All but the final copy start a new nesting, maintaining the
3927     chain of brackets outstanding. */
3928    
3929     if (i != 0)
3930     {
3931     int offset;
3932     *code++ = OP_BRA;
3933     offset = (bralink == NULL)? 0 : code - bralink;
3934     bralink = code;
3935     PUTINC(code, 0, offset);
3936     }
3937    
3938     memcpy(code, previous, len);
3939 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3940     {
3941     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3942     cd->hwm += LINK_SIZE;
3943     }
3944     save_hwm = this_hwm;
3945 nigel 77 code += len;
3946     }
3947    
3948     /* Now chain through the pending brackets, and fill in their length
3949     fields (which are holding the chain links pro tem). */
3950    
3951     while (bralink != NULL)
3952     {
3953     int oldlinkoffset;
3954     int offset = code - bralink + 1;
3955     uschar *bra = code - offset;
3956     oldlinkoffset = GET(bra, 1);
3957     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3958     *code++ = OP_KET;
3959     PUTINC(code, 0, offset);
3960     PUT(bra, 1, offset);
3961     }
3962     }
3963    
3964     /* If the maximum is unlimited, set a repeater in the final copy. We
3965     can't just offset backwards from the current code point, because we
3966     don't know if there's been an options resetting after the ket. The
3967 nigel 93 correct offset was computed above.
3968 nigel 77
3969 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3970     this group is a non-atomic one that could match an empty string. If so,
3971     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3972     that runtime checking can be done. [This check is also applied to
3973     atomic groups at runtime, but in a different way.] */
3974    
3975     else
3976     {
3977     uschar *ketcode = code - ketoffset;
3978     uschar *bracode = ketcode - GET(ketcode, 1);
3979     *ketcode = OP_KETRMAX + repeat_type;
3980     if (lengthptr == NULL && *bracode != OP_ONCE)
3981     {
3982     uschar *scode = bracode;
3983     do
3984     {
3985     if (could_be_empty_branch(scode, ketcode, utf8))
3986     {
3987     *bracode += OP_SBRA - OP_BRA;
3988     break;
3989     }
3990     scode += GET(scode, 1);
3991     }
3992     while (*scode == OP_ALT);
3993     }
3994     }
3995 nigel 77 }
3996    
3997     /* Else there's some kind of shambles */
3998    
3999     else
4000     {
4001     *errorcodeptr = ERR11;
4002     goto FAILED;
4003     }
4004    
4005 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4006     tests above succeeded, possessive_quantifier is TRUE. For some of the
4007     simpler opcodes, there is a special alternative opcode for this. For
4008     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4009     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4010     but the special opcodes can optimize it a bit. The repeated item starts at
4011     tempcode, not at previous, which might be the first part of a string whose
4012     (former) last char we repeated.
4013 nigel 77
4014 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4015     an 'upto' may follow. We skip over an 'exact' item, and then test the
4016     length of what remains before proceeding. */
4017    
4018 nigel 77 if (possessive_quantifier)
4019     {
4020 nigel 93 int len;
4021     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4022     *tempcode == OP_NOTEXACT)
4023     tempcode += _pcre_OP_lengths[*tempcode];
4024     len = code - tempcode;
4025     if (len > 0) switch (*tempcode)
4026     {
4027     case OP_STAR: *tempcode = OP_POSSTAR; break;
4028     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4029     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4030     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4031    
4032     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4033     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4034     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4035     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4036    
4037     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4038     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4039     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4040     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4041    
4042     default:
4043     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4044     code += 1 + LINK_SIZE;
4045     len += 1 + LINK_SIZE;
4046     tempcode[0] = OP_ONCE;
4047     *code++ = OP_KET;
4048     PUTINC(code, 0, len);
4049     PUT(tempcode, 1, len);
4050     break;
4051     }
4052 nigel 77 }
4053    
4054     /* In all cases we no longer have a previous item.