/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 300 - (hide annotations) (download)
Mon Jan 14 19:43:19 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 197176 byte(s)
Fix buffer overrun for class with very many characters whose codepoints are 
above 255.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a. Both arguments are fully parenthesized, so expression
arguments such as SETBIT(map, base + 1) select the intended byte and bit. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
/* Upper limit used when checking that the integer holding the compiled
pattern length has not overflowed. It is deliberately a little below INT_MAX:
the slack leaves room for the bytes that terminate a group to be added without
a separate check each time. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
/* Size of the stack workspace. It serves two purposes:

(1) During the first, pre-compile phase (which works out how much memory is
needed) the pattern is partly compiled into this space. Compiled parts are
thrown away as soon as possible, so an overrun is not expected, although the
code does check for one. The largest usage observed was 218, so the value is
very generous.

(2) During the second, real compile phase the same space remembers forward
references to groups so that they can be filled in at the end. Each entry in
that list takes LINK_SIZE bytes, so there is plenty of room even when LINK_SIZE
is 4. */

#define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* STRING() turns its argument into a string literal exactly as written;
XSTRING() macro-expands its argument first and then stringifies the result. */

#define STRING(x)   # x
#define XSTRING(x)  STRING(x)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 269 "digit expected after (?+";
306 nigel 77
307    
/* Private table that identifies decimal and hexadecimal digits while a pattern
is being compiled. The tables in chartables depend on the locale and may mark
arbitrary characters as digits, whereas the compiling code expects only 0-9,
a-f and A-F to count. The table costs 256 bytes, but a lookup is a lot faster
than character value tests (at least in some simple timed cases), and some
applications want PCRE to compile efficiently as well as match efficiently.

The bit values are the same as those used in chartables, so ctype_digit and
ctype_xdigit can be used with this table:

  0x04   decimal digit
  0x08   hexadecimal digit

Each row below holds eight entries; the comment gives the code of the first. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x00 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x08 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x10 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x18 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x20  space - ' */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x28  ( - / */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /* 0x30  0 - 7 */
  0x0c,0x0c,   0,   0,   0,   0,   0,   0,    /* 0x38  8 - ? */
     0,0x08,0x08,0x08,0x08,0x08,0x08,   0,    /* 0x40  @ - G */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x48  H - O */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x50  P - W */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x58  X - _ */
     0,0x08,0x08,0x08,0x08,0x08,0x08,   0,    /* 0x60  ` - g */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x68  h - o */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x70  p - w */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x78  x - 127 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x80 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x88 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x90 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x98 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xA0 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xA8 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xB0 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xB8 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xC0 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xC8 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xD0 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xD8 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xE0 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xE8 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xF0 */
     0,   0,   0,   0,   0,   0,   0,   0     /* 0xF8 */
  };

#else           /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x00 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x08 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x10 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x18 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x20 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x28 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x30 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x38 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x40  space - 71 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x48  72 - | */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x50  & - 87 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x58 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x60  - - 103 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x68  104 - ? */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x70 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x78  120 - " */
     0,0x08,0x08,0x08,0x08,0x08,0x08,   0,    /* 0x80  128 - g */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x88  h - 143 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x90  144 - p */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x98  q - 159 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xA0  160 - x */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xA8  y - 175 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xB0  ^ - 183 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xB8 */
     0,0x08,0x08,0x08,0x08,0x08,0x08,   0,    /* 0xC0  { - G */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xC8  H - 207 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xD0  } - P */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xD8  Q - 223 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xE0  \ - X */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0xE8  Y - 239 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,    /* 0xF0  0 - 7 */
  0x0c,0x0c,   0,   0,   0,   0,   0,   0     /* 0xF8  8 - 255 */
  };

static const unsigned char ebcdic_chartab[] =   /* chartable partial dup */
  {
  0x80,   0,   0,   0,   0,0x01,   0,   0,    /* 0x00 */
     0,   0,   0,   0,0x01,0x01,   0,   0,    /* 0x08 */
     0,   0,   0,   0,   0,0x01,   0,   0,    /* 0x10 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x18 */
     0,   0,   0,   0,   0,0x01,   0,   0,    /* 0x20 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x28 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x30 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x38 */
  0x01,   0,   0,   0,   0,   0,   0,   0,    /* 0x40  space - 71 */
     0,   0,   0,0x80,   0,0x80,0x80,0x80,    /* 0x48  72 - | */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x50  & - 87 */
     0,   0,   0,0x80,0x80,0x80,   0,   0,    /* 0x58 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x60  - - 103 */
     0,   0,   0,   0,   0,0x10,   0,0x80,    /* 0x68  104 - ? */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x70 */
     0,   0,   0,   0,   0,   0,   0,   0,    /* 0x78  120 - " */
     0,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /* 0x80  128 - g */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0x88  h - 143 */
     0,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0x90  144 - p */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0x98  q - 159 */
     0,   0,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xA0  160 - x */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0xA8  y - 175 */
  0x80,   0,   0,   0,   0,   0,   0,   0,    /* 0xB0  ^ - 183 */
     0,   0,0x80,   0,   0,   0,   0,   0,    /* 0xB8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,    /* 0xC0  { - G */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0xC8  H - 207 */
     0,0x12,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xD0  } - P */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0xD8  Q - 223 */
     0,   0,0x12,0x12,0x12,0x12,0x12,0x12,    /* 0xE0  \ - X */
  0x12,0x12,   0,   0,   0,   0,   0,   0,    /* 0xE8  Y - 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,    /* 0xF0  0 - 7 */
  0x1c,0x1c,   0,   0,   0,   0,   0,   0     /* 0xF8  8 - 255 */
  };
#endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502     in a table. A non-zero result is something that can be returned immediately.
503 nigel 77 Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
535     is an absolute backreference. If negative, it is a relative backreference.
536 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537     reference to a named group. This is part of Perl's movement towards a
538     unified syntax for back references. As this is synonymous with \k{name}, we
539 ph10 171 fudge it up by pretending it really was \k. */
540 nigel 93
541     case 'g':
542     if (ptr[1] == '{')
543     {
544 ph10 171 const uschar *p;
545     for (p = ptr+2; *p != 0 && *p != '}'; p++)
546     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 ph10 172 if (*p != 0 && *p != '}')
548 ph10 171 {
549     c = -ESC_k;
550     break;
551 ph10 172 }
552 nigel 93 braced = TRUE;
553     ptr++;
554     }
555     else braced = FALSE;
556    
557     if (ptr[1] == '-')
558     {
559     negated = TRUE;
560     ptr++;
561     }
562     else negated = FALSE;
563    
564     c = 0;
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567 ph10 220
568 ph10 213 if (c < 0)
569     {
570     *errorcodeptr = ERR61;
571     break;
572 ph10 220 }
573 nigel 93
574     if (c == 0 || (braced && *(++ptr) != '}'))
575     {
576     *errorcodeptr = ERR57;
577 ph10 213 break;
578 nigel 93 }
579    
580     if (negated)
581     {
582     if (c > bracount)
583     {
584     *errorcodeptr = ERR15;
585 ph10 213 break;
586 nigel 93 }
587     c = bracount - (c - 1);
588     }
589    
590     c = -(ESC_REF + c);
591     break;
592    
593 nigel 77 /* The handling of escape sequences consisting of a string of digits
594     starting with one that is not zero is not straightforward. By experiment,
595     the way Perl works seems to be as follows:
596    
597     Outside a character class, the digits are read as a decimal number. If the
598     number is less than 10, or if there are that many previous extracting
599     left brackets, then it is a back reference. Otherwise, up to three octal
600     digits are read to form an escaped byte. Thus \123 is likely to be octal
601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602     value is greater than 377, the least significant 8 bits are taken. Inside a
603     character class, \ followed by a digit is always an octal number. */
604    
605     case '1': case '2': case '3': case '4': case '5':
606     case '6': case '7': case '8': case '9':
607    
608     if (!isclass)
609     {
610     oldptr = ptr;
611     c -= '0';
612     while ((digitab[ptr[1]] & ctype_digit) != 0)
613     c = c * 10 + *(++ptr) - '0';
614 ph10 213 if (c < 0)
615     {
616     *errorcodeptr = ERR61;
617 ph10 220 break;
618     }
619 nigel 77 if (c < 10 || c <= bracount)
620     {
621     c = -(ESC_REF + c);
622     break;
623     }
624     ptr = oldptr; /* Put the pointer back and fall through */
625     }
626    
627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628     generates a binary zero byte and treats the digit as a following literal.
629     Thus we have to pull back the pointer by one. */
630    
631     if ((c = *ptr) >= '8')
632     {
633     ptr--;
634     c = 0;
635     break;
636     }
637    
638     /* \0 always starts an octal number, but we may drop through to here with a
639 nigel 91 larger first octal digit. The original code used just to take the least
640     significant 8 bits of octal numbers (I think this is what early Perls used
641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642     than 3 octal digits. */
643 nigel 77
644     case '0':
645     c -= '0';
646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647     c = c * 8 + *(++ptr) - '0';
648 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 nigel 77 break;
650    
651 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653     treated as a data character. */
654 nigel 77
655     case 'x':
656 nigel 87 if (ptr[1] == '{')
657 nigel 77 {
658     const uschar *pt = ptr + 2;
659 nigel 87 int count = 0;
660    
661 nigel 77 c = 0;
662     while ((digitab[*pt] & ctype_xdigit) != 0)
663     {
664 nigel 87 register int cc = *pt++;
665     if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 nigel 77 count++;
667 nigel 87
668 ph10 97 #ifndef EBCDIC /* ASCII coding */
669 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 ph10 97 #else /* EBCDIC coding */
672 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 nigel 77 #endif
675     }
676 nigel 87
677 nigel 77 if (*pt == '}')
678     {
679 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 nigel 77 ptr = pt;
681     break;
682     }
683 nigel 87
684 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
685     recognize this construct; fall through to the normal \x handling. */
686     }
687    
688 nigel 87 /* Read just a single-byte hex-defined char */
689 nigel 77
690     c = 0;
691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692     {
693     int cc; /* Some compilers don't like ++ */
694     cc = *(++ptr); /* in initializers */
695 ph10 97 #ifndef EBCDIC /* ASCII coding */
696 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 ph10 97 #else /* EBCDIC coding */
699 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
700     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701     #endif
702     }
703     break;
704    
705 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706     This coding is ASCII-specific, but then the whole concept of \cx is
707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708 nigel 77
709     case 'c':
710     c = *(++ptr);
711     if (c == 0)
712     {
713     *errorcodeptr = ERR2;
714 ph10 213 break;
715 nigel 77 }
716    
717 ph10 97 #ifndef EBCDIC /* ASCII coding */
718 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
719     c ^= 0x40;
720 ph10 97 #else /* EBCDIC coding */
721 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
722     c ^= 0xC0;
723     #endif
724     break;
725    
726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728     otherwise, for Perl compatibility, it is a literal. This code looks a bit
729     odd, but there used to be some cases other than the default, and there may
730     be again in future, so I haven't "optimized" it. */
731 nigel 77
732     default:
733     if ((options & PCRE_EXTRA) != 0) switch(c)
734     {
735     default:
736     *errorcodeptr = ERR3;
737     break;
738     }
739     break;
740     }
741     }
742    
743     *ptrptr = ptr;
744     return c;
745     }
746    
747    
748    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called after \P or \p has been seen, when PCRE is built with Unicode
property support. On entry *ptrptr points at the P or p; on exit it points at
the last character of the escape sequence that was consumed. The property name
is either a single character or a name enclosed in {}, where the name may be
preceded by ^ to request negation. The name is looked up in the _pcre_utt table
by binary chop.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from the property table, or -1 for an invalid
                   type (ERR46 for a malformed sequence, ERR47 for an unknown
                   name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form is a single character following \p or \P. */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise collect the name between the braces, noting a leading ^. The name
must fit in the buffer and must be terminated by '}' before the pattern ends. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  for (;;)
    {
    if (len >= (int)sizeof(name) - 1) goto ERROR_RETURN;   /* Too long */
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;                        /* Unterminated */
    if (ch == '}') break;
    name[len++] = ch;
    }
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of property names */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* The name is well formed but not known */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
838    
839    
840    
841    
842     /*************************************************
843     * Check for counted repeat *
844     *************************************************/
845    
846     /* This function is called when a '{' is encountered in a place where it might
847     start a quantifier. It looks ahead to see if it really is a quantifier or not.
848     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849     where the ddds are digits.
850    
851     Arguments:
852     p pointer to the first char after '{'
853    
854     Returns: TRUE or FALSE
855     */
856    
857     static BOOL
858     is_counted_repeat(const uschar *p)
859     {
860     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861     while ((digitab[*p] & ctype_digit) != 0) p++;
862     if (*p == '}') return TRUE;
863    
864     if (*p++ != ',') return FALSE;
865     if (*p == '}') return TRUE;
866    
867     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868     while ((digitab[*p] & ctype_digit) != 0) p++;
869    
870     return (*p == '}');
871     }
872    
873    
874    
875     /*************************************************
876     * Read repeat counts *
877     *************************************************/
878    
879     /* Read an item of the form {n,m} and return the values. This is called only
880     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881     so the syntax is guaranteed to be correct, but we need to check the values.
882    
883     Arguments:
884     p pointer to first char after '{'
885     minp pointer to int for min
886     maxp pointer to int for max
887     returned as -1 if no max
888     errorcodeptr points to error code variable
889    
890     Returns: pointer to '}' on success;
891     current ptr on error, with errorcodeptr set non-zero
892     */
893    
894     static const uschar *
895     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896     {
897     int min = 0;
898     int max = -1;
899    
900 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
901     an integer overflow. */
902    
903 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 nigel 81 if (min < 0 || min > 65535)
905     {
906     *errorcodeptr = ERR5;
907     return p;
908     }
909 nigel 77
910 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
911     Also, max must not be less than min. */
912    
913 nigel 77 if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 81 if (max < 0 || max > 65535)
920     {
921     *errorcodeptr = ERR5;
922     return p;
923     }
924 nigel 77 if (max < min)
925     {
926     *errorcodeptr = ERR4;
927     return p;
928     }
929     }
930     }
931    
932 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
933     '}'. */
934 nigel 77
935 nigel 81 *minp = min;
936     *maxp = max;
937 nigel 77 return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 93 * Find forward referenced subpattern *
944 nigel 91 *************************************************/
945    
946 nigel 93 /* This function scans along a pattern's text looking for capturing
947     subpatterns, and counting them. If it finds a named pattern that matches the
948     name it is given, it returns its number. Alternatively, if the name is NULL, it
949     returns when it reaches a given numbered subpattern. This is used for forward
950     references to subpatterns. We know that if (?P< is encountered, the name will
951     be terminated by '>' because that is checked in the first pass.
952 nigel 91
953     Arguments:
954 nigel 93 ptr current position in the pattern
955     count current count of capturing parens so far encountered
956     name name to seek, or NULL if seeking a numbered subpattern
957     lorn name length, or subpattern number if name is NULL
958     xmode TRUE if we are in /x mode
959 nigel 91
960     Returns: the number of the named subpattern, or -1 if not found
961     */
962    
963     static int
964 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965     BOOL xmode)
966 nigel 91 {
967     const uschar *thisname;
968 nigel 93
969 nigel 91 for (; *ptr != 0; ptr++)
970     {
971 nigel 93 int term;
972    
973     /* Skip over backslashed characters and also entire \Q...\E */
974    
975     if (*ptr == '\\')
976     {
977     if (*(++ptr) == 0) return -1;
978     if (*ptr == 'Q') for (;;)
979     {
980     while (*(++ptr) != 0 && *ptr != '\\');
981     if (*ptr == 0) return -1;
982     if (*(++ptr) == 'E') break;
983     }
984     continue;
985     }
986    
987     /* Skip over character classes */
988    
989     if (*ptr == '[')
990     {
991     while (*(++ptr) != ']')
992     {
993 ph10 220 if (*ptr == 0) return -1;
994 nigel 93 if (*ptr == '\\')
995     {
996     if (*(++ptr) == 0) return -1;
997     if (*ptr == 'Q') for (;;)
998     {
999     while (*(++ptr) != 0 && *ptr != '\\');
1000     if (*ptr == 0) return -1;
1001     if (*(++ptr) == 'E') break;
1002     }
1003     continue;
1004     }
1005     }
1006     continue;
1007     }
1008    
1009     /* Skip comments in /x mode */
1010    
1011     if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017    
1018     /* An opening parens must now be a real metacharacter */
1019    
1020 nigel 91 if (*ptr != '(') continue;
1021 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1022 nigel 93 {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027    
1028     ptr += 2;
1029     if (*ptr == 'P') ptr++; /* Allow optional P */
1030    
1031     /* We have to disambiguate (?<! and (?<= from (?<name> */
1032    
1033     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034     *ptr != '\'')
1035     continue;
1036    
1037 nigel 91 count++;
1038 nigel 93
1039     if (name == NULL && count == lorn) return count;
1040     term = *ptr++;
1041     if (term == '<') term = '>';
1042 nigel 91 thisname = ptr;
1043 nigel 93 while (*ptr != term) ptr++;
1044     if (name != NULL && lorn == ptr - thisname &&
1045     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 nigel 91 return count;
1047     }
1048 nigel 93
1049 nigel 91 return -1;
1050     }
1051    
1052    
1053    
1054     /*************************************************
1055 nigel 77 * Find first significant op code *
1056     *************************************************/
1057    
1058     /* This is called by several functions that scan a compiled expression looking
1059     for a fixed first character, or an anchoring op code etc. It skips over things
1060     that do not influence this. For some calls, a change of option is important.
1061     For some calls, it makes sense to skip negative forward and all backward
1062     assertions, and also the \b assertion; for others it does not.
1063    
1064     Arguments:
1065     code pointer to the start of the group
1066     options pointer to external options
1067     optbit the option bit whose changing is significant, or
1068     zero if none are
1069     skipassert TRUE if certain assertions are to be skipped
1070    
1071     Returns: pointer to the first significant opcode
1072     */
1073    
1074     static const uschar*
1075     first_significant_code(const uschar *code, int *options, int optbit,
1076     BOOL skipassert)
1077     {
1078     for (;;)
1079     {
1080     switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084     *options = (int)code[1];
1085     code += 2;
1086     break;
1087    
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095    
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100    
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103 nigel 93 case OP_RREF:
1104     case OP_DEF:
1105 nigel 77 code += _pcre_OP_lengths[*code];
1106     break;
1107    
1108     default:
1109     return code;
1110     }
1111     }
1112     /* Control never reaches here */
1113     }
1114    
1115    
1116    
1117    
1118     /*************************************************
1119     * Find the fixed length of a pattern *
1120     *************************************************/
1121    
1122     /* Scan a pattern and compute the fixed length of subject that will match it,
1123     if the length is fixed. This is needed for dealing with backward assertions.
1124     In UTF8 mode, the result is in characters rather than bytes.
1125    
1126     Arguments:
1127     code points to the start of the pattern (the bracket)
1128     options the compiling options
1129    
1130     Returns: the fixed length, or -1 if there is no fixed length,
1131     or -2 if \C was encountered
1132     */
1133    
1134     static int
1135     find_fixedlength(uschar *code, int options)
1136     {
1137     int length = -1;
1138    
1139     register int branchlength = 0;
1140     register uschar *cc = code + 1 + LINK_SIZE;
1141    
1142     /* Scan along the opcodes for this branch. If we get to the end of the
1143     branch, check the length against that of the other branches. */
1144    
1145     for (;;)
1146     {
1147     int d;
1148     register int op = *cc;
1149     switch (op)
1150     {
1151 nigel 93 case OP_CBRA:
1152 nigel 77 case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 nigel 77 if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161    
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165    
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172     else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177    
1178     /* Skip over assertive subpatterns */
1179    
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186    
1187     /* Skip over things that don't match chars */
1188    
1189     case OP_REVERSE:
1190     case OP_CREF:
1191 nigel 93 case OP_RREF:
1192     case OP_DEF:
1193 nigel 77 case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205    
1206     /* Handle literal characters */
1207    
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210 nigel 91 case OP_NOT:
1211 nigel 77 branchlength++;
1212     cc += 2;
1213     #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215     {
1216     while ((*cc & 0xc0) == 0x80) cc++;
1217     }
1218     #endif
1219     break;
1220    
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode. */
1223    
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227     #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229     {
1230     while((*cc & 0x80) == 0x80) cc++;
1231     }
1232     #endif
1233     break;
1234    
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 nigel 77 cc += 4;
1239     break;
1240    
1241     /* Handle single-char matchers */
1242    
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245 nigel 87 cc += 2;
1246 nigel 77 /* Fall through */
1247    
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258    
1259     /* The single-byte matcher isn't allowed */
1260    
1261     case OP_ANYBYTE:
1262     return -2;
1263    
1264     /* Check a class for variable quantification */
1265    
1266     #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270     #endif
1271    
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275    
1276     switch (*cc)
1277     {
1278     case OP_CRSTAR:
1279     case OP_CRMINSTAR:
1280     case OP_CRQUERY:
1281     case OP_CRMINQUERY:
1282     return -1;
1283    
1284     case OP_CRRANGE:
1285     case OP_CRMINRANGE:
1286     if (GET2(cc,1) != GET2(cc,3)) return -1;
1287     branchlength += GET2(cc,1);
1288     cc += 5;
1289     break;
1290    
1291     default:
1292     branchlength++;
1293     }
1294     break;
1295    
1296     /* Anything else is variable length */
1297    
1298     default:
1299     return -1;
1300     }
1301     }
1302     /* Control never gets here */
1303     }
1304    
1305    
1306    
1307    
1308     /*************************************************
1309     * Scan compiled regex for numbered bracket *
1310     *************************************************/
1311    
1312     /* This little function scans through a compiled pattern until it finds a
1313     capturing bracket with the given number.
1314    
1315     Arguments:
1316     code points to start of expression
1317     utf8 TRUE in UTF-8 mode
1318     number the required bracket number
1319    
1320     Returns: pointer to the opcode for the bracket, or NULL if not found
1321     */
1322    
1323     static const uschar *
1324     find_bracket(const uschar *code, BOOL utf8, int number)
1325     {
1326     for (;;)
1327     {
1328     register int c = *code;
1329     if (c == OP_END) return NULL;
1330 nigel 91
1331     /* XCLASS is used for classes that cannot be represented just by a bit
1332     map. This includes negated single high-valued characters. The length in
1333     the table is zero; the actual length is stored in the compiled code. */
1334    
1335     if (c == OP_XCLASS) code += GET(code, 1);
1336    
1337 nigel 93 /* Handle capturing bracket */
1338 nigel 91
1339 nigel 93 else if (c == OP_CBRA)
1340 nigel 77 {
1341 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1342 nigel 77 if (n == number) return (uschar *)code;
1343 nigel 93 code += _pcre_OP_lengths[c];
1344 nigel 77 }
1345 nigel 91
1346 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1347     repeated character types, we have to test for \p and \P, which have an extra
1348 ph10 218 two bytes of parameters. */
1349 nigel 91
1350 nigel 77 else
1351     {
1352 ph10 218 switch(c)
1353     {
1354     case OP_TYPESTAR:
1355     case OP_TYPEMINSTAR:
1356     case OP_TYPEPLUS:
1357     case OP_TYPEMINPLUS:
1358     case OP_TYPEQUERY:
1359     case OP_TYPEMINQUERY:
1360     case OP_TYPEPOSSTAR:
1361     case OP_TYPEPOSPLUS:
1362     case OP_TYPEPOSQUERY:
1363     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 ph10 220 break;
1365 ph10 221
1366     case OP_TYPEUPTO:
1367     case OP_TYPEMINUPTO:
1368     case OP_TYPEEXACT:
1369     case OP_TYPEPOSUPTO:
1370     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371     break;
1372 ph10 220 }
1373    
1374 ph10 218 /* Add in the fixed length from the table */
1375 ph10 220
1376 nigel 77 code += _pcre_OP_lengths[c];
1377 ph10 220
1378 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379     a multi-byte character. The length in the table is a minimum, so we have to
1380     arrange to skip the extra bytes. */
1381 ph10 220
1382 ph10 107 #ifdef SUPPORT_UTF8
1383 nigel 77 if (utf8) switch(c)
1384     {
1385     case OP_CHAR:
1386     case OP_CHARNC:
1387     case OP_EXACT:
1388     case OP_UPTO:
1389     case OP_MINUPTO:
1390 nigel 93 case OP_POSUPTO:
1391 nigel 77 case OP_STAR:
1392     case OP_MINSTAR:
1393 nigel 93 case OP_POSSTAR:
1394 nigel 77 case OP_PLUS:
1395     case OP_MINPLUS:
1396 nigel 93 case OP_POSPLUS:
1397 nigel 77 case OP_QUERY:
1398     case OP_MINQUERY:
1399 nigel 93 case OP_POSQUERY:
1400     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 nigel 77 break;
1402     }
1403 ph10 111 #endif
1404 nigel 77 }
1405     }
1406     }
1407    
1408    
1409    
1410     /*************************************************
1411     * Scan compiled regex for recursion reference *
1412     *************************************************/
1413    
1414     /* This little function scans through a compiled pattern until it finds an
1415     instance of OP_RECURSE.
1416    
1417     Arguments:
1418     code points to start of expression
1419     utf8 TRUE in UTF-8 mode
1420    
1421     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_recurse(const uschar *code, BOOL utf8)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91 if (c == OP_RECURSE) return code;
1432 ph10 220
1433 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1434     map. This includes negated single high-valued characters. The length in
1435     the table is zero; the actual length is stored in the compiled code. */
1436    
1437     if (c == OP_XCLASS) code += GET(code, 1);
1438    
1439 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1440     repeated character types, we have to test for \p and \P, which have an extra
1441 ph10 218 two bytes of parameters. */
1442 nigel 91
1443 nigel 77 else
1444     {
1445 ph10 218 switch(c)
1446     {
1447     case OP_TYPESTAR:
1448     case OP_TYPEMINSTAR:
1449     case OP_TYPEPLUS:
1450     case OP_TYPEMINPLUS:
1451     case OP_TYPEQUERY:
1452     case OP_TYPEMINQUERY:
1453     case OP_TYPEPOSSTAR:
1454     case OP_TYPEPOSPLUS:
1455     case OP_TYPEPOSQUERY:
1456     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 ph10 220 break;
1458 ph10 221
1459     case OP_TYPEPOSUPTO:
1460     case OP_TYPEUPTO:
1461     case OP_TYPEMINUPTO:
1462     case OP_TYPEEXACT:
1463     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464     break;
1465 ph10 220 }
1466    
1467 ph10 218 /* Add in the fixed length from the table */
1468    
1469 nigel 77 code += _pcre_OP_lengths[c];
1470 ph10 220
1471 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474 ph10 220
1475 ph10 107 #ifdef SUPPORT_UTF8
1476 nigel 77 if (utf8) switch(c)
1477     {
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_EXACT:
1481     case OP_UPTO:
1482     case OP_MINUPTO:
1483 nigel 93 case OP_POSUPTO:
1484 nigel 77 case OP_STAR:
1485     case OP_MINSTAR:
1486 nigel 93 case OP_POSSTAR:
1487 nigel 77 case OP_PLUS:
1488     case OP_MINPLUS:
1489 nigel 93 case OP_POSPLUS:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 nigel 77 break;
1495     }
1496 ph10 111 #endif
1497 nigel 77 }
1498     }
1499     }
1500    
1501    
1502    
1503     /*************************************************
1504     * Scan compiled branch for non-emptiness *
1505     *************************************************/
1506    
1507     /* This function scans through a branch of a compiled pattern to see whether it
1508 nigel 93 can match the empty string or not. It is called from could_be_empty()
1509     below and from compile_branch() when checking for an unlimited repeat of a
1510     group that can match nothing. Note that first_significant_code() skips over
1511 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1512     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513     bracket whose current branch will already have been scanned.
1514 nigel 77
1515     Arguments:
1516     code points to start of search
1517     endcode points to where to stop
1518     utf8 TRUE if in UTF8 mode
1519    
1520     Returns: TRUE if what is matched could be empty
1521     */
1522    
1523     static BOOL
1524     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525     {
1526     register int c;
1527 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 nigel 77 code < endcode;
1529     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530     {
1531     const uschar *ccode;
1532    
1533     c = *code;
1534 ph10 286
1535     /* Skip over forward assertions; the other assertions are skipped by
1536 ph10 282 first_significant_code() with a TRUE final argument. */
1537 ph10 286
1538 ph10 282 if (c == OP_ASSERT)
1539 ph10 286 {
1540 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1541     c = *code;
1542     continue;
1543 ph10 286 }
1544 ph10 172
1545 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1546 nigel 77
1547 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548     {
1549 ph10 172 code += _pcre_OP_lengths[c];
1550 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1551     c = *code;
1552     continue;
1553     }
1554    
1555     /* For other groups, scan the branches. */
1556 ph10 172
1557 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 nigel 77 {
1559     BOOL empty_branch;
1560     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561    
1562     /* Scan a closed bracket */
1563    
1564     empty_branch = FALSE;
1565     do
1566     {
1567     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568     empty_branch = TRUE;
1569     code += GET(code, 1);
1570     }
1571     while (*code == OP_ALT);
1572     if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 ph10 172 c = *code;
1574 nigel 93 continue;
1575 nigel 77 }
1576    
1577 nigel 93 /* Handle the other opcodes */
1578    
1579     switch (c)
1580 nigel 77 {
1581 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1582     cannot be represented just by a bit map. This includes negated single
1583     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 ph10 220 actual length is stored in the compiled code, so we must update "code"
1585 ph10 216 here. */
1586 nigel 77
1587     #ifdef SUPPORT_UTF8
1588     case OP_XCLASS:
1589 ph10 216 ccode = code += GET(code, 1);
1590 nigel 77 goto CHECK_CLASS_REPEAT;
1591     #endif
1592    
1593     case OP_CLASS:
1594     case OP_NCLASS:
1595     ccode = code + 33;
1596    
1597     #ifdef SUPPORT_UTF8
1598     CHECK_CLASS_REPEAT:
1599     #endif
1600    
1601     switch (*ccode)
1602     {
1603     case OP_CRSTAR: /* These could be empty; continue */
1604     case OP_CRMINSTAR:
1605     case OP_CRQUERY:
1606     case OP_CRMINQUERY:
1607     break;
1608    
1609     default: /* Non-repeat => class must match */
1610     case OP_CRPLUS: /* These repeats aren't empty */
1611     case OP_CRMINPLUS:
1612     return FALSE;
1613    
1614     case OP_CRRANGE:
1615     case OP_CRMINRANGE:
1616     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617     break;
1618     }
1619     break;
1620    
1621     /* Opcodes that must match a character */
1622    
1623     case OP_PROP:
1624     case OP_NOTPROP:
1625     case OP_EXTUNI:
1626     case OP_NOT_DIGIT:
1627     case OP_DIGIT:
1628     case OP_NOT_WHITESPACE:
1629     case OP_WHITESPACE:
1630     case OP_NOT_WORDCHAR:
1631     case OP_WORDCHAR:
1632     case OP_ANY:
1633     case OP_ANYBYTE:
1634     case OP_CHAR:
1635     case OP_CHARNC:
1636     case OP_NOT:
1637     case OP_PLUS:
1638     case OP_MINPLUS:
1639 nigel 93 case OP_POSPLUS:
1640 nigel 77 case OP_EXACT:
1641     case OP_NOTPLUS:
1642     case OP_NOTMINPLUS:
1643 nigel 93 case OP_NOTPOSPLUS:
1644 nigel 77 case OP_NOTEXACT:
1645     case OP_TYPEPLUS:
1646     case OP_TYPEMINPLUS:
1647 nigel 93 case OP_TYPEPOSPLUS:
1648 nigel 77 case OP_TYPEEXACT:
1649     return FALSE;
1650 ph10 227
1651     /* These are going to continue, as they may be empty, but we have to
1652     fudge the length for the \p and \P cases. */
1653    
1654 ph10 224 case OP_TYPESTAR:
1655     case OP_TYPEMINSTAR:
1656     case OP_TYPEPOSSTAR:
1657     case OP_TYPEQUERY:
1658     case OP_TYPEMINQUERY:
1659     case OP_TYPEPOSQUERY:
1660     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 ph10 227 break;
1662    
1663 ph10 224 /* Same for these */
1664 ph10 227
1665 ph10 224 case OP_TYPEUPTO:
1666     case OP_TYPEMINUPTO:
1667     case OP_TYPEPOSUPTO:
1668     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669     break;
1670 nigel 77
1671     /* End of branch */
1672    
1673     case OP_KET:
1674     case OP_KETRMAX:
1675     case OP_KETRMIN:
1676     case OP_ALT:
1677     return TRUE;
1678    
1679 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680     MINUPTO, and POSUPTO may be followed by a multibyte character */
1681 nigel 77
1682     #ifdef SUPPORT_UTF8
1683     case OP_STAR:
1684     case OP_MINSTAR:
1685 nigel 93 case OP_POSSTAR:
1686 nigel 77 case OP_QUERY:
1687     case OP_MINQUERY:
1688 nigel 93 case OP_POSQUERY:
1689 nigel 77 case OP_UPTO:
1690     case OP_MINUPTO:
1691 nigel 93 case OP_POSUPTO:
1692 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693     break;
1694     #endif
1695     }
1696     }
1697    
1698     return TRUE;
1699     }
1700    
1701    
1702    
1703     /*************************************************
1704     * Scan compiled regex for non-emptiness *
1705     *************************************************/
1706    
1707     /* This function is called to check for left recursive calls. We want to check
1708     the current branch of the current pattern to see if it could match the empty
1709     string. If it could, we must look outwards for branches at other levels,
1710     stopping when we pass beyond the bracket which is the subject of the recursion.
1711    
1712     Arguments:
1713     code points to start of the recursion
1714     endcode points to where to stop (current RECURSE item)
1715     bcptr points to the chain of current (unclosed) branch starts
1716     utf8 TRUE if in UTF-8 mode
1717    
1718     Returns: TRUE if what is matched could be empty
1719     */
1720    
1721     static BOOL
1722     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723     BOOL utf8)
1724     {
1725     while (bcptr != NULL && bcptr->current >= code)
1726     {
1727     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728     bcptr = bcptr->outer;
1729     }
1730     return TRUE;
1731     }
1732    
1733    
1734    
1735     /*************************************************
1736     * Check for POSIX class syntax *
1737     *************************************************/
1738    
1739     /* This function is called when the sequence "[:" or "[." or "[=" is
1740 ph10 295 encountered in a character class. It checks whether this is followed by a
1741 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1743 nigel 77
1744 ph10 298 Originally, this function only recognized a sequence of letters between the
1745     terminators, but it seems that Perl recognizes any sequence of characters,
1746     though of course unknown POSIX names are subsequently rejected. Perl gives an
1747     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748     didn't consider this to be a POSIX class. Likewise for [:1234:].
1749 ph10 295
1750 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1751     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753     below handles the special case of \], but does not try to do any other escape
1754     processing. This makes it different from Perl for cases such as [:l\ower:]
1755 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1757 ph10 295 I think.
1758    
1759     Arguments:
1760 nigel 77 ptr pointer to the initial [
1761     endptr where to return the end pointer
1762    
1763     Returns: TRUE or FALSE
1764     */
1765    
1766     static BOOL
1767 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768 nigel 77 {
1769     int terminator; /* Don't combine these lines; the Solaris cc */
1770     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1771 ph10 295 for (++ptr; *ptr != 0; ptr++)
1772 nigel 77 {
1773 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774 ph10 298 {
1775     if (*ptr == ']') return FALSE;
1776 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1777     {
1778     *endptr = ptr;
1779     return TRUE;
1780 ph10 298 }
1781     }
1782     }
1783 nigel 77 return FALSE;
1784     }
1785    
1786    
1787    
1788    
1789     /*************************************************
1790     * Check POSIX class name *
1791     *************************************************/
1792    
1793     /* This function is called to check the name given in a POSIX-style class entry
1794     such as [:alnum:].
1795    
1796     Arguments:
1797     ptr points to the first letter
1798     len the length of the name
1799    
1800     Returns: a value representing the name, or -1 if unknown
1801     */
1802    
1803     static int
1804     check_posix_name(const uschar *ptr, int len)
1805     {
1806 ph10 240 const char *pn = posix_names;
1807 nigel 77 register int yield = 0;
1808     while (posix_name_lengths[yield] != 0)
1809     {
1810     if (len == posix_name_lengths[yield] &&
1811 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1812 ph10 243 pn += posix_name_lengths[yield] + 1;
1813 nigel 77 yield++;
1814     }
1815     return -1;
1816     }
1817    
1818    
1819     /*************************************************
1820     * Adjust OP_RECURSE items in repeated group *
1821     *************************************************/
1822    
1823     /* OP_RECURSE items contain an offset from the start of the regex to the group
1824     that is referenced. This means that groups can be replicated for fixed
1825     repetition simply by copying (because the recursion is allowed to refer to
1826     earlier groups that are outside the current group). However, when a group is
1827     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828     it, after it has been compiled. This means that any OP_RECURSE items within it
1829     that refer to the group itself or any contained groups have to have their
1830 nigel 93 offsets adjusted. That is one of the jobs of this function. Before it is called,
1831     the partially compiled regex must be temporarily terminated with OP_END.
1832 nigel 77
1833 nigel 93 This function has been extended with the possibility of forward references for
1834     recursions and subroutine calls. It must also check the list of such references
1835     for the group we are dealing with. If it finds that one of the recursions in
1836     the current group is on this list, it adjusts the offset in the list, not the
1837     value in the reference (which is a group number).
1838    
1839 nigel 77 Arguments:
1840     group points to the start of the group
1841     adjust the amount by which the group is to be moved
1842     utf8 TRUE in UTF-8 mode
1843     cd contains pointers to tables etc.
1844 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1845 nigel 77
1846     Returns: nothing
1847     */
1848    
1849     static void
1850 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851     uschar *save_hwm)
1852 nigel 77 {
1853     uschar *ptr = group;
1854 ph10 224
1855 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856     {
1857 nigel 93 int offset;
1858     uschar *hc;
1859    
1860     /* See if this recursion is on the forward reference list. If so, adjust the
1861     reference. */
1862    
1863     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864     {
1865     offset = GET(hc, 0);
1866     if (cd->start_code + offset == ptr + 1)
1867     {
1868     PUT(hc, 0, offset + adjust);
1869     break;
1870     }
1871     }
1872    
1873     /* Otherwise, adjust the recursion offset if it's after the start of this
1874     group. */
1875    
1876     if (hc >= cd->hwm)
1877     {
1878     offset = GET(ptr, 1);
1879     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880     }
1881    
1882 nigel 77 ptr += 1 + LINK_SIZE;
1883     }
1884     }
1885    
1886    
1887    
1888     /*************************************************
1889     * Insert an automatic callout point *
1890     *************************************************/
1891    
1892     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893     callout points before each pattern item.
1894    
1895     Arguments:
1896     code current code pointer
1897     ptr current pattern pointer
1898     cd pointers to tables etc
1899    
1900     Returns: new code pointer
1901     */
1902    
1903     static uschar *
1904     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905     {
1906     *code++ = OP_CALLOUT;
1907     *code++ = 255;
1908     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1909     PUT(code, LINK_SIZE, 0); /* Default length */
1910     return code + 2*LINK_SIZE;
1911     }
1912    
1913    
1914    
1915     /*************************************************
1916     * Complete a callout item *
1917     *************************************************/
1918    
1919     /* A callout item contains the length of the next item in the pattern, which
1920     we can't fill in till after we have reached the relevant point. This is used
1921     for both automatic and manual callouts.
1922    
1923     Arguments:
1924     previous_callout points to previous callout item
1925     ptr current pattern pointer
1926     cd pointers to tables etc
1927    
1928     Returns: nothing
1929     */
1930    
1931     static void
1932     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933     {
1934     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935     PUT(previous_callout, 2 + LINK_SIZE, length);
1936     }
1937    
1938    
1939    
1940     #ifdef SUPPORT_UCP
1941     /*************************************************
1942     * Get othercase range *
1943     *************************************************/
1944    
1945     /* This function is passed the start and end of a class range, in UTF-8 mode
1946     with UCP support. It searches up the characters, looking for internal ranges of
1947     characters in the "other" case. Each call returns the next one, updating the
1948     start address.
1949    
1950     Arguments:
1951     cptr points to starting character value; updated
1952     d end value
1953     ocptr where to put start of othercase range
1954     odptr where to put end of othercase range
1955    
1956     Yield: TRUE when range returned; FALSE when no more
1957     */
1958    
1959     static BOOL
1960 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1961     unsigned int *odptr)
1962 nigel 77 {
1963 nigel 93 unsigned int c, othercase, next;
1964 nigel 77
1965     for (c = *cptr; c <= d; c++)
1966 nigel 93 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1967 nigel 77
1968     if (c > d) return FALSE;
1969    
1970     *ocptr = othercase;
1971     next = othercase + 1;
1972    
1973     for (++c; c <= d; c++)
1974     {
1975 nigel 87 if (_pcre_ucp_othercase(c) != next) break;
1976 nigel 77 next++;
1977     }
1978    
1979     *odptr = next - 1;
1980     *cptr = c;
1981    
1982     return TRUE;
1983     }
1984     #endif /* SUPPORT_UCP */
1985    
1986    
1987 nigel 93
1988 nigel 77 /*************************************************
1989 nigel 93 * Check if auto-possessifying is possible *
1990     *************************************************/
1991    
1992     /* This function is called for unlimited repeats of certain items, to see
1993     whether the next thing could possibly match the repeated item. If not, it makes
1994     sense to automatically possessify the repeated item.
1995    
1996     Arguments:
1997     op_code the repeated op code
1998     item data for this item, depends on the opcode
1999     utf8 TRUE in UTF-8 mode
2000     utf8_char used for utf8 character bytes, NULL if not relevant
2001     ptr next character in pattern
2002     options options bits
2003     cd contains pointers to tables etc.
2004    
2005     Returns: TRUE if possessifying is wanted
2006     */
2007    
2008     static BOOL
2009     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010     const uschar *ptr, int options, compile_data *cd)
2011     {
2012     int next;
2013    
2014     /* Skip whitespace and comments in extended mode */
2015    
2016     if ((options & PCRE_EXTENDED) != 0)
2017     {
2018     for (;;)
2019     {
2020     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021     if (*ptr == '#')
2022     {
2023     while (*(++ptr) != 0)
2024     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025     }
2026     else break;
2027     }
2028     }
2029    
2030     /* If the next item is one that we can handle, get its value. A non-negative
2031     value is a character, a negative value is an escape value. */
2032    
2033     if (*ptr == '\\')
2034     {
2035     int temperrorcode = 0;
2036     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037     if (temperrorcode != 0) return FALSE;
2038     ptr++; /* Point after the escape sequence */
2039     }
2040    
2041     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042     {
2043     #ifdef SUPPORT_UTF8
2044     if (utf8) { GETCHARINC(next, ptr); } else
2045     #endif
2046     next = *ptr++;
2047     }
2048    
2049     else return FALSE;
2050    
2051     /* Skip whitespace and comments in extended mode */
2052    
2053     if ((options & PCRE_EXTENDED) != 0)
2054     {
2055     for (;;)
2056     {
2057     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058     if (*ptr == '#')
2059     {
2060     while (*(++ptr) != 0)
2061     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062     }
2063     else break;
2064     }
2065     }
2066    
2067     /* If the next thing is itself optional, we have to give up. */
2068    
2069     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070     return FALSE;
2071    
2072     /* Now compare the next item with the previous opcode. If the previous is a
2073     positive single character match, "item" either contains the character or, if
2074     "item" is greater than 127 in utf8 mode, the character's bytes are in
2075     utf8_char. */
2076    
2077    
2078     /* Handle cases when the next item is a character. */
2079    
2080     if (next >= 0) switch(op_code)
2081     {
2082     case OP_CHAR:
2083     #ifdef SUPPORT_UTF8
2084     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085     #endif
2086     return item != next;
2087    
2088     /* For CHARNC (caseless character) we must check the other case. If we have
2089     Unicode property support, we can use it to test the other case of
2090     high-valued characters. */
2091    
2092     case OP_CHARNC:
2093     #ifdef SUPPORT_UTF8
2094     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095     #endif
2096     if (item == next) return FALSE;
2097     #ifdef SUPPORT_UTF8
2098     if (utf8)
2099     {
2100     unsigned int othercase;
2101     if (next < 128) othercase = cd->fcc[next]; else
2102     #ifdef SUPPORT_UCP
2103     othercase = _pcre_ucp_othercase((unsigned int)next);
2104     #else
2105     othercase = NOTACHAR;
2106     #endif
2107     return (unsigned int)item != othercase;
2108     }
2109     else
2110     #endif /* SUPPORT_UTF8 */
2111     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2112    
2113     /* For OP_NOT, "item" must be a single-byte character. */
2114    
2115     case OP_NOT:
2116     if (next < 0) return FALSE; /* Not a character */
2117     if (item == next) return TRUE;
2118     if ((options & PCRE_CASELESS) == 0) return FALSE;
2119     #ifdef SUPPORT_UTF8
2120     if (utf8)
2121     {
2122     unsigned int othercase;
2123     if (next < 128) othercase = cd->fcc[next]; else
2124     #ifdef SUPPORT_UCP
2125     othercase = _pcre_ucp_othercase(next);
2126     #else
2127     othercase = NOTACHAR;
2128     #endif
2129     return (unsigned int)item == othercase;
2130     }
2131     else
2132     #endif /* SUPPORT_UTF8 */
2133     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2134    
2135     case OP_DIGIT:
2136     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2137    
2138     case OP_NOT_DIGIT:
2139     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2140    
2141     case OP_WHITESPACE:
2142     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2143    
2144     case OP_NOT_WHITESPACE:
2145     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2146    
2147     case OP_WORDCHAR:
2148     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2149    
2150     case OP_NOT_WORDCHAR:
2151     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2152    
2153 ph10 180 case OP_HSPACE:
2154     case OP_NOT_HSPACE:
2155     switch(next)
2156     {
2157     case 0x09:
2158     case 0x20:
2159     case 0xa0:
2160     case 0x1680:
2161     case 0x180e:
2162     case 0x2000:
2163     case 0x2001:
2164     case 0x2002:
2165     case 0x2003:
2166     case 0x2004:
2167     case 0x2005:
2168     case 0x2006:
2169     case 0x2007:
2170     case 0x2008:
2171     case 0x2009:
2172     case 0x200A:
2173     case 0x202f:
2174     case 0x205f:
2175     case 0x3000:
2176     return op_code != OP_HSPACE;
2177     default:
2178     return op_code == OP_HSPACE;
2179     }
2180    
2181     case OP_VSPACE:
2182     case OP_NOT_VSPACE:
2183     switch(next)
2184     {
2185     case 0x0a:
2186     case 0x0b:
2187     case 0x0c:
2188     case 0x0d:
2189     case 0x85:
2190     case 0x2028:
2191     case 0x2029:
2192     return op_code != OP_VSPACE;
2193     default:
2194     return op_code == OP_VSPACE;
2195     }
2196    
2197 nigel 93 default:
2198     return FALSE;
2199     }
2200    
2201    
2202     /* Handle the case when the next item is \d, \s, etc. */
2203    
2204     switch(op_code)
2205     {
2206     case OP_CHAR:
2207     case OP_CHARNC:
2208     #ifdef SUPPORT_UTF8
2209     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2210     #endif
2211     switch(-next)
2212     {
2213     case ESC_d:
2214     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2215    
2216     case ESC_D:
2217     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2218    
2219     case ESC_s:
2220     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2221    
2222     case ESC_S:
2223     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2224    
2225     case ESC_w:
2226     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2227    
2228     case ESC_W:
2229     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2230 ph10 182
2231 ph10 180 case ESC_h:
2232     case ESC_H:
2233     switch(item)
2234     {
2235     case 0x09:
2236     case 0x20:
2237     case 0xa0:
2238     case 0x1680:
2239     case 0x180e:
2240     case 0x2000:
2241     case 0x2001:
2242     case 0x2002:
2243     case 0x2003:
2244     case 0x2004:
2245     case 0x2005:
2246     case 0x2006:
2247     case 0x2007:
2248     case 0x2008:
2249     case 0x2009:
2250     case 0x200A:
2251     case 0x202f:
2252     case 0x205f:
2253     case 0x3000:
2254     return -next != ESC_h;
2255     default:
2256     return -next == ESC_h;
2257 ph10 182 }
2258    
2259 ph10 180 case ESC_v:
2260     case ESC_V:
2261     switch(item)
2262     {
2263     case 0x0a:
2264     case 0x0b:
2265     case 0x0c:
2266     case 0x0d:
2267     case 0x85:
2268     case 0x2028:
2269     case 0x2029:
2270     return -next != ESC_v;
2271     default:
2272     return -next == ESC_v;
2273 ph10 182 }
2274 nigel 93
2275     default:
2276     return FALSE;
2277     }
2278    
2279     case OP_DIGIT:
2280 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2281     next == -ESC_h || next == -ESC_v;
2282 nigel 93
2283     case OP_NOT_DIGIT:
2284     return next == -ESC_d;
2285    
2286     case OP_WHITESPACE:
2287     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2288    
2289     case OP_NOT_WHITESPACE:
2290 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2291 nigel 93
2292 ph10 180 case OP_HSPACE:
2293     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2294    
2295     case OP_NOT_HSPACE:
2296     return next == -ESC_h;
2297 ph10 182
2298 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2299 ph10 182 case OP_VSPACE:
2300 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2301    
2302     case OP_NOT_VSPACE:
2303 ph10 182 return next == -ESC_v;
2304 ph10 180
2305 nigel 93 case OP_WORDCHAR:
2306 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2307 nigel 93
2308     case OP_NOT_WORDCHAR:
2309     return next == -ESC_w || next == -ESC_d;
2310 ph10 182
2311 nigel 93 default:
2312     return FALSE;
2313     }
2314    
2315     /* Control does not reach here */
2316     }
2317    
2318    
2319    
2320     /*************************************************
2321 nigel 77 * Compile one branch *
2322     *************************************************/
2323    
2324 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2325 nigel 77 changed during the branch, the pointer is used to change the external options
2326 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2327     to find out the amount of memory needed, as well as during the real compile
2328     phase. The value of lengthptr distinguishes the two phases.
2329 nigel 77
2330     Arguments:
2331     optionsptr pointer to the option bits
2332     codeptr points to the pointer to the current code point
2333     ptrptr points to the current pattern pointer
2334     errorcodeptr points to error code variable
2335     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2336     reqbyteptr set to the last literal character required, else < 0
2337     bcptr points to current branch chain
2338     cd contains pointers to tables etc.
2339 nigel 93 lengthptr NULL during the real compile phase
2340     points to length accumulator during pre-compile phase
2341 nigel 77
2342     Returns: TRUE on success
2343     FALSE, with *errorcodeptr set non-zero on error
2344     */
2345    
2346     static BOOL
2347 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2348     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2349     compile_data *cd, int *lengthptr)
2350 nigel 77 {
2351     int repeat_type, op_type;
2352     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2353     int bravalue = 0;
2354     int greedy_default, greedy_non_default;
2355     int firstbyte, reqbyte;
2356     int zeroreqbyte, zerofirstbyte;
2357     int req_caseopt, reqvary, tempreqvary;
2358     int options = *optionsptr;
2359     int after_manual_callout = 0;
2360 nigel 93 int length_prevgroup = 0;
2361 nigel 77 register int c;
2362     register uschar *code = *codeptr;
2363 nigel 93 uschar *last_code = code;
2364     uschar *orig_code = code;
2365 nigel 77 uschar *tempcode;
2366     BOOL inescq = FALSE;
2367     BOOL groupsetfirstbyte = FALSE;
2368     const uschar *ptr = *ptrptr;
2369     const uschar *tempptr;
2370     uschar *previous = NULL;
2371     uschar *previous_callout = NULL;
2372 nigel 93 uschar *save_hwm = NULL;
2373 nigel 77 uschar classbits[32];
2374    
2375     #ifdef SUPPORT_UTF8
2376     BOOL class_utf8;
2377     BOOL utf8 = (options & PCRE_UTF8) != 0;
2378     uschar *class_utf8data;
2379 ph10 300 uschar *class_utf8data_base;
2380 nigel 77 uschar utf8_char[6];
2381     #else
2382     BOOL utf8 = FALSE;
2383 nigel 93 uschar *utf8_char = NULL;
2384 nigel 77 #endif
2385    
2386 nigel 93 #ifdef DEBUG
2387     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2388     #endif
2389    
2390 nigel 77 /* Set up the default and non-default settings for greediness */
2391    
2392     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2393     greedy_non_default = greedy_default ^ 1;
2394    
2395     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2396     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2397     matches a non-fixed char first char; reqbyte just remains unset if we never
2398     find one.
2399    
2400     When we hit a repeat whose minimum is zero, we may have to adjust these values
2401     to take the zero repeat into account. This is implemented by setting them to
2402     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2403     item types that can be repeated set these backoff variables appropriately. */
2404    
2405     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2406    
2407     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2408     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2409     value > 255. It is added into the firstbyte or reqbyte variables to record the
2410     case status of the value. This is used only for ASCII characters. */
2411    
2412     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2413    
2414     /* Switch on next character until the end of the branch */
2415    
2416     for (;; ptr++)
2417     {
2418     BOOL negate_class;
2419 ph10 286 BOOL should_flip_negation;
2420 nigel 77 BOOL possessive_quantifier;
2421     BOOL is_quantifier;
2422 nigel 93 BOOL is_recurse;
2423 ph10 180 BOOL reset_bracount;
2424 nigel 77 int class_charcount;
2425     int class_lastchar;
2426     int newoptions;
2427     int recno;
2428 ph10 172 int refsign;
2429 nigel 77 int skipbytes;
2430     int subreqbyte;
2431     int subfirstbyte;
2432 nigel 93 int terminator;
2433 nigel 77 int mclength;
2434     uschar mcbuffer[8];
2435    
2436 nigel 93 /* Get next byte in the pattern */
2437 nigel 77
2438     c = *ptr;
2439    
2440 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2441     previous cycle of this loop. */
2442    
2443     if (lengthptr != NULL)
2444     {
2445     #ifdef DEBUG
2446     if (code > cd->hwm) cd->hwm = code; /* High water info */
2447     #endif
2448     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2449     {
2450     *errorcodeptr = ERR52;
2451     goto FAILED;
2452     }
2453    
2454     /* There is at least one situation where code goes backwards: this is the
2455     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2456     the class is simply eliminated. However, it is created first, so we have to
2457     allow memory for it. Therefore, don't ever reduce the length at this point.
2458     */
2459    
2460     if (code < last_code) code = last_code;
2461 ph10 202
2462     /* Paranoid check for integer overflow */
2463    
2464     if (OFLOW_MAX - *lengthptr < code - last_code)
2465     {
2466     *errorcodeptr = ERR20;
2467     goto FAILED;
2468     }
2469    
2470 nigel 93 *lengthptr += code - last_code;
2471     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2472    
2473     /* If "previous" is set and it is not at the start of the work space, move
2474     it back to there, in order to avoid filling up the work space. Otherwise,
2475     if "previous" is NULL, reset the current code pointer to the start. */
2476    
2477     if (previous != NULL)
2478     {
2479     if (previous > orig_code)
2480     {
2481     memmove(orig_code, previous, code - previous);
2482     code -= previous - orig_code;
2483     previous = orig_code;
2484     }
2485     }
2486     else code = orig_code;
2487    
2488     /* Remember where this code item starts so we can pick up the length
2489     next time round. */
2490    
2491     last_code = code;
2492     }
2493    
2494     /* In the real compile phase, just check the workspace used by the forward
2495     reference list. */
2496    
2497     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2498     {
2499     *errorcodeptr = ERR52;
2500     goto FAILED;
2501     }
2502    
2503 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2504    
2505     if (inescq && c != 0)
2506     {
2507     if (c == '\\' && ptr[1] == 'E')
2508     {
2509     inescq = FALSE;
2510     ptr++;
2511     continue;
2512     }
2513     else
2514     {
2515     if (previous_callout != NULL)
2516     {
2517 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2518     complete_callout(previous_callout, ptr, cd);
2519 nigel 77 previous_callout = NULL;
2520     }
2521     if ((options & PCRE_AUTO_CALLOUT) != 0)
2522     {
2523     previous_callout = code;
2524     code = auto_callout(code, ptr, cd);
2525     }
2526     goto NORMAL_CHAR;
2527     }
2528     }
2529    
2530     /* Fill in length of a previous callout, except when the next thing is
2531     a quantifier. */
2532    
2533     is_quantifier = c == '*' || c == '+' || c == '?' ||
2534     (c == '{' && is_counted_repeat(ptr+1));
2535    
2536     if (!is_quantifier && previous_callout != NULL &&
2537     after_manual_callout-- <= 0)
2538     {
2539 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2540     complete_callout(previous_callout, ptr, cd);
2541 nigel 77 previous_callout = NULL;
2542     }
2543    
2544     /* In extended mode, skip white space and comments */
2545    
2546     if ((options & PCRE_EXTENDED) != 0)
2547     {
2548     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2549     if (c == '#')
2550     {
2551 nigel 93 while (*(++ptr) != 0)
2552 nigel 91 {
2553 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2554 nigel 91 }
2555 nigel 93 if (*ptr != 0) continue;
2556    
2557 nigel 91 /* Else fall through to handle end of string */
2558     c = 0;
2559 nigel 77 }
2560     }
2561    
2562     /* No auto callout for quantifiers. */
2563    
2564     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2565     {
2566     previous_callout = code;
2567     code = auto_callout(code, ptr, cd);
2568     }
2569    
2570     switch(c)
2571     {
2572 nigel 93 /* ===================================================================*/
2573     case 0: /* The branch terminates at string end */
2574     case '|': /* or | or ) */
2575 nigel 77 case ')':
2576     *firstbyteptr = firstbyte;
2577     *reqbyteptr = reqbyte;
2578     *codeptr = code;
2579     *ptrptr = ptr;
2580 nigel 93 if (lengthptr != NULL)
2581     {
2582 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2583     {
2584     *errorcodeptr = ERR20;
2585     goto FAILED;
2586     }
2587 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2588     DPRINTF((">> end branch\n"));
2589     }
2590 nigel 77 return TRUE;
2591    
2592 nigel 93
2593     /* ===================================================================*/
2594 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2595     the setting of any following char as a first character. */
2596    
2597     case '^':
2598     if ((options & PCRE_MULTILINE) != 0)
2599     {
2600     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2601     }
2602     previous = NULL;
2603     *code++ = OP_CIRC;
2604     break;
2605    
2606     case '$':
2607     previous = NULL;
2608     *code++ = OP_DOLL;
2609     break;
2610    
2611     /* There can never be a first char if '.' is first, whatever happens about
2612     repeats. The value of reqbyte doesn't change either. */
2613    
2614     case '.':
2615     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2616     zerofirstbyte = firstbyte;
2617     zeroreqbyte = reqbyte;
2618     previous = code;
2619     *code++ = OP_ANY;
2620     break;
2621    
2622 nigel 93
2623     /* ===================================================================*/
2624 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2625     32-byte bitmap of the permitted characters, except in the special case
2626     where there is only one such character. For negated classes, we build the
2627     map as usual, then invert it at the end. However, we use a different opcode
2628     so that data characters > 255 can be handled correctly.
2629 nigel 77
2630     If the class contains characters outside the 0-255 range, a different
2631     opcode is compiled. It may optionally have a bit map for characters < 256,
2632     but those above are explicitly listed afterwards. A flag byte tells
2633     whether the bitmap is present, and whether this is a negated class or not.
2634     */
2635    
2636     case '[':
2637     previous = code;
2638    
2639     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2640     they are encountered at the top level, so we'll do that too. */
2641    
2642     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2643 ph10 295 check_posix_syntax(ptr, &tempptr))
2644 nigel 77 {
2645     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2646     goto FAILED;
2647     }
2648    
2649 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2650 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2651 ph10 205 skip them too. This makes for compatibility with Perl. */
2652 ph10 208
2653 ph10 205 negate_class = FALSE;
2654     for (;;)
2655 nigel 77 {
2656     c = *(++ptr);
2657 ph10 205 if (c == '\\')
2658     {
2659 ph10 208 if (ptr[1] == 'E') ptr++;
2660 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2661 ph10 208 else break;
2662 ph10 205 }
2663     else if (!negate_class && c == '^')
2664     negate_class = TRUE;
2665     else break;
2666 ph10 208 }
2667 nigel 77
2668 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2669     negation flag at the end, so that support for characters > 255 works
2670 ph10 264 correctly (they are all included in the class). */
2671    
2672     should_flip_negation = FALSE;
2673    
2674 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2675 nigel 93 of just a single character (as long as it's < 256). However, for higher
2676     valued UTF-8 characters, we don't yet do any optimization. */
2677 nigel 77
2678     class_charcount = 0;
2679     class_lastchar = -1;
2680    
2681 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2682     temporary bit of memory, in case the class contains only 1 character (less
2683     than 256), because in that case the compiled code doesn't use the bit map.
2684     */
2685    
2686     memset(classbits, 0, 32 * sizeof(uschar));
2687    
2688 nigel 77 #ifdef SUPPORT_UTF8
2689     class_utf8 = FALSE; /* No chars >= 256 */
2690 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2691 ph10 300 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2692 nigel 77 #endif
2693    
2694     /* Process characters until ] is reached. By writing this as a "do" it
2695 nigel 93 means that an initial ] is taken as a data character. At the start of the
2696     loop, c contains the first byte of the character. */
2697 nigel 77
2698 nigel 93 if (c != 0) do
2699 nigel 77 {
2700 nigel 93 const uschar *oldptr;
2701    
2702 nigel 77 #ifdef SUPPORT_UTF8
2703     if (utf8 && c > 127)
2704     { /* Braces are required because the */
2705     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2706     }
2707 ph10 300
2708     /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2709     data and reset the pointer. This is so that very large classes that
2710     contain a zillion UTF-8 characters no longer overwrite the work space
2711     (which is on the stack). */
2712    
2713     if (lengthptr != NULL)
2714     {
2715     *lengthptr += class_utf8data - class_utf8data_base;
2716     class_utf8data = class_utf8data_base;
2717     }
2718    
2719 nigel 77 #endif
2720    
2721     /* Inside \Q...\E everything is literal except \E */
2722    
2723     if (inescq)
2724     {
2725 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2726 nigel 77 {
2727 nigel 93 inescq = FALSE; /* Reset literal state */
2728     ptr++; /* Skip the 'E' */
2729     continue; /* Carry on with next */
2730 nigel 77 }
2731 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2732 nigel 77 }
2733    
2734     /* Handle POSIX class names. Perl allows a negation extension of the
2735     form [:^name:]. A square bracket that doesn't match the syntax is
2736     treated as a literal. We also recognize the POSIX constructions
2737     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2738     5.6 and 5.8 do. */
2739    
2740     if (c == '[' &&
2741     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2742 ph10 295 check_posix_syntax(ptr, &tempptr))
2743 nigel 77 {
2744     BOOL local_negate = FALSE;
2745 nigel 87 int posix_class, taboffset, tabopt;
2746 nigel 77 register const uschar *cbits = cd->cbits;
2747 nigel 87 uschar pbits[32];
2748 nigel 77
2749     if (ptr[1] != ':')
2750     {
2751     *errorcodeptr = ERR31;
2752     goto FAILED;
2753     }
2754    
2755     ptr += 2;
2756     if (*ptr == '^')
2757     {
2758     local_negate = TRUE;
2759 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2760 nigel 77 ptr++;
2761     }
2762    
2763     posix_class = check_posix_name(ptr, tempptr - ptr);
2764     if (posix_class < 0)
2765     {
2766     *errorcodeptr = ERR30;
2767     goto FAILED;
2768     }
2769    
2770     /* If matching is caseless, upper and lower are converted to
2771     alpha. This relies on the fact that the class table starts with
2772     alpha, lower, upper as the first 3 entries. */
2773    
2774     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2775     posix_class = 0;
2776    
2777 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2778     because we may be adding and subtracting from it, and we don't want to
2779     subtract bits that may be in the main map already. At the end we or the
2780     result into the bit map that is being built. */
2781 nigel 77
2782     posix_class *= 3;
2783 nigel 87
2784     /* Copy in the first table (always present) */
2785    
2786     memcpy(pbits, cbits + posix_class_maps[posix_class],
2787     32 * sizeof(uschar));
2788    
2789     /* If there is a second table, add or remove it as required. */
2790    
2791     taboffset = posix_class_maps[posix_class + 1];
2792     tabopt = posix_class_maps[posix_class + 2];
2793    
2794     if (taboffset >= 0)
2795 nigel 77 {
2796 nigel 87 if (tabopt >= 0)
2797     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2798 nigel 77 else
2799 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2800 nigel 77 }
2801    
2802 nigel 87 /* Now see if we need to remove any special characters. An option
2803     value of 1 removes vertical space and 2 removes underscore. */
2804    
2805     if (tabopt < 0) tabopt = -tabopt;
2806     if (tabopt == 1) pbits[1] &= ~0x3c;
2807     else if (tabopt == 2) pbits[11] &= 0x7f;
2808    
2809     /* Add the POSIX table or its complement into the main table that is
2810     being built and we are done. */
2811    
2812     if (local_negate)
2813     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2814     else
2815     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2816    
2817 nigel 77 ptr = tempptr + 1;
2818     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2819     continue; /* End of POSIX syntax handling */
2820     }
2821    
2822     /* Backslash may introduce a single character, or it may introduce one
2823 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2824     case. Inside a class (and only there) it is treated as backspace.
2825     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2826 ph10 205 to 'or' into the one we are building. We assume they have more than one
2827 nigel 77 character in them, so set class_charcount bigger than one. */
2828    
2829     if (c == '\\')
2830     {
2831 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2832     if (*errorcodeptr != 0) goto FAILED;
2833 nigel 77
2834 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2835 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2836 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2837 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2838     {
2839     if (ptr[1] == '\\' && ptr[2] == 'E')
2840     {
2841     ptr += 2; /* avoid empty string */
2842     }
2843     else inescq = TRUE;
2844     continue;
2845     }
2846 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2847 nigel 77
2848     if (c < 0)
2849     {
2850     register const uschar *cbits = cd->cbits;
2851     class_charcount += 2; /* Greater than 1 is what matters */
2852 nigel 93
2853     /* Save time by not doing this in the pre-compile phase. */
2854    
2855     if (lengthptr == NULL) switch (-c)
2856 nigel 77 {
2857     case ESC_d:
2858     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2859     continue;
2860    
2861     case ESC_D:
2862 ph10 286 should_flip_negation = TRUE;
2863 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2864     continue;
2865    
2866     case ESC_w:
2867     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2868     continue;
2869    
2870     case ESC_W:
2871 ph10 286 should_flip_negation = TRUE;
2872 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2873     continue;
2874    
2875     case ESC_s:
2876     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2877     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2878     continue;
2879    
2880     case ESC_S:
2881 ph10 286 should_flip_negation = TRUE;
2882 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2883     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2884     continue;
2885    
2886 nigel 93 default: /* Not recognized; fall through */
2887     break; /* Need "default" setting to stop compiler warning. */
2888     }
2889    
2890     /* In the pre-compile phase, just do the recognition. */
2891    
2892     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2893     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2894 ph10 180
2895 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2896     they use extra memory. */
2897 ph10 180
2898 ph10 178 if (-c == ESC_h)
2899     {
2900     SETBIT(classbits, 0x09); /* HT */
2901     SETBIT(classbits, 0x20); /* SPACE */
2902 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2903 ph10 178 #ifdef SUPPORT_UTF8
2904     if (utf8)
2905 ph10 180 {
2906 ph10 178 class_utf8 = TRUE;
2907     *class_utf8data++ = XCL_SINGLE;
2908 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2909 ph10 178 *class_utf8data++ = XCL_SINGLE;
2910 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2911     *class_utf8data++ = XCL_RANGE;
2912     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2913     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2914 ph10 178 *class_utf8data++ = XCL_SINGLE;
2915 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2916 ph10 178 *class_utf8data++ = XCL_SINGLE;
2917 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2918 ph10 178 *class_utf8data++ = XCL_SINGLE;
2919 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2920     }
2921     #endif
2922     continue;
2923     }
2924 nigel 93
2925 ph10 178 if (-c == ESC_H)
2926     {
2927     for (c = 0; c < 32; c++)
2928     {
2929     int x = 0xff;
2930     switch (c)
2931 ph10 180 {
2932 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2933     case 0x20/8: x ^= 1 << (0x20%8); break;
2934     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2935     default: break;
2936     }
2937     classbits[c] |= x;
2938 ph10 180 }
2939    
2940 ph10 178 #ifdef SUPPORT_UTF8
2941     if (utf8)
2942 ph10 180 {
2943 ph10 178 class_utf8 = TRUE;
2944 ph10 180 *class_utf8data++ = XCL_RANGE;
2945     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2946     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2947     *class_utf8data++ = XCL_RANGE;
2948     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2949     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2950     *class_utf8data++ = XCL_RANGE;
2951     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2952     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2953     *class_utf8data++ = XCL_RANGE;
2954     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2955     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2956     *class_utf8data++ = XCL_RANGE;
2957     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2958     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2959     *class_utf8data++ = XCL_RANGE;
2960     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2961     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2962     *class_utf8data++ = XCL_RANGE;
2963     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2964     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2965     }
2966     #endif
2967     continue;
2968     }
2969 ph10 178
2970     if (-c == ESC_v)
2971     {
2972     SETBIT(classbits, 0x0a); /* LF */
2973     SETBIT(classbits, 0x0b); /* VT */
2974 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2975     SETBIT(classbits, 0x0d); /* CR */
2976     SETBIT(classbits, 0x85); /* NEL */
2977 ph10 178 #ifdef SUPPORT_UTF8
2978     if (utf8)
2979 ph10 180 {
2980 ph10 178 class_utf8 = TRUE;
2981 ph10 180 *class_utf8data++ = XCL_RANGE;
2982     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2983     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2984     }
2985     #endif
2986     continue;
2987     }
2988 ph10 178
2989     if (-c == ESC_V)
2990     {
2991     for (c = 0; c < 32; c++)
2992     {
2993     int x = 0xff;
2994     switch (c)
2995 ph10 180 {
2996 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2997     x ^= 1 << (0x0b%8);
2998     x ^= 1 << (0x0c%8);
2999 ph10 180 x ^= 1 << (0x0d%8);
3000 ph10 178 break;
3001     case 0x85/8: x ^= 1 << (0x85%8); break;
3002     default: break;
3003     }
3004     classbits[c] |= x;
3005 ph10 180 }
3006    
3007 ph10 178 #ifdef SUPPORT_UTF8
3008     if (utf8)
3009 ph10 180 {
3010 ph10 178 class_utf8 = TRUE;
3011 ph10 180 *class_utf8data++ = XCL_RANGE;
3012     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3013     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3014     *class_utf8data++ = XCL_RANGE;
3015     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3016     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3017     }
3018     #endif
3019     continue;
3020     }
3021 ph10 178
3022 nigel 93 /* We need to deal with \P and \p in both phases. */
3023    
3024 nigel 77 #ifdef SUPPORT_UCP
3025 nigel 93 if (-c == ESC_p || -c == ESC_P)
3026     {
3027     BOOL negated;
3028     int pdata;
3029     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3030     if (ptype < 0) goto FAILED;
3031     class_utf8 = TRUE;
3032     *class_utf8data++ = ((-c == ESC_p) != negated)?
3033     XCL_PROP : XCL_NOTPROP;
3034     *class_utf8data++ = ptype;
3035     *class_utf8data++ = pdata;
3036     class_charcount -= 2; /* Not a < 256 character */
3037 nigel 77 continue;
3038 nigel 93 }
3039 nigel 77 #endif
3040 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3041     strict mode. By default, for compatibility with Perl, they are
3042     treated as literals. */
3043 nigel 77
3044 nigel 93 if ((options & PCRE_EXTRA) != 0)
3045     {
3046     *errorcodeptr = ERR7;
3047     goto FAILED;
3048     }
3049 nigel 77
3050 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3051     c = *ptr; /* Get the final character and fall through */
3052 nigel 77 }
3053    
3054     /* Fall through if we have a single character (c >= 0). This may be
3055 nigel 93 greater than 256 in UTF-8 mode. */
3056 nigel 77
3057     } /* End of backslash handling */
3058    
3059     /* A single character may be followed by '-' to form a range. However,
3060     Perl does not permit ']' to be the end of the range. A '-' character
3061 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3062     entirely. The code for handling \Q and \E is messy. */
3063 nigel 77
3064 nigel 93 CHECK_RANGE:
3065     while (ptr[1] == '\\' && ptr[2] == 'E')
3066 nigel 77 {
3067 nigel 93 inescq = FALSE;
3068     ptr += 2;
3069     }
3070    
3071     oldptr = ptr;
3072 ph10 231
3073 ph10 230 /* Remember \r or \n */
3074 ph10 231
3075     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3076    
3077 ph10 230 /* Check for range */
3078 nigel 93
3079     if (!inescq && ptr[1] == '-')
3080     {
3081 nigel 77 int d;
3082     ptr += 2;
3083 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3084 nigel 77
3085 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3086     mode. */
3087    
3088     while (*ptr == '\\' && ptr[1] == 'Q')
3089     {
3090     ptr += 2;
3091     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3092     inescq = TRUE;
3093     break;
3094     }
3095    
3096     if (*ptr == 0 || (!inescq && *ptr == ']'))
3097     {
3098     ptr = oldptr;
3099     goto LONE_SINGLE_CHARACTER;
3100     }
3101    
3102 nigel 77 #ifdef SUPPORT_UTF8
3103     if (utf8)
3104     { /* Braces are required because the */
3105     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3106     }
3107     else
3108     #endif
3109     d = *ptr; /* Not UTF-8 mode */
3110    
3111     /* The second part of a range can be a single-character escape, but
3112     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3113     in such circumstances. */
3114    
3115 nigel 93 if (!inescq && d == '\\')
3116 nigel 77 {
3117 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3118     if (*errorcodeptr != 0) goto FAILED;
3119 nigel 77
3120 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3121 nigel 93 special means the '-' was literal */
3122 nigel 77
3123     if (d < 0)
3124     {
3125     if (d == -ESC_b) d = '\b';
3126 nigel 93 else if (d == -ESC_X) d = 'X';
3127     else if (d == -ESC_R) d = 'R'; else
3128 nigel 77 {
3129 nigel 93 ptr = oldptr;
3130 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3131     }
3132     }
3133     }
3134    
3135 nigel 93 /* Check that the two values are in the correct order. Optimize
3136     one-character ranges */
3137 nigel 77
3138 nigel 93 if (d < c)
3139     {
3140     *errorcodeptr = ERR8;
3141     goto FAILED;
3142     }
3143    
3144 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3145    
3146 ph10 230 /* Remember \r or \n */
3147 ph10 231
3148     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3149    
3150 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3151     matching, we have to use an XCLASS with extra data items. Caseless
3152     matching for characters > 127 is available only if UCP support is
3153     available. */
3154    
3155     #ifdef SUPPORT_UTF8
3156     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3157     {
3158     class_utf8 = TRUE;
3159    
3160     /* With UCP support, we can find the other case equivalents of
3161     the relevant characters. There may be several ranges. Optimize how
3162     they fit with the basic range. */
3163    
3164     #ifdef SUPPORT_UCP
3165     if ((options & PCRE_CASELESS) != 0)
3166     {
3167 nigel 93 unsigned int occ, ocd;
3168     unsigned int cc = c;
3169     unsigned int origd = d;
3170 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3171     {
3172 ph10 180 if (occ >= (unsigned int)c &&
3173     ocd <= (unsigned int)d)
3174 ph10 176 continue; /* Skip embedded ranges */
3175 nigel 77
3176 ph10 180 if (occ < (unsigned int)c &&
3177 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3178 nigel 77 { /* if there is overlap, */
3179     c = occ; /* noting that if occ < c */
3180     continue; /* we can't have ocd > d */
3181     } /* because a subrange is */
3182 ph10 180 if (ocd > (unsigned int)d &&
3183 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3184 nigel 77 { /* the basic range. */
3185     d = ocd;
3186     continue;
3187     }
3188    
3189     if (occ == ocd)
3190     {
3191     *class_utf8data++ = XCL_SINGLE;
3192     }
3193     else
3194     {
3195     *class_utf8data++ = XCL_RANGE;
3196     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3197     }
3198     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3199     }
3200     }
3201     #endif /* SUPPORT_UCP */
3202    
3203     /* Now record the original range, possibly modified for UCP caseless
3204     overlapping ranges. */
3205    
3206     *class_utf8data++ = XCL_RANGE;
3207     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3208     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3209    
3210     /* With UCP support, we are done. Without UCP support, there is no
3211     caseless matching for UTF-8 characters > 127; we can use the bit map
3212     for the smaller ones. */
3213    
3214     #ifdef SUPPORT_UCP
3215     continue; /* With next character in the class */
3216     #else
3217     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3218    
3219     /* Adjust upper limit and fall through to set up the map */
3220    
3221     d = 127;
3222    
3223     #endif /* SUPPORT_UCP */
3224     }
3225     #endif /* SUPPORT_UTF8 */
3226    
3227     /* We use the bit map for all cases when not in UTF-8 mode; else
3228     ranges that lie entirely within 0-127 when there is UCP support; else
3229     for partial ranges without UCP support. */
3230    
3231 nigel 93 class_charcount += d - c + 1;
3232     class_lastchar = d;
3233    
3234     /* We can save a bit of time by skipping this in the pre-compile. */
3235    
3236     if (lengthptr == NULL) for (; c <= d; c++)
3237 nigel 77 {
3238     classbits[c/8] |= (1 << (c&7));
3239     if ((options & PCRE_CASELESS) != 0)
3240     {
3241     int uc = cd->fcc[c]; /* flip case */
3242     classbits[uc/8] |= (1 << (uc&7));
3243     }
3244     }
3245    
3246     continue; /* Go get the next char in the class */
3247     }
3248    
3249     /* Handle a lone single character - we can get here for a normal
3250     non-escape char, or after \ that introduces a single character or for an
3251     apparent range that isn't. */
3252    
3253     LONE_SINGLE_CHARACTER:
3254 ph10 231
3255 nigel 77 /* Handle a character that cannot go in the bit map */
3256    
3257     #ifdef SUPPORT_UTF8
3258     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3259     {
3260     class_utf8 = TRUE;
3261     *class_utf8data++ = XCL_SINGLE;
3262     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3263    
3264     #ifdef SUPPORT_UCP
3265     if ((options & PCRE_CASELESS) != 0)
3266     {
3267 nigel 93 unsigned int othercase;
3268     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3269 nigel 77 {
3270     *class_utf8data++ = XCL_SINGLE;
3271     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3272     }
3273     }
3274     #endif /* SUPPORT_UCP */
3275    
3276     }
3277     else
3278     #endif /* SUPPORT_UTF8 */
3279    
3280     /* Handle a single-byte character */
3281     {
3282     classbits[c/8] |= (1 << (c&7));
3283     if ((options & PCRE_CASELESS) != 0)
3284     {
3285     c = cd->fcc[c]; /* flip case */
3286     classbits[c/8] |= (1 << (c&7));
3287     }
3288     class_charcount++;
3289     class_lastchar = c;
3290     }
3291     }
3292    
3293 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3294 nigel 77
3295 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3296 nigel 77
3297 nigel 93 if (c == 0) /* Missing terminating ']' */
3298     {
3299     *errorcodeptr = ERR6;
3300     goto FAILED;
3301     }
3302 ph10 231
3303    
3304 ph10 230 /* This code has been disabled because it would mean that \s counts as
3305     an explicit \r or \n reference, and that's not really what is wanted. Now
3306     we set the flag only if there is a literal "\r" or "\n" in the class. */
3307 ph10 227
3308 ph10 230 #if 0
3309 ph10 226 /* Remember whether \r or \n are in this class */
3310 ph10 227
3311 ph10 226 if (negate_class)
3312     {
3313 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3314 ph10 226 }
3315     else
3316     {
3317 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3318 ph10 227 }
3319 ph10 230 #endif
3320 ph10 227
3321 ph10 231
3322 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3323 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3324     use of \p or \P, in other words, no use of any XCLASS features, we can
3325     optimize.
3326    
3327 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3328     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3329     operate on single-bytes only. This is an historical hangover. Maybe one day
3330     we can tidy these opcodes to handle multi-byte characters.
3331 nigel 77
3332     The optimization throws away the bit map. We turn the item into a
3333     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3334     that OP_NOT does not support multibyte characters. In the positive case, it
3335     can cause firstbyte to be set. Otherwise, there can be no first char if
3336     this item is first, whatever repeat count may follow. In the case of
3337     reqbyte, save the previous value for reinstating. */
3338    
3339     #ifdef SUPPORT_UTF8
3340 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3341 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3342 nigel 77 #else
3343     if (class_charcount == 1)
3344     #endif
3345     {
3346     zeroreqbyte = reqbyte;
3347    
3348     /* The OP_NOT opcode works on one-byte characters only. */
3349    
3350     if (negate_class)
3351     {
3352     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3353     zerofirstbyte = firstbyte;
3354     *code++ = OP_NOT;
3355     *code++ = class_lastchar;
3356     break;
3357     }
3358    
3359     /* For a single, positive character, get the value into mcbuffer, and
3360     then we can handle this with the normal one-character code. */
3361    
3362     #ifdef SUPPORT_UTF8
3363     if (utf8 && class_lastchar > 127)
3364     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3365     else
3366     #endif
3367     {
3368     mcbuffer[0] = class_lastchar;
3369     mclength = 1;
3370     }
3371     goto ONE_CHAR;
3372     } /* End of 1-char optimization */
3373    
3374     /* The general case - not the one-char optimization. If this is the first
3375     thing in the branch, there can be no first char setting, whatever the
3376     repeat count. Any reqbyte setting must remain unchanged after any kind of
3377     repeat. */
3378    
3379     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3380     zerofirstbyte = firstbyte;
3381     zeroreqbyte = reqbyte;
3382    
3383     /* If there are characters with values > 255, we have to compile an
3384 ph10 286 extended class, with its own opcode, unless there was a negated special
3385     such as \S in the class, because in that case all characters > 255 are in
3386     the class, so any that were explicitly given as well can be ignored. If
3387 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3388     characters < 256, we can omit the bitmap in the actual compiled code. */
3389 nigel 77
3390     #ifdef SUPPORT_UTF8
3391 ph10 264 if (class_utf8 && !should_flip_negation)
3392 nigel 77 {
3393     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3394     *code++ = OP_XCLASS;
3395     code += LINK_SIZE;
3396     *code = negate_class? XCL_NOT : 0;
3397    
3398 nigel 93 /* If the map is required, move up the extra data to make room for it;
3399     otherwise just move the code pointer to the end of the extra data. */
3400 nigel 77
3401     if (class_charcount > 0)
3402     {
3403     *code++ |= XCL_MAP;
3404 nigel 93 memmove(code + 32, code, class_utf8data - code);
3405 nigel 77 memcpy(code, classbits, 32);
3406 nigel 93 code = class_utf8data + 32;
3407 nigel 77 }
3408 nigel 93 else code = class_utf8data;
3409 nigel 77
3410     /* Now fill in the complete length of the item */
3411    
3412     PUT(previous, 1, code - previous);
3413     break; /* End of class handling */
3414     }
3415     #endif
3416    
3417 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3418     OP_NCLASS, depending on whether the whole class was negated and whether
3419     there were negative specials such as \S in the class. Then copy the 32-byte
3420 ph10 264 map into the code vector, negating it if necessary. */
3421 ph10 286
3422 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3423 nigel 77 if (negate_class)
3424     {
3425 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3426     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3427 nigel 77 }
3428     else
3429     {
3430     memcpy(code, classbits, 32);
3431     }
3432     code += 32;
3433     break;
3434    
3435 nigel 93
3436     /* ===================================================================*/
3437 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3438     has been tested above. */
3439    
3440     case '{':
3441     if (!is_quantifier) goto NORMAL_CHAR;
3442     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3443     if (*errorcodeptr != 0) goto FAILED;
3444     goto REPEAT;
3445    
3446     case '*':
3447     repeat_min = 0;
3448     repeat_max = -1;
3449     goto REPEAT;
3450    
3451     case '+':
3452     repeat_min = 1;
3453     repeat_max = -1;
3454     goto REPEAT;
3455    
3456     case '?':
3457     repeat_min = 0;
3458     repeat_max = 1;
3459    
3460     REPEAT:
3461     if (previous == NULL)
3462     {
3463     *errorcodeptr = ERR9;
3464     goto FAILED;
3465     }
3466    
3467     if (repeat_min == 0)
3468     {
3469     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3470     reqbyte = zeroreqbyte; /* Ditto */
3471     }
3472    
3473     /* Remember whether this is a variable length repeat */
3474    
3475     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3476    
3477     op_type = 0; /* Default single-char op codes */
3478     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3479    
3480     /* Save start of previous item, in case we have to move it up to make space
3481     for an inserted OP_ONCE for the additional '+' extension. */
3482    
3483     tempcode = previous;
3484    
3485     /* If the next character is '+', we have a possessive quantifier. This
3486     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3487     If the next character is '?' this is a minimizing repeat, by default,
3488     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3489     repeat type to the non-default. */
3490    
3491     if (ptr[1] == '+')
3492     {
3493     repeat_type = 0; /* Force greedy */
3494     possessive_quantifier = TRUE;
3495     ptr++;
3496     }
3497     else if (ptr[1] == '?')
3498     {
3499     repeat_type = greedy_non_default;
3500     ptr++;
3501     }
3502     else repeat_type = greedy_default;
3503    
3504     /* If previous was a character match, abolish the item and generate a
3505     repeat item instead. If a char item has a minimum of more than one, ensure
3506     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3507     the first thing in a branch because the x will have gone into firstbyte
3508     instead. */
3509    
3510     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3511     {
3512     /* Deal with UTF-8 characters that take up more than one byte. It's
3513     easier to write this out separately than try to macrify it. Use c to
3514     hold the length of the character in bytes, plus 0x80 to flag that it's a
3515     length rather than a small character. */
3516    
3517     #ifdef SUPPORT_UTF8
3518     if (utf8 && (code[-1] & 0x80) != 0)
3519     {
3520     uschar *lastchar = code - 1;
3521     while((*lastchar & 0xc0) == 0x80) lastchar--;
3522     c = code - lastchar; /* Length of UTF-8 character */
3523     memcpy(utf8_char, lastchar, c); /* Save the char */
3524     c |= 0x80; /* Flag c as a length */
3525     }
3526     else
3527     #endif
3528    
3529     /* Handle the case of a single byte - either with no UTF8 support, or
3530     with UTF-8 disabled, or for a UTF-8 character < 128. */
3531    
3532     {
3533     c = code[-1];
3534     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3535     }
3536    
3537 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3538     the line is something that cannot possibly match this character. If so,
3539     automatically possessifying this item gains some performance in the case
3540     where the match fails. */
3541    
3542     if (!possessive_quantifier &&
3543     repeat_max < 0 &&
3544     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3545     options, cd))
3546     {
3547     repeat_type = 0; /* Force greedy */
3548     possessive_quantifier = TRUE;
3549     }
3550    
3551 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3552     }
3553    
3554     /* If previous was a single negated character ([^a] or similar), we use
3555     one of the special opcodes, replacing it. The code is shared with single-
3556     character repeats by setting op_type to add a suitable offset into
3557 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3558     currently used only for single-byte chars. */
3559 nigel 77
3560     else if (*previous == OP_NOT)
3561     {
3562     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3563     c = previous[1];
3564 nigel 93 if (!possessive_quantifier &&
3565     repeat_max < 0 &&
3566     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3567     {
3568     repeat_type = 0; /* Force greedy */
3569     possessive_quantifier = TRUE;
3570     }
3571 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3572     }
3573    
3574     /* If previous was a character type match (\d or similar), abolish it and
3575     create a suitable repeat item. The code is shared with single-character
3576     repeats by setting op_type to add a suitable offset into repeat_type. Note
3577     that the Unicode property types will be present only when SUPPORT_UCP is
3578     defined, but we don't wrap the little bits of code here because it just
3579     makes it horribly messy. */
3580    
3581     else if (*previous < OP_EODN)
3582     {
3583     uschar *oldcode;
3584 nigel 87 int prop_type, prop_value;
3585 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3586     c = *previous;
3587    
3588 nigel 93 if (!possessive_quantifier &&
3589     repeat_max < 0 &&
3590     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3591     {
3592     repeat_type = 0; /* Force greedy */
3593     possessive_quantifier = TRUE;
3594     }
3595    
3596 nigel 77 OUTPUT_SINGLE_REPEAT:
3597 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3598     {
3599     prop_type = previous[1];
3600     prop_value = previous[2];
3601     }
3602     else prop_type = prop_value = -1;
3603 nigel 77
3604     oldcode = code;
3605     code = previous; /* Usually overwrite previous item */
3606    
3607     /* If the maximum is zero then the minimum must also be zero; Perl allows
3608     this case, so we do too - by simply omitting the item altogether. */
3609    
3610     if (repeat_max == 0) goto END_REPEAT;
3611    
3612     /* All real repeats make it impossible to handle partial matching (maybe
3613     one day we will be able to remove this restriction). */
3614    
3615 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3616 nigel 77
3617     /* Combine the op_type with the repeat_type */
3618    
3619     repeat_type += op_type;
3620    
3621     /* A minimum of zero is handled either as the special case * or ?, or as
3622     an UPTO, with the maximum given. */
3623    
3624     if (repeat_min == 0)
3625     {
3626     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3627     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3628     else
3629     {
3630     *code++ = OP_UPTO + repeat_type;
3631     PUT2INC(code, 0, repeat_max);
3632     }
3633     }
3634    
3635     /* A repeat minimum of 1 is optimized into some special cases. If the
3636 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3637 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3638     one less than the maximum. */
3639    
3640     else if (repeat_min == 1)
3641     {
3642     if (repeat_max == -1)
3643     *code++ = OP_PLUS + repeat_type;
3644     else
3645     {
3646     code = oldcode; /* leave previous item in place */
3647     if (repeat_max == 1) goto END_REPEAT;
3648     *code++ = OP_UPTO + repeat_type;
3649     PUT2INC(code, 0, repeat_max - 1);
3650     }
3651     }
3652    
3653     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3654     handled as an EXACT followed by an UPTO. */
3655    
3656     else
3657     {
3658     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3659     PUT2INC(code, 0, repeat_min);
3660    
3661     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3662     we have to insert the character for the previous code. For a repeated
3663 nigel 87 Unicode property match, there are two extra bytes that define the
3664 nigel 77 required property. In UTF-8 mode, long characters have their length in
3665     c, with the 0x80 bit as a flag. */
3666    
3667     if (repeat_max < 0)
3668     {
3669     #ifdef SUPPORT_UTF8
3670     if (utf8 && c >= 128)
3671     {
3672     memcpy(code, utf8_char, c & 7);
3673     code += c & 7;
3674     }
3675     else
3676     #endif
3677     {
3678     *code++ = c;
3679 nigel 87 if (prop_type >= 0)
3680     {
3681     *code++ = prop_type;
3682     *code++ = prop_value;
3683     }
3684 nigel 77 }
3685     *code++ = OP_STAR + repeat_type;
3686     }
3687    
3688     /* Else insert an UPTO if the max is greater than the min, again
3689 nigel 93 preceded by the character, for the previously inserted code. If the
3690     UPTO is just for 1 instance, we can use QUERY instead. */
3691 nigel 77
3692     else if (repeat_max != repeat_min)
3693     {
3694     #ifdef SUPPORT_UTF8
3695     if (utf8 && c >= 128)
3696     {
3697     memcpy(code, utf8_char, c & 7);
3698     code += c & 7;
3699     }
3700     else
3701     #endif
3702     *code++ = c;
3703 nigel 87 if (prop_type >= 0)
3704     {
3705     *code++ = prop_type;
3706     *code++ = prop_value;
3707     }
3708 nigel 77 repeat_max -= repeat_min;
3709 nigel 93
3710     if (repeat_max == 1)
3711     {
3712     *code++ = OP_QUERY + repeat_type;
3713     }
3714     else
3715     {
3716     *code++ = OP_UPTO + repeat_type;
3717     PUT2INC(code, 0, repeat_max);
3718     }
3719 nigel 77 }
3720     }
3721    
3722     /* The character or character type itself comes last in all cases. */
3723    
3724     #ifdef SUPPORT_UTF8
3725     if (utf8 && c >= 128)
3726     {
3727     memcpy(code, utf8_char, c & 7);
3728     code += c & 7;
3729     }
3730     else
3731     #endif
3732     *code++ = c;
3733    
3734 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3735     define the required property. */
3736 nigel 77
3737     #ifdef SUPPORT_UCP
3738 nigel 87 if (prop_type >= 0)
3739     {
3740     *code++ = prop_type;
3741     *code++ = prop_value;
3742     }
3743 nigel 77 #endif
3744     }
3745    
3746     /* If previous was a character class or a back reference, we put the repeat
3747     stuff after it, but just skip the item if the repeat was {0,0}. */
3748    
3749     else if (*previous == OP_CLASS ||
3750     *previous == OP_NCLASS ||
3751     #ifdef SUPPORT_UTF8
3752     *previous == OP_XCLASS ||
3753     #endif
3754     *previous == OP_REF)
3755     {
3756     if (repeat_max == 0)
3757     {
3758     code = previous;
3759     goto END_REPEAT;
3760     }
3761    
3762     /* All real repeats make it impossible to handle partial matching (maybe
3763     one day we will be able to remove this restriction). */
3764    
3765 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3766 nigel 77
3767     if (repeat_min == 0 && repeat_max == -1)
3768     *code++ = OP_CRSTAR + repeat_type;
3769     else if (repeat_min == 1 && repeat_max == -1)
3770     *code++ = OP_CRPLUS + repeat_type;
3771     else if (repeat_min == 0 && repeat_max == 1)
3772     *code++ = OP_CRQUERY + repeat_type;
3773     else
3774     {
3775     *code++ = OP_CRRANGE + repeat_type;
3776     PUT2INC(code, 0, repeat_min);
3777     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3778     PUT2INC(code, 0, repeat_max);
3779     }
3780     }
3781    
3782     /* If previous was a bracket group, we may have to replicate it in certain
3783     cases. */
3784    
3785 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3786     *previous == OP_ONCE || *previous == OP_COND)
3787 nigel 77 {
3788     register int i;
3789     int ketoffset = 0;
3790     int len = code - previous;
3791     uschar *bralink = NULL;
3792    
3793 nigel 93 /* Repeating a DEFINE group is pointless */
3794    
3795     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3796     {
3797     *errorcodeptr = ERR55;
3798     goto FAILED;
3799     }
3800    
3801 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3802     by scanning through from the start, and compute the offset back to it
3803     from the current code pointer. There may be an OP_OPT setting following
3804     the final KET, so we can't find the end just by going back from the code
3805     pointer. */
3806    
3807     if (repeat_max == -1)
3808     {
3809     register uschar *ket = previous;
3810     do ket += GET(ket, 1); while (*ket != OP_KET);
3811     ketoffset = code - ket;
3812     }
3813    
3814     /* The case of a zero minimum is special because of the need to stick
3815     OP_BRAZERO in front of it, and because the group appears once in the
3816     data, whereas in other cases it appears the minimum number of times. For
3817     this reason, it is simplest to treat this case separately, as otherwise
3818     the code gets far too messy. There are several special subcases when the
3819     minimum is zero. */
3820    
3821     if (repeat_min == 0)
3822     {
3823     /* If the maximum is also zero, we just omit the group from the output
3824     altogether. */
3825    
3826     if (repeat_max == 0)
3827     {
3828     code = previous;
3829     goto END_REPEAT;
3830     }
3831    
3832     /* If the maximum is 1 or unlimited, we just have to stick in the
3833     BRAZERO and do no more at this point. However, we do need to adjust
3834     any OP_RECURSE calls inside the group that refer to the group itself or
3835 nigel 93 any internal or forward referenced group, because the offset is from
3836     the start of the whole regex. Temporarily terminate the pattern while
3837     doing this. */
3838 nigel 77
3839     if (repeat_max <= 1)
3840     {
3841     *code = OP_END;
3842 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3843 nigel 77 memmove(previous+1, previous, len);
3844     code++;
3845     *previous++ = OP_BRAZERO + repeat_type;
3846     }
3847    
3848     /* If the maximum is greater than 1 and limited, we have to replicate
3849     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3850     The first one has to be handled carefully because it's the original
3851     copy, which has to be moved up. The remainder can be handled by code
3852     that is common with the non-zero minimum case below. We have to
3853     adjust the value of repeat_max, since one less copy is required. Once
3854     again, we may have to adjust any OP_RECURSE calls inside the group. */
3855    
3856     else
3857     {
3858     int offset;
3859     *code = OP_END;
3860 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3861 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3862     code += 2 + LINK_SIZE;
3863     *previous++ = OP_BRAZERO + repeat_type;
3864     *previous++ = OP_BRA;
3865    
3866     /* We chain together the bracket offset fields that have to be
3867     filled in later when the ends of the brackets are reached. */
3868    
3869     offset = (bralink == NULL)? 0 : previous - bralink;
3870     bralink = previous;
3871     PUTINC(previous, 0, offset);
3872     }
3873    
3874     repeat_max--;
3875     }
3876    
3877     /* If the minimum is greater than zero, replicate the group as many
3878     times as necessary, and adjust the maximum to the number of subsequent
3879     copies that we need. If we set a first char from the group, and didn't
3880 nigel 93 set a required char, copy the latter from the former. If there are any
3881     forward reference subroutine calls in the group, there will be entries on
3882     the workspace list; replicate these with an appropriate increment. */
3883 nigel 77
3884     else
3885     {
3886     if (repeat_min > 1)
3887     {
3888 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3889 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3890     potential integer overflow. */
3891 nigel 93
3892     if (lengthptr != NULL)
3893 ph10 202 {
3894     int delta = (repeat_min - 1)*length_prevgroup;
3895     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3896     (double)INT_MAX ||
3897     OFLOW_MAX - *lengthptr < delta)
3898     {
3899     *errorcodeptr = ERR20;
3900     goto FAILED;
3901     }
3902     *lengthptr += delta;
3903     }
3904 nigel 93
3905     /* This is compiling for real */
3906    
3907     else
3908 nigel 77 {
3909 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3910     for (i = 1; i < repeat_min; i++)
3911     {
3912     uschar *hc;
3913     uschar *this_hwm = cd->hwm;
3914     memcpy(code, previous, len);
3915     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3916     {
3917     PUT(cd->hwm, 0, GET(hc, 0) + len);
3918     cd->hwm += LINK_SIZE;
3919     }
3920     save_hwm = this_hwm;
3921     code += len;
3922     }
3923 nigel 77 }
3924     }
3925 nigel 93
3926 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3927     }
3928    
3929     /* This code is common to both the zero and non-zero minimum cases. If
3930     the maximum is limited, it replicates the group in a nested fashion,
3931     remembering the bracket starts on a stack. In the case of a zero minimum,
3932     the first one was set up above. In all cases the repeat_max now specifies
3933 nigel 93 the number of additional copies needed. Again, we must remember to
3934     replicate entries on the forward reference list. */
3935 nigel 77
3936     if (repeat_max >= 0)
3937     {
3938 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3939     just adjust the length as if we had. For each repetition we must add 1
3940     to the length for BRAZERO and for all but the last repetition we must
3941 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3942     paranoid checks to avoid integer overflow. */
3943 nigel 93
3944     if (lengthptr != NULL && repeat_max > 0)
3945 ph10 202 {
3946     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3947     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3948     if ((double)repeat_max *
3949     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3950     > (double)INT_MAX ||
3951     OFLOW_MAX - *lengthptr < delta)
3952     {
3953     *errorcodeptr = ERR20;
3954     goto FAILED;
3955     }
3956     *lengthptr += delta;
3957     }
3958 nigel 93
3959     /* This is compiling for real */
3960    
3961     else for (i = repeat_max - 1; i >= 0; i--)
3962 nigel 77 {
3963 nigel 93 uschar *hc;
3964     uschar *this_hwm = cd->hwm;
3965    
3966 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3967    
3968     /* All but the final copy start a new nesting, maintaining the
3969     chain of brackets outstanding. */
3970    
3971     if (i != 0)
3972     {
3973     int offset;
3974     *code++ = OP_BRA;
3975     offset = (bralink == NULL)? 0 : code - bralink;
3976     bralink = code;
3977     PUTINC(code, 0, offset);
3978     }
3979    
3980     memcpy(code, previous, len);
3981 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3982     {
3983     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3984     cd->hwm += LINK_SIZE;
3985     }
3986     save_hwm = this_hwm;
3987 nigel 77 code += len;
3988     }
3989    
3990     /* Now chain through the pending brackets, and fill in their length
3991     fields (which are holding the chain links pro tem). */
3992    
3993     while (bralink != NULL)
3994     {
3995     int oldlinkoffset;
3996     int offset = code - bralink + 1;
3997     uschar *bra = code - offset;
3998     oldlinkoffset = GET(bra, 1);
3999     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4000     *code++ = OP_KET;
4001     PUTINC(code, 0, offset);
4002     PUT(bra, 1, offset);
4003     }
4004     }
4005    
4006     /* If the maximum is unlimited, set a repeater in the final copy. We
4007     can't just offset backwards from the current code point, because we
4008     don't know if there's been an options resetting after the ket. The
4009 nigel 93 correct offset was computed above.
4010 nigel 77
4011 nigel 93 Then, when we are doing the actual compile phase, check to see whether
4012     this group is a non-atomic one that could match an empty string. If so,
4013     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4014     that runtime checking can be done. [This check is also applied to
4015     atomic groups at runtime, but in a different way.] */
4016    
4017     else
4018     {
4019     uschar *ketcode = code - ketoffset;
4020     uschar *bracode = ketcode - GET(ketcode, 1);
4021     *ketcode = OP_KETRMAX + repeat_type;
4022     if (lengthptr == NULL && *bracode != OP_ONCE)
4023     {
4024     uschar *scode = bracode;
4025     do
4026     {
4027     if (could_be_empty_branch(scode, ketcode, utf8))
4028     {
4029     *bracode += OP_SBRA - OP_BRA;
4030     break;
4031     }
4032     scode += GET(scode, 1);
4033     }
4034     while (*scode == OP_ALT);
4035     }
4036     }
4037 nigel 77 }
4038    
4039     /* Else there's some kind of shambles */
4040    
4041     else
4042     {
4043     *errorcodeptr = ERR11;
4044     goto FAILED;
4045     }
4046    
4047 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4048     tests above succeeded, possessive_quantifier is TRUE. For some of the
4049     simpler opcodes, there is a special alternative opcode for this. For
4050     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4051     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4052     but the special opcodes can optimize it a bit. The repeated item starts at
4053     tempcode, not at previous, which might be the first part of a string whose
4054     (former) last char we repeated.
4055 nigel 77
4056 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4057     an 'upto' may follow. We skip over an 'exact' item, and then test the
4058     length of what remains before proceeding. */
4059    
4060 nigel 77 if (possessive_quantifier)
4061     {
4062 nigel 93 int len;
4063     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4064     *tempcode == OP_NOTEXACT)
4065 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4066 ph10 286 ((*tempcode == OP_TYPEEXACT &&
4067     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4068 nigel 93 len = code - tempcode;
4069     if (len > 0) switch (*tempcode)
4070     {
4071     case OP_STAR: *tempcode = OP_POSSTAR; break;
4072     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4073     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4074     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4075    
4076     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4077     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4078     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4079     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4080    
4081     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4082     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4083     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4084     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4085    
4086     default:
4087     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4088     code += 1 + LINK_SIZE;
4089     len += 1 + LINK_SIZE;
4090     tempcode[0] = OP_ONCE;
4091     *code++ = OP_KET;