/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 323 - (hide annotations) (download)
Wed Mar 5 17:23:42 2008 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 197102 byte(s)
Remove a line of dead code, identified by coverity and reported by Nuno Lopes.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. It sets bit (b % 8) in
element (b / 8) of the array a. Both arguments are parenthesized so that an
expression such as "c + 1" can be passed safely. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 269 "digit expected after (?+";
306 nigel 77
307    
308     /* Table to identify digits and hex digits. This is used when compiling
309     patterns. Note that the tables in chartables are dependent on the locale, and
310     may mark arbitrary characters as digits - but the PCRE compiling code expects
311     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312     a private table here. It costs 256 bytes, but it is a lot faster than doing
313     character value tests (at least in some simple cases I timed), and in some
314     applications one wants PCRE to compile efficiently as well as match
315     efficiently.
316    
317     For convenience, we use the same bit definitions as in chartables:
318    
319     0x04 decimal digit
320     0x08 hexadecimal digit
321    
322     Then we can use ctype_digit and ctype_xdigit in the code. */
323    
324 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 nigel 77 static const unsigned char digitab[] =
326     {
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359    
360 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
361 nigel 77 static const unsigned char digitab[] =
362     {
363     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395    
396     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429     #endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502     in a table. A non-zero result is something that can be returned immediately.
503 nigel 77 Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
535     is an absolute backreference. If negative, it is a relative backreference.
536 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537     reference to a named group. This is part of Perl's movement towards a
538     unified syntax for back references. As this is synonymous with \k{name}, we
539 ph10 171 fudge it up by pretending it really was \k. */
540 nigel 93
541     case 'g':
542     if (ptr[1] == '{')
543     {
544 ph10 171 const uschar *p;
545     for (p = ptr+2; *p != 0 && *p != '}'; p++)
546     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 ph10 172 if (*p != 0 && *p != '}')
548 ph10 171 {
549     c = -ESC_k;
550     break;
551 ph10 172 }
552 nigel 93 braced = TRUE;
553     ptr++;
554     }
555     else braced = FALSE;
556    
557     if (ptr[1] == '-')
558     {
559     negated = TRUE;
560     ptr++;
561     }
562     else negated = FALSE;
563    
564     c = 0;
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567 ph10 220
568 ph10 213 if (c < 0)
569     {
570     *errorcodeptr = ERR61;
571     break;
572 ph10 220 }
573 nigel 93
574     if (c == 0 || (braced && *(++ptr) != '}'))
575     {
576     *errorcodeptr = ERR57;
577 ph10 213 break;
578 nigel 93 }
579    
580     if (negated)
581     {
582     if (c > bracount)
583     {
584     *errorcodeptr = ERR15;
585 ph10 213 break;
586 nigel 93 }
587     c = bracount - (c - 1);
588     }
589    
590     c = -(ESC_REF + c);
591     break;
592    
593 nigel 77 /* The handling of escape sequences consisting of a string of digits
594     starting with one that is not zero is not straightforward. By experiment,
595     the way Perl works seems to be as follows:
596    
597     Outside a character class, the digits are read as a decimal number. If the
598     number is less than 10, or if there are that many previous extracting
599     left brackets, then it is a back reference. Otherwise, up to three octal
600     digits are read to form an escaped byte. Thus \123 is likely to be octal
601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602     value is greater than 377, the least significant 8 bits are taken. Inside a
603     character class, \ followed by a digit is always an octal number. */
604    
605     case '1': case '2': case '3': case '4': case '5':
606     case '6': case '7': case '8': case '9':
607    
608     if (!isclass)
609     {
610     oldptr = ptr;
611     c -= '0';
612     while ((digitab[ptr[1]] & ctype_digit) != 0)
613     c = c * 10 + *(++ptr) - '0';
614 ph10 213 if (c < 0)
615     {
616     *errorcodeptr = ERR61;
617 ph10 220 break;
618     }
619 nigel 77 if (c < 10 || c <= bracount)
620     {
621     c = -(ESC_REF + c);
622     break;
623     }
624     ptr = oldptr; /* Put the pointer back and fall through */
625     }
626    
627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628     generates a binary zero byte and treats the digit as a following literal.
629     Thus we have to pull back the pointer by one. */
630    
631     if ((c = *ptr) >= '8')
632     {
633     ptr--;
634     c = 0;
635     break;
636     }
637    
638     /* \0 always starts an octal number, but we may drop through to here with a
639 nigel 91 larger first octal digit. The original code used just to take the least
640     significant 8 bits of octal numbers (I think this is what early Perls used
641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642     than 3 octal digits. */
643 nigel 77
644     case '0':
645     c -= '0';
646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647     c = c * 8 + *(++ptr) - '0';
648 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 nigel 77 break;
650    
651 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653     treated as a data character. */
654 nigel 77
655     case 'x':
656 nigel 87 if (ptr[1] == '{')
657 nigel 77 {
658     const uschar *pt = ptr + 2;
659 nigel 87 int count = 0;
660    
661 nigel 77 c = 0;
662     while ((digitab[*pt] & ctype_xdigit) != 0)
663     {
664 nigel 87 register int cc = *pt++;
665     if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 nigel 77 count++;
667 nigel 87
668 ph10 97 #ifndef EBCDIC /* ASCII coding */
669 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 ph10 97 #else /* EBCDIC coding */
672 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 nigel 77 #endif
675     }
676 nigel 87
677 nigel 77 if (*pt == '}')
678     {
679 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 nigel 77 ptr = pt;
681     break;
682     }
683 nigel 87
684 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
685     recognize this construct; fall through to the normal \x handling. */
686     }
687    
688 nigel 87 /* Read just a single-byte hex-defined char */
689 nigel 77
690     c = 0;
691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692     {
693     int cc; /* Some compilers don't like ++ */
694     cc = *(++ptr); /* in initializers */
695 ph10 97 #ifndef EBCDIC /* ASCII coding */
696 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 ph10 97 #else /* EBCDIC coding */
699 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
700     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701     #endif
702     }
703     break;
704    
705 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706     This coding is ASCII-specific, but then the whole concept of \cx is
707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708 nigel 77
709     case 'c':
710     c = *(++ptr);
711     if (c == 0)
712     {
713     *errorcodeptr = ERR2;
714 ph10 213 break;
715 nigel 77 }
716    
717 ph10 97 #ifndef EBCDIC /* ASCII coding */
718 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
719     c ^= 0x40;
720 ph10 97 #else /* EBCDIC coding */
721 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
722     c ^= 0xC0;
723     #endif
724     break;
725    
726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728     otherwise, for Perl compatibility, it is a literal. This code looks a bit
729     odd, but there used to be some cases other than the default, and there may
730     be again in future, so I haven't "optimized" it. */
731 nigel 77
732     default:
733     if ((options & PCRE_EXTRA) != 0) switch(c)
734     {
735     default:
736     *errorcodeptr = ERR3;
737     break;
738     }
739     break;
740     }
741     }
742    
743     *ptrptr = ptr;
744     return c;
745     }
746    
747    
748    
749     #ifdef SUPPORT_UCP
750     /*************************************************
751     * Handle \P and \p *
752     *************************************************/
753    
754     /* This function is called after \P or \p has been encountered, provided that
755     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756     pointing at the P or p. On exit, it is pointing at the final character of the
757     escape sequence.
758    
759     Argument:
760     ptrptr points to the pattern position pointer
761     negptr points to a boolean that is set TRUE for negation else FALSE
762 nigel 87 dptr points to an int that is set to the detailed property value
763 nigel 77 errorcodeptr points to the error code variable
764    
765 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
766 nigel 77 */
767    
768     static int
769 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
770 nigel 77 {
771     int c, i, bot, top;
772     const uschar *ptr = *ptrptr;
773 nigel 87 char name[32];
774 nigel 77
775     c = *(++ptr);
776     if (c == 0) goto ERROR_RETURN;
777    
778     *negptr = FALSE;
779    
780 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
781     negation. */
782 nigel 77
783     if (c == '{')
784     {
785     if (ptr[1] == '^')
786     {
787     *negptr = TRUE;
788     ptr++;
789     }
790 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
791 nigel 77 {
792     c = *(++ptr);
793     if (c == 0) goto ERROR_RETURN;
794     if (c == '}') break;
795     name[i] = c;
796     }
797 nigel 87 if (c !='}') goto ERROR_RETURN;
798 nigel 77 name[i] = 0;
799     }
800    
801     /* Otherwise there is just one following character */
802    
803     else
804     {
805     name[0] = c;
806     name[1] = 0;
807     }
808    
809     *ptrptr = ptr;
810    
811     /* Search for a recognized property name using binary chop */
812    
813     bot = 0;
814     top = _pcre_utt_size;
815    
816     while (bot < top)
817     {
818 nigel 87 i = (bot + top) >> 1;
819 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
820 nigel 87 if (c == 0)
821     {
822     *dptr = _pcre_utt[i].value;
823     return _pcre_utt[i].type;
824     }
825 nigel 77 if (c > 0) bot = i + 1; else top = i;
826     }
827    
828     *errorcodeptr = ERR47;
829     *ptrptr = ptr;
830     return -1;
831    
832     ERROR_RETURN:
833     *errorcodeptr = ERR46;
834     *ptrptr = ptr;
835     return -1;
836     }
837     #endif
838    
839    
840    
841    
842     /*************************************************
843     * Check for counted repeat *
844     *************************************************/
845    
846     /* This function is called when a '{' is encountered in a place where it might
847     start a quantifier. It looks ahead to see if it really is a quantifier or not.
848     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849     where the ddds are digits.
850    
851     Arguments:
852     p pointer to the first char after '{'
853    
854     Returns: TRUE or FALSE
855     */
856    
857     static BOOL
858     is_counted_repeat(const uschar *p)
859     {
860     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861     while ((digitab[*p] & ctype_digit) != 0) p++;
862     if (*p == '}') return TRUE;
863    
864     if (*p++ != ',') return FALSE;
865     if (*p == '}') return TRUE;
866    
867     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868     while ((digitab[*p] & ctype_digit) != 0) p++;
869    
870     return (*p == '}');
871     }
872    
873    
874    
875     /*************************************************
876     * Read repeat counts *
877     *************************************************/
878    
879     /* Read an item of the form {n,m} and return the values. This is called only
880     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881     so the syntax is guaranteed to be correct, but we need to check the values.
882    
883     Arguments:
884     p pointer to first char after '{'
885     minp pointer to int for min
886     maxp pointer to int for max
887     returned as -1 if no max
888     errorcodeptr points to error code variable
889    
890     Returns: pointer to '}' on success;
891     current ptr on error, with errorcodeptr set non-zero
892     */
893    
894     static const uschar *
895     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896     {
897     int min = 0;
898     int max = -1;
899    
900 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
901     an integer overflow. */
902    
903 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 nigel 81 if (min < 0 || min > 65535)
905     {
906     *errorcodeptr = ERR5;
907     return p;
908     }
909 nigel 77
910 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
911     Also, max must not be less than min. */
912    
913 nigel 77 if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 81 if (max < 0 || max > 65535)
920     {
921     *errorcodeptr = ERR5;
922     return p;
923     }
924 nigel 77 if (max < min)
925     {
926     *errorcodeptr = ERR4;
927     return p;
928     }
929     }
930     }
931    
932 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
933     '}'. */
934 nigel 77
935 nigel 81 *minp = min;
936     *maxp = max;
937 nigel 77 return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 93 * Find forward referenced subpattern *
944 nigel 91 *************************************************/
945    
946 nigel 93 /* This function scans along a pattern's text looking for capturing
947     subpatterns, and counting them. If it finds a named pattern that matches the
948     name it is given, it returns its number. Alternatively, if the name is NULL, it
949     returns when it reaches a given numbered subpattern. This is used for forward
950     references to subpatterns. We know that if (?P< is encountered, the name will
951     be terminated by '>' because that is checked in the first pass.
952 nigel 91
953     Arguments:
954 nigel 93 ptr current position in the pattern
955     count current count of capturing parens so far encountered
956     name name to seek, or NULL if seeking a numbered subpattern
957     lorn name length, or subpattern number if name is NULL
958     xmode TRUE if we are in /x mode
959 nigel 91
960     Returns: the number of the named subpattern, or -1 if not found
961     */
962    
963     static int
964 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965     BOOL xmode)
966 nigel 91 {
967     const uschar *thisname;
968 nigel 93
969 nigel 91 for (; *ptr != 0; ptr++)
970     {
971 nigel 93 int term;
972    
973     /* Skip over backslashed characters and also entire \Q...\E */
974    
975     if (*ptr == '\\')
976     {
977     if (*(++ptr) == 0) return -1;
978     if (*ptr == 'Q') for (;;)
979     {
980     while (*(++ptr) != 0 && *ptr != '\\');
981     if (*ptr == 0) return -1;
982     if (*(++ptr) == 'E') break;
983     }
984     continue;
985     }
986    
987     /* Skip over character classes */
988    
989     if (*ptr == '[')
990     {
991     while (*(++ptr) != ']')
992     {
993 ph10 220 if (*ptr == 0) return -1;
994 nigel 93 if (*ptr == '\\')
995     {
996     if (*(++ptr) == 0) return -1;
997     if (*ptr == 'Q') for (;;)
998     {
999     while (*(++ptr) != 0 && *ptr != '\\');
1000     if (*ptr == 0) return -1;
1001     if (*(++ptr) == 'E') break;
1002     }
1003     continue;
1004     }
1005     }
1006     continue;
1007     }
1008    
1009     /* Skip comments in /x mode */
1010    
1011     if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017    
1018     /* An opening parens must now be a real metacharacter */
1019    
1020 nigel 91 if (*ptr != '(') continue;
1021 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1022 nigel 93 {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027    
1028     ptr += 2;
1029     if (*ptr == 'P') ptr++; /* Allow optional P */
1030    
1031     /* We have to disambiguate (?<! and (?<= from (?<name> */
1032    
1033     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034     *ptr != '\'')
1035     continue;
1036    
1037 nigel 91 count++;
1038 nigel 93
1039     if (name == NULL && count == lorn) return count;
1040     term = *ptr++;
1041     if (term == '<') term = '>';
1042 nigel 91 thisname = ptr;
1043 nigel 93 while (*ptr != term) ptr++;
1044     if (name != NULL && lorn == ptr - thisname &&
1045     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 nigel 91 return count;
1047     }
1048 nigel 93
1049 nigel 91 return -1;
1050     }
1051    
1052    
1053    
1054     /*************************************************
1055 nigel 77 * Find first significant op code *
1056     *************************************************/
1057    
1058     /* This is called by several functions that scan a compiled expression looking
1059     for a fixed first character, or an anchoring op code etc. It skips over things
1060     that do not influence this. For some calls, a change of option is important.
1061     For some calls, it makes sense to skip negative forward and all backward
1062     assertions, and also the \b assertion; for others it does not.
1063    
1064     Arguments:
1065     code pointer to the start of the group
1066     options pointer to external options
1067     optbit the option bit whose changing is significant, or
1068     zero if none are
1069     skipassert TRUE if certain assertions are to be skipped
1070    
1071     Returns: pointer to the first significant opcode
1072     */
1073    
1074     static const uschar*
1075     first_significant_code(const uschar *code, int *options, int optbit,
1076     BOOL skipassert)
1077     {
1078     for (;;)
1079     {
1080     switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084     *options = (int)code[1];
1085     code += 2;
1086     break;
1087    
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095    
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100    
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103 nigel 93 case OP_RREF:
1104     case OP_DEF:
1105 nigel 77 code += _pcre_OP_lengths[*code];
1106     break;
1107    
1108     default:
1109     return code;
1110     }
1111     }
1112     /* Control never reaches here */
1113     }
1114    
1115    
1116    
1117    
1118     /*************************************************
1119     * Find the fixed length of a pattern *
1120     *************************************************/
1121    
1122     /* Scan a pattern and compute the fixed length of subject that will match it,
1123     if the length is fixed. This is needed for dealing with backward assertions.
1124     In UTF8 mode, the result is in characters rather than bytes.
1125    
1126     Arguments:
1127     code points to the start of the pattern (the bracket)
1128     options the compiling options
1129    
1130     Returns: the fixed length, or -1 if there is no fixed length,
1131     or -2 if \C was encountered
1132     */
1133    
1134     static int
1135     find_fixedlength(uschar *code, int options)
1136     {
1137     int length = -1;
1138    
1139     register int branchlength = 0;
1140     register uschar *cc = code + 1 + LINK_SIZE;
1141    
1142     /* Scan along the opcodes for this branch. If we get to the end of the
1143     branch, check the length against that of the other branches. */
1144    
1145     for (;;)
1146     {
1147     int d;
1148     register int op = *cc;
1149     switch (op)
1150     {
1151 nigel 93 case OP_CBRA:
1152 nigel 77 case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 nigel 77 if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161    
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165    
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172     else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177    
1178     /* Skip over assertive subpatterns */
1179    
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186    
1187     /* Skip over things that don't match chars */
1188    
1189     case OP_REVERSE:
1190     case OP_CREF:
1191 nigel 93 case OP_RREF:
1192     case OP_DEF:
1193 nigel 77 case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205    
1206     /* Handle literal characters */
1207    
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210 nigel 91 case OP_NOT:
1211 nigel 77 branchlength++;
1212     cc += 2;
1213     #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215     {
1216     while ((*cc & 0xc0) == 0x80) cc++;
1217     }
1218     #endif
1219     break;
1220    
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode. */
1223    
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227     #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229     {
1230     while((*cc & 0x80) == 0x80) cc++;
1231     }
1232     #endif
1233     break;
1234    
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 nigel 77 cc += 4;
1239     break;
1240    
1241     /* Handle single-char matchers */
1242    
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245 nigel 87 cc += 2;
1246 nigel 77 /* Fall through */
1247    
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258    
1259     /* The single-byte matcher isn't allowed */
1260    
1261     case OP_ANYBYTE:
1262     return -2;
1263    
1264     /* Check a class for variable quantification */
1265    
1266     #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270     #endif
1271    
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275    
1276     switch (*cc)
1277     {
1278     case OP_CRSTAR:
1279     case OP_CRMINSTAR:
1280     case OP_CRQUERY:
1281     case OP_CRMINQUERY:
1282     return -1;
1283    
1284     case OP_CRRANGE:
1285     case OP_CRMINRANGE:
1286     if (GET2(cc,1) != GET2(cc,3)) return -1;
1287     branchlength += GET2(cc,1);
1288     cc += 5;
1289     break;
1290    
1291     default:
1292     branchlength++;
1293     }
1294     break;
1295    
1296     /* Anything else is variable length */
1297    
1298     default:
1299     return -1;
1300     }
1301     }
1302     /* Control never gets here */
1303     }
1304    
1305    
1306    
1307    
1308     /*************************************************
1309     * Scan compiled regex for numbered bracket *
1310     *************************************************/
1311    
1312     /* This little function scans through a compiled pattern until it finds a
1313     capturing bracket with the given number.
1314    
1315     Arguments:
1316     code points to start of expression
1317     utf8 TRUE in UTF-8 mode
1318     number the required bracket number
1319    
1320     Returns: pointer to the opcode for the bracket, or NULL if not found
1321     */
1322    
1323     static const uschar *
1324     find_bracket(const uschar *code, BOOL utf8, int number)
1325     {
1326     for (;;)
1327     {
1328     register int c = *code;
1329     if (c == OP_END) return NULL;
1330 nigel 91
1331     /* XCLASS is used for classes that cannot be represented just by a bit
1332     map. This includes negated single high-valued characters. The length in
1333     the table is zero; the actual length is stored in the compiled code. */
1334    
1335     if (c == OP_XCLASS) code += GET(code, 1);
1336    
1337 nigel 93 /* Handle capturing bracket */
1338 nigel 91
1339 nigel 93 else if (c == OP_CBRA)
1340 nigel 77 {
1341 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1342 nigel 77 if (n == number) return (uschar *)code;
1343 nigel 93 code += _pcre_OP_lengths[c];
1344 nigel 77 }
1345 nigel 91
1346 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1347     repeated character types, we have to test for \p and \P, which have an extra
1348 ph10 218 two bytes of parameters. */
1349 nigel 91
1350 nigel 77 else
1351     {
1352 ph10 218 switch(c)
1353     {
1354     case OP_TYPESTAR:
1355     case OP_TYPEMINSTAR:
1356     case OP_TYPEPLUS:
1357     case OP_TYPEMINPLUS:
1358     case OP_TYPEQUERY:
1359     case OP_TYPEMINQUERY:
1360     case OP_TYPEPOSSTAR:
1361     case OP_TYPEPOSPLUS:
1362     case OP_TYPEPOSQUERY:
1363     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 ph10 220 break;
1365 ph10 221
1366     case OP_TYPEUPTO:
1367     case OP_TYPEMINUPTO:
1368     case OP_TYPEEXACT:
1369     case OP_TYPEPOSUPTO:
1370     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371     break;
1372 ph10 220 }
1373    
1374 ph10 218 /* Add in the fixed length from the table */
1375 ph10 220
1376 nigel 77 code += _pcre_OP_lengths[c];
1377 ph10 220
1378 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379     a multi-byte character. The length in the table is a minimum, so we have to
1380     arrange to skip the extra bytes. */
1381 ph10 220
1382 ph10 107 #ifdef SUPPORT_UTF8
1383 nigel 77 if (utf8) switch(c)
1384     {
1385     case OP_CHAR:
1386     case OP_CHARNC:
1387     case OP_EXACT:
1388     case OP_UPTO:
1389     case OP_MINUPTO:
1390 nigel 93 case OP_POSUPTO:
1391 nigel 77 case OP_STAR:
1392     case OP_MINSTAR:
1393 nigel 93 case OP_POSSTAR:
1394 nigel 77 case OP_PLUS:
1395     case OP_MINPLUS:
1396 nigel 93 case OP_POSPLUS:
1397 nigel 77 case OP_QUERY:
1398     case OP_MINQUERY:
1399 nigel 93 case OP_POSQUERY:
1400     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 nigel 77 break;
1402     }
1403 ph10 111 #endif
1404 nigel 77 }
1405     }
1406     }
1407    
1408    
1409    
1410     /*************************************************
1411     * Scan compiled regex for recursion reference *
1412     *************************************************/
1413    
1414     /* This little function scans through a compiled pattern until it finds an
1415     instance of OP_RECURSE.
1416    
1417     Arguments:
1418     code points to start of expression
1419     utf8 TRUE in UTF-8 mode
1420    
1421     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_recurse(const uschar *code, BOOL utf8)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91 if (c == OP_RECURSE) return code;
1432 ph10 220
1433 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1434     map. This includes negated single high-valued characters. The length in
1435     the table is zero; the actual length is stored in the compiled code. */
1436    
1437     if (c == OP_XCLASS) code += GET(code, 1);
1438    
1439 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1440     repeated character types, we have to test for \p and \P, which have an extra
1441 ph10 218 two bytes of parameters. */
1442 nigel 91
1443 nigel 77 else
1444     {
1445 ph10 218 switch(c)
1446     {
1447     case OP_TYPESTAR:
1448     case OP_TYPEMINSTAR:
1449     case OP_TYPEPLUS:
1450     case OP_TYPEMINPLUS:
1451     case OP_TYPEQUERY:
1452     case OP_TYPEMINQUERY:
1453     case OP_TYPEPOSSTAR:
1454     case OP_TYPEPOSPLUS:
1455     case OP_TYPEPOSQUERY:
1456     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 ph10 220 break;
1458 ph10 221
1459     case OP_TYPEPOSUPTO:
1460     case OP_TYPEUPTO:
1461     case OP_TYPEMINUPTO:
1462     case OP_TYPEEXACT:
1463     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464     break;
1465 ph10 220 }
1466    
1467 ph10 218 /* Add in the fixed length from the table */
1468    
1469 nigel 77 code += _pcre_OP_lengths[c];
1470 ph10 220
1471 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474 ph10 220
1475 ph10 107 #ifdef SUPPORT_UTF8
1476 nigel 77 if (utf8) switch(c)
1477     {
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_EXACT:
1481     case OP_UPTO:
1482     case OP_MINUPTO:
1483 nigel 93 case OP_POSUPTO:
1484 nigel 77 case OP_STAR:
1485     case OP_MINSTAR:
1486 nigel 93 case OP_POSSTAR:
1487 nigel 77 case OP_PLUS:
1488     case OP_MINPLUS:
1489 nigel 93 case OP_POSPLUS:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 nigel 77 break;
1495     }
1496 ph10 111 #endif
1497 nigel 77 }
1498     }
1499     }
1500    
1501    
1502    
1503     /*************************************************
1504     * Scan compiled branch for non-emptiness *
1505     *************************************************/
1506    
1507     /* This function scans through a branch of a compiled pattern to see whether it
1508 nigel 93 can match the empty string or not. It is called from could_be_empty()
1509     below and from compile_branch() when checking for an unlimited repeat of a
1510     group that can match nothing. Note that first_significant_code() skips over
1511 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1512     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513     bracket whose current branch will already have been scanned.
1514 nigel 77
1515     Arguments:
1516     code points to start of search
1517     endcode points to where to stop
1518     utf8 TRUE if in UTF8 mode
1519    
1520     Returns: TRUE if what is matched could be empty
1521     */
1522    
1523     static BOOL
1524     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525     {
1526     register int c;
1527 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 nigel 77 code < endcode;
1529     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530     {
1531     const uschar *ccode;
1532    
1533     c = *code;
1534 ph10 286
1535     /* Skip over forward assertions; the other assertions are skipped by
1536 ph10 282 first_significant_code() with a TRUE final argument. */
1537 ph10 286
1538 ph10 282 if (c == OP_ASSERT)
1539 ph10 286 {
1540 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1541     c = *code;
1542     continue;
1543 ph10 286 }
1544 ph10 172
1545 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1546 nigel 77
1547 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548     {
1549 ph10 172 code += _pcre_OP_lengths[c];
1550 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1551     c = *code;
1552     continue;
1553     }
1554    
1555     /* For other groups, scan the branches. */
1556 ph10 172
1557 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 nigel 77 {
1559     BOOL empty_branch;
1560     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561    
1562     /* Scan a closed bracket */
1563    
1564     empty_branch = FALSE;
1565     do
1566     {
1567     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568     empty_branch = TRUE;
1569     code += GET(code, 1);
1570     }
1571     while (*code == OP_ALT);
1572     if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 ph10 172 c = *code;
1574 nigel 93 continue;
1575 nigel 77 }
1576    
1577 nigel 93 /* Handle the other opcodes */
1578    
1579     switch (c)
1580 nigel 77 {
1581 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1582     cannot be represented just by a bit map. This includes negated single
1583     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 ph10 220 actual length is stored in the compiled code, so we must update "code"
1585 ph10 216 here. */
1586 nigel 77
1587     #ifdef SUPPORT_UTF8
1588     case OP_XCLASS:
1589 ph10 216 ccode = code += GET(code, 1);
1590 nigel 77 goto CHECK_CLASS_REPEAT;
1591     #endif
1592    
1593     case OP_CLASS:
1594     case OP_NCLASS:
1595     ccode = code + 33;
1596    
1597     #ifdef SUPPORT_UTF8
1598     CHECK_CLASS_REPEAT:
1599     #endif
1600    
1601     switch (*ccode)
1602     {
1603     case OP_CRSTAR: /* These could be empty; continue */
1604     case OP_CRMINSTAR:
1605     case OP_CRQUERY:
1606     case OP_CRMINQUERY:
1607     break;
1608    
1609     default: /* Non-repeat => class must match */
1610     case OP_CRPLUS: /* These repeats aren't empty */
1611     case OP_CRMINPLUS:
1612     return FALSE;
1613    
1614     case OP_CRRANGE:
1615     case OP_CRMINRANGE:
1616     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617     break;
1618     }
1619     break;
1620    
1621     /* Opcodes that must match a character */
1622    
1623     case OP_PROP:
1624     case OP_NOTPROP:
1625     case OP_EXTUNI:
1626     case OP_NOT_DIGIT:
1627     case OP_DIGIT:
1628     case OP_NOT_WHITESPACE:
1629     case OP_WHITESPACE:
1630     case OP_NOT_WORDCHAR:
1631     case OP_WORDCHAR:
1632     case OP_ANY:
1633     case OP_ANYBYTE:
1634     case OP_CHAR:
1635     case OP_CHARNC:
1636     case OP_NOT:
1637     case OP_PLUS:
1638     case OP_MINPLUS:
1639 nigel 93 case OP_POSPLUS:
1640 nigel 77 case OP_EXACT:
1641     case OP_NOTPLUS:
1642     case OP_NOTMINPLUS:
1643 nigel 93 case OP_NOTPOSPLUS:
1644 nigel 77 case OP_NOTEXACT:
1645     case OP_TYPEPLUS:
1646     case OP_TYPEMINPLUS:
1647 nigel 93 case OP_TYPEPOSPLUS:
1648 nigel 77 case OP_TYPEEXACT:
1649     return FALSE;
1650 ph10 227
1651     /* These are going to continue, as they may be empty, but we have to
1652     fudge the length for the \p and \P cases. */
1653    
1654 ph10 224 case OP_TYPESTAR:
1655     case OP_TYPEMINSTAR:
1656     case OP_TYPEPOSSTAR:
1657     case OP_TYPEQUERY:
1658     case OP_TYPEMINQUERY:
1659     case OP_TYPEPOSQUERY:
1660     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 ph10 227 break;
1662    
1663 ph10 224 /* Same for these */
1664 ph10 227
1665 ph10 224 case OP_TYPEUPTO:
1666     case OP_TYPEMINUPTO:
1667     case OP_TYPEPOSUPTO:
1668     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669     break;
1670 nigel 77
1671     /* End of branch */
1672    
1673     case OP_KET:
1674     case OP_KETRMAX:
1675     case OP_KETRMIN:
1676     case OP_ALT:
1677     return TRUE;
1678    
1679 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680     MINUPTO, and POSUPTO may be followed by a multibyte character */
1681 nigel 77
1682     #ifdef SUPPORT_UTF8
1683     case OP_STAR:
1684     case OP_MINSTAR:
1685 nigel 93 case OP_POSSTAR:
1686 nigel 77 case OP_QUERY:
1687     case OP_MINQUERY:
1688 nigel 93 case OP_POSQUERY:
1689 nigel 77 case OP_UPTO:
1690     case OP_MINUPTO:
1691 nigel 93 case OP_POSUPTO:
1692 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693     break;
1694     #endif
1695     }
1696     }
1697    
1698     return TRUE;
1699     }
1700    
1701    
1702    
1703     /*************************************************
1704     * Scan compiled regex for non-emptiness *
1705     *************************************************/
1706    
1707     /* This function is called to check for left recursive calls. We want to check
1708     the current branch of the current pattern to see if it could match the empty
1709     string. If it could, we must look outwards for branches at other levels,
1710     stopping when we pass beyond the bracket which is the subject of the recursion.
1711    
1712     Arguments:
1713     code points to start of the recursion
1714     endcode points to where to stop (current RECURSE item)
1715     bcptr points to the chain of current (unclosed) branch starts
1716     utf8 TRUE if in UTF-8 mode
1717    
1718     Returns: TRUE if what is matched could be empty
1719     */
1720    
1721     static BOOL
1722     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723     BOOL utf8)
1724     {
1725     while (bcptr != NULL && bcptr->current >= code)
1726     {
1727     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728     bcptr = bcptr->outer;
1729     }
1730     return TRUE;
1731     }
1732    
1733    
1734    
1735     /*************************************************
1736     * Check for POSIX class syntax *
1737     *************************************************/
1738    
1739     /* This function is called when the sequence "[:" or "[." or "[=" is
1740 ph10 295 encountered in a character class. It checks whether this is followed by a
1741 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1742 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1743 nigel 77
1744 ph10 298 Originally, this function only recognized a sequence of letters between the
1745     terminators, but it seems that Perl recognizes any sequence of characters,
1746     though of course unknown POSIX names are subsequently rejected. Perl gives an
1747     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1748     didn't consider this to be a POSIX class. Likewise for [:1234:].
1749 ph10 295
1750 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1751     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1752     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1753     below handles the special case of \], but does not try to do any other escape
1754     processing. This makes it different from Perl for cases such as [:l\ower:]
1755 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1756 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1757 ph10 295 I think.
1758    
1759     Arguments:
1760 nigel 77 ptr pointer to the initial [
1761     endptr where to return the end pointer
1762    
1763     Returns: TRUE or FALSE
1764     */
1765    
1766     static BOOL
1767 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1768 nigel 77 {
1769     int terminator; /* Don't combine these lines; the Solaris cc */
1770     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1771 ph10 295 for (++ptr; *ptr != 0; ptr++)
1772 nigel 77 {
1773 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1774 ph10 298 {
1775     if (*ptr == ']') return FALSE;
1776 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1777     {
1778     *endptr = ptr;
1779     return TRUE;
1780 ph10 298 }
1781     }
1782     }
1783 nigel 77 return FALSE;
1784     }
1785    
1786    
1787    
1788    
1789     /*************************************************
1790     * Check POSIX class name *
1791     *************************************************/
1792    
1793     /* This function is called to check the name given in a POSIX-style class entry
1794     such as [:alnum:].
1795    
1796     Arguments:
1797     ptr points to the first letter
1798     len the length of the name
1799    
1800     Returns: a value representing the name, or -1 if unknown
1801     */
1802    
1803     static int
1804     check_posix_name(const uschar *ptr, int len)
1805     {
1806 ph10 240 const char *pn = posix_names;
1807 nigel 77 register int yield = 0;
1808     while (posix_name_lengths[yield] != 0)
1809     {
1810     if (len == posix_name_lengths[yield] &&
1811 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1812 ph10 243 pn += posix_name_lengths[yield] + 1;
1813 nigel 77 yield++;
1814     }
1815     return -1;
1816     }
1817    
1818    
1819     /*************************************************
1820     * Adjust OP_RECURSE items in repeated group *
1821     *************************************************/
1822    
1823     /* OP_RECURSE items contain an offset from the start of the regex to the group
1824     that is referenced. This means that groups can be replicated for fixed
1825     repetition simply by copying (because the recursion is allowed to refer to
1826     earlier groups that are outside the current group). However, when a group is
1827     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1828     it, after it has been compiled. This means that any OP_RECURSE items within it
1829     that refer to the group itself or any contained groups have to have their
1830 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1831     the partially compiled regex must be temporarily terminated with OP_END.
1832 nigel 77
1833 nigel 93 This function has been extended with the possibility of forward references for
1834     recursions and subroutine calls. It must also check the list of such references
1835     for the group we are dealing with. If it finds that one of the recursions in
1836     the current group is on this list, it adjusts the offset in the list, not the
1837     value in the reference (which is a group number).
1838    
1839 nigel 77 Arguments:
1840     group points to the start of the group
1841     adjust the amount by which the group is to be moved
1842     utf8 TRUE in UTF-8 mode
1843     cd contains pointers to tables etc.
1844 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1845 nigel 77
1846     Returns: nothing
1847     */
1848    
1849     static void
1850 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1851     uschar *save_hwm)
1852 nigel 77 {
1853     uschar *ptr = group;
1854 ph10 224
1855 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1856     {
1857 nigel 93 int offset;
1858     uschar *hc;
1859    
1860     /* See if this recursion is on the forward reference list. If so, adjust the
1861     reference. */
1862    
1863     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1864     {
1865     offset = GET(hc, 0);
1866     if (cd->start_code + offset == ptr + 1)
1867     {
1868     PUT(hc, 0, offset + adjust);
1869     break;
1870     }
1871     }
1872    
1873     /* Otherwise, adjust the recursion offset if it's after the start of this
1874     group. */
1875    
1876     if (hc >= cd->hwm)
1877     {
1878     offset = GET(ptr, 1);
1879     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1880     }
1881    
1882 nigel 77 ptr += 1 + LINK_SIZE;
1883     }
1884     }
1885    
1886    
1887    
1888     /*************************************************
1889     * Insert an automatic callout point *
1890     *************************************************/
1891    
1892     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1893     callout points before each pattern item.
1894    
1895     Arguments:
1896     code current code pointer
1897     ptr current pattern pointer
1898     cd pointers to tables etc
1899    
1900     Returns: new code pointer
1901     */
1902    
1903     static uschar *
1904     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1905     {
1906     *code++ = OP_CALLOUT;
1907     *code++ = 255;
1908     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1909     PUT(code, LINK_SIZE, 0); /* Default length */
1910     return code + 2*LINK_SIZE;
1911     }
1912    
1913    
1914    
1915     /*************************************************
1916     * Complete a callout item *
1917     *************************************************/
1918    
1919     /* A callout item contains the length of the next item in the pattern, which
1920     we can't fill in till after we have reached the relevant point. This is used
1921     for both automatic and manual callouts.
1922    
1923     Arguments:
1924     previous_callout points to previous callout item
1925     ptr current pattern pointer
1926     cd pointers to tables etc
1927    
1928     Returns: nothing
1929     */
1930    
1931     static void
1932     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1933     {
1934     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1935     PUT(previous_callout, 2 + LINK_SIZE, length);
1936     }
1937    
1938    
1939    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
this function scans upwards from the start for the first character that has
an "other" case, and then extends over the following characters for as long as
their other-case values continue to increase by exactly one. That run of
other-case values is handed back as a range. Each call yields the next such
range and moves the start value on, so the caller can loop until FALSE.

Arguments:
  cptr        points to starting character value; updated to the first
                character not covered by the returned range (left untouched
                when FALSE is returned)
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch, first_other, expected;

/* Find the first character in [*cptr, d] that has an other case. */

ch = *cptr;
for (;;)
  {
  if (ch > d) return FALSE;
  first_other = _pcre_ucp_othercase(ch);
  if (first_other != NOTACHAR) break;
  ch++;
  }

*ocptr = first_other;

/* Extend while successive characters map to successive other-case values. */

expected = first_other + 1;
ch++;
while (ch <= d && _pcre_ucp_othercase(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;
return TRUE;
}
#endif  /* SUPPORT_UCP */
1985    
1986    
1987 nigel 93
1988 nigel 77 /*************************************************
1989 nigel 93 * Check if auto-possessifying is possible *
1990     *************************************************/
1991    
1992     /* This function is called for unlimited repeats of certain items, to see
1993     whether the next thing could possibly match the repeated item. If not, it makes
1994     sense to automatically possessify the repeated item.
1995    
1996     Arguments:
1997     op_code the repeated op code
1998     this data for this item, depends on the opcode
1999     utf8 TRUE in UTF-8 mode
2000     utf8_char used for utf8 character bytes, NULL if not relevant
2001     ptr next character in pattern
2002     options options bits
2003     cd contains pointers to tables etc.
2004    
2005     Returns: TRUE if possessifying is wanted
2006     */
2007    
2008     static BOOL
2009     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2010     const uschar *ptr, int options, compile_data *cd)
2011     {
2012     int next;
2013    
2014     /* Skip whitespace and comments in extended mode */
2015    
2016     if ((options & PCRE_EXTENDED) != 0)
2017     {
2018     for (;;)
2019     {
2020     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2021     if (*ptr == '#')
2022     {
2023     while (*(++ptr) != 0)
2024     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2025     }
2026     else break;
2027     }
2028     }
2029    
2030     /* If the next item is one that we can handle, get its value. A non-negative
2031     value is a character, a negative value is an escape value. */
2032    
2033     if (*ptr == '\\')
2034     {
2035     int temperrorcode = 0;
2036     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2037     if (temperrorcode != 0) return FALSE;
2038     ptr++; /* Point after the escape sequence */
2039     }
2040    
2041     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2042     {
2043     #ifdef SUPPORT_UTF8
2044     if (utf8) { GETCHARINC(next, ptr); } else
2045     #endif
2046     next = *ptr++;
2047     }
2048    
2049     else return FALSE;
2050    
2051     /* Skip whitespace and comments in extended mode */
2052    
2053     if ((options & PCRE_EXTENDED) != 0)
2054     {
2055     for (;;)
2056     {
2057     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2058     if (*ptr == '#')
2059     {
2060     while (*(++ptr) != 0)
2061     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2062     }
2063     else break;
2064     }
2065     }
2066    
2067     /* If the next thing is itself optional, we have to give up. */
2068    
2069     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2070     return FALSE;
2071    
2072     /* Now compare the next item with the previous opcode. If the previous is a
2073     positive single character match, "item" either contains the character or, if
2074     "item" is greater than 127 in utf8 mode, the character's bytes are in
2075     utf8_char. */
2076    
2077    
2078     /* Handle cases when the next item is a character. */
2079    
2080     if (next >= 0) switch(op_code)
2081     {
2082     case OP_CHAR:
2083     #ifdef SUPPORT_UTF8
2084     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2085     #endif
2086     return item != next;
2087    
2088     /* For CHARNC (caseless character) we must check the other case. If we have
2089     Unicode property support, we can use it to test the other case of
2090     high-valued characters. */
2091    
2092     case OP_CHARNC:
2093     #ifdef SUPPORT_UTF8
2094     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2095     #endif
2096     if (item == next) return FALSE;
2097     #ifdef SUPPORT_UTF8
2098     if (utf8)
2099     {
2100     unsigned int othercase;
2101     if (next < 128) othercase = cd->fcc[next]; else
2102     #ifdef SUPPORT_UCP
2103     othercase = _pcre_ucp_othercase((unsigned int)next);
2104     #else
2105     othercase = NOTACHAR;
2106     #endif
2107     return (unsigned int)item != othercase;
2108     }
2109     else
2110     #endif /* SUPPORT_UTF8 */
2111     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2112    
2113     /* For OP_NOT, "item" must be a single-byte character. */
2114    
2115     case OP_NOT:
2116     if (item == next) return TRUE;
2117     if ((options & PCRE_CASELESS) == 0) return FALSE;
2118     #ifdef SUPPORT_UTF8
2119     if (utf8)
2120     {
2121     unsigned int othercase;
2122     if (next < 128) othercase = cd->fcc[next]; else
2123     #ifdef SUPPORT_UCP
2124     othercase = _pcre_ucp_othercase(next);
2125     #else
2126     othercase = NOTACHAR;
2127     #endif
2128     return (unsigned int)item == othercase;
2129     }
2130     else
2131     #endif /* SUPPORT_UTF8 */
2132     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2133    
2134     case OP_DIGIT:
2135     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2136    
2137     case OP_NOT_DIGIT:
2138     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2139    
2140     case OP_WHITESPACE:
2141     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2142    
2143     case OP_NOT_WHITESPACE:
2144     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2145    
2146     case OP_WORDCHAR:
2147     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2148    
2149     case OP_NOT_WORDCHAR:
2150     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2151    
2152 ph10 180 case OP_HSPACE:
2153     case OP_NOT_HSPACE:
2154     switch(next)
2155     {
2156     case 0x09:
2157     case 0x20:
2158     case 0xa0:
2159     case 0x1680:
2160     case 0x180e:
2161     case 0x2000:
2162     case 0x2001:
2163     case 0x2002:
2164     case 0x2003:
2165     case 0x2004:
2166     case 0x2005:
2167     case 0x2006:
2168     case 0x2007:
2169     case 0x2008:
2170     case 0x2009:
2171     case 0x200A:
2172     case 0x202f:
2173     case 0x205f:
2174     case 0x3000:
2175     return op_code != OP_HSPACE;
2176     default:
2177     return op_code == OP_HSPACE;
2178     }
2179    
2180     case OP_VSPACE:
2181     case OP_NOT_VSPACE:
2182     switch(next)
2183     {
2184     case 0x0a:
2185     case 0x0b:
2186     case 0x0c:
2187     case 0x0d:
2188     case 0x85:
2189     case 0x2028:
2190     case 0x2029:
2191     return op_code != OP_VSPACE;
2192     default:
2193     return op_code == OP_VSPACE;
2194     }
2195    
2196 nigel 93 default:
2197     return FALSE;
2198     }
2199    
2200    
2201     /* Handle the case when the next item is \d, \s, etc. */
2202    
2203     switch(op_code)
2204     {
2205     case OP_CHAR:
2206     case OP_CHARNC:
2207     #ifdef SUPPORT_UTF8
2208     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2209     #endif
2210     switch(-next)
2211     {
2212     case ESC_d:
2213     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2214    
2215     case ESC_D:
2216     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2217    
2218     case ESC_s:
2219     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2220    
2221     case ESC_S:
2222     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2223    
2224     case ESC_w:
2225     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2226    
2227     case ESC_W:
2228     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2229 ph10 182
2230 ph10 180 case ESC_h:
2231     case ESC_H:
2232     switch(item)
2233     {
2234     case 0x09:
2235     case 0x20:
2236     case 0xa0:
2237     case 0x1680:
2238     case 0x180e:
2239     case 0x2000:
2240     case 0x2001:
2241     case 0x2002:
2242     case 0x2003:
2243     case 0x2004:
2244     case 0x2005:
2245     case 0x2006:
2246     case 0x2007:
2247     case 0x2008:
2248     case 0x2009:
2249     case 0x200A:
2250     case 0x202f:
2251     case 0x205f:
2252     case 0x3000:
2253     return -next != ESC_h;
2254     default:
2255     return -next == ESC_h;
2256 ph10 182 }
2257    
2258 ph10 180 case ESC_v:
2259     case ESC_V:
2260     switch(item)
2261     {
2262     case 0x0a:
2263     case 0x0b:
2264     case 0x0c:
2265     case 0x0d:
2266     case 0x85:
2267     case 0x2028:
2268     case 0x2029:
2269     return -next != ESC_v;
2270     default:
2271     return -next == ESC_v;
2272 ph10 182 }
2273 nigel 93
2274     default:
2275     return FALSE;
2276     }
2277    
2278     case OP_DIGIT:
2279 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2280     next == -ESC_h || next == -ESC_v;
2281 nigel 93
2282     case OP_NOT_DIGIT:
2283     return next == -ESC_d;
2284    
2285     case OP_WHITESPACE:
2286     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2287    
2288     case OP_NOT_WHITESPACE:
2289 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2290 nigel 93
2291 ph10 180 case OP_HSPACE:
2292     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2293    
2294     case OP_NOT_HSPACE:
2295     return next == -ESC_h;
2296 ph10 182
2297 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2298 ph10 182 case OP_VSPACE:
2299 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2300    
2301     case OP_NOT_VSPACE:
2302 ph10 182 return next == -ESC_v;
2303 ph10 180
2304 nigel 93 case OP_WORDCHAR:
2305 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2306 nigel 93
2307     case OP_NOT_WORDCHAR:
2308     return next == -ESC_w || next == -ESC_d;
2309 ph10 182
2310 nigel 93 default:
2311     return FALSE;
2312     }
2313    
2314     /* Control does not reach here */
2315     }
2316    
2317    
2318    
2319     /*************************************************
2320 nigel 77 * Compile one branch *
2321     *************************************************/
2322    
2323 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2324 nigel 77 changed during the branch, the pointer is used to change the external options
2325 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2326     to find out the amount of memory needed, as well as during the real compile
2327     phase. The value of lengthptr distinguishes the two phases.
2328 nigel 77
2329     Arguments:
2330     optionsptr pointer to the option bits
2331     codeptr points to the pointer to the current code point
2332     ptrptr points to the current pattern pointer
2333     errorcodeptr points to error code variable
2334     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2335     reqbyteptr set to the last literal character required, else < 0
2336     bcptr points to current branch chain
2337     cd contains pointers to tables etc.
2338 nigel 93 lengthptr NULL during the real compile phase
2339     points to length accumulator during pre-compile phase
2340 nigel 77
2341     Returns: TRUE on success
2342     FALSE, with *errorcodeptr set non-zero on error
2343     */
2344    
2345     static BOOL
2346 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2347     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2348     compile_data *cd, int *lengthptr)
2349 nigel 77 {
2350     int repeat_type, op_type;
2351     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2352     int bravalue = 0;
2353     int greedy_default, greedy_non_default;
2354     int firstbyte, reqbyte;
2355     int zeroreqbyte, zerofirstbyte;
2356     int req_caseopt, reqvary, tempreqvary;
2357     int options = *optionsptr;
2358     int after_manual_callout = 0;
2359 nigel 93 int length_prevgroup = 0;
2360 nigel 77 register int c;
2361     register uschar *code = *codeptr;
2362 nigel 93 uschar *last_code = code;
2363     uschar *orig_code = code;
2364 nigel 77 uschar *tempcode;
2365     BOOL inescq = FALSE;
2366     BOOL groupsetfirstbyte = FALSE;
2367     const uschar *ptr = *ptrptr;
2368     const uschar *tempptr;
2369     uschar *previous = NULL;
2370     uschar *previous_callout = NULL;
2371 nigel 93 uschar *save_hwm = NULL;
2372 nigel 77 uschar classbits[32];
2373    
2374     #ifdef SUPPORT_UTF8
2375     BOOL class_utf8;
2376     BOOL utf8 = (options & PCRE_UTF8) != 0;
2377     uschar *class_utf8data;
2378 ph10 300 uschar *class_utf8data_base;
2379 nigel 77 uschar utf8_char[6];
2380     #else
2381     BOOL utf8 = FALSE;
2382 nigel 93 uschar *utf8_char = NULL;
2383 nigel 77 #endif
2384    
2385 nigel 93 #ifdef DEBUG
2386     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2387     #endif
2388    
2389 nigel 77 /* Set up the default and non-default settings for greediness */
2390    
2391     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2392     greedy_non_default = greedy_default ^ 1;
2393    
2394     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2395     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2396     matches a non-fixed char first char; reqbyte just remains unset if we never
2397     find one.
2398    
2399     When we hit a repeat whose minimum is zero, we may have to adjust these values
2400     to take the zero repeat into account. This is implemented by setting them to
2401     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2402     item types that can be repeated set these backoff variables appropriately. */
2403    
2404     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2405    
2406     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2407     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2408     value > 255. It is added into the firstbyte or reqbyte variables to record the
2409     case status of the value. This is used only for ASCII characters. */
2410    
2411     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2412    
2413     /* Switch on next character until the end of the branch */
2414    
2415     for (;; ptr++)
2416     {
2417     BOOL negate_class;
2418 ph10 286 BOOL should_flip_negation;
2419 nigel 77 BOOL possessive_quantifier;
2420     BOOL is_quantifier;
2421 nigel 93 BOOL is_recurse;
2422 ph10 180 BOOL reset_bracount;
2423 nigel 77 int class_charcount;
2424     int class_lastchar;
2425     int newoptions;
2426     int recno;
2427 ph10 172 int refsign;
2428 nigel 77 int skipbytes;
2429     int subreqbyte;
2430     int subfirstbyte;
2431 nigel 93 int terminator;
2432 nigel 77 int mclength;
2433     uschar mcbuffer[8];
2434    
2435 nigel 93 /* Get next byte in the pattern */
2436 nigel 77
2437     c = *ptr;
2438    
2439 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2440     previous cycle of this loop. */
2441    
2442     if (lengthptr != NULL)
2443     {
2444     #ifdef DEBUG
2445     if (code > cd->hwm) cd->hwm = code; /* High water info */
2446     #endif
2447     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2448     {
2449     *errorcodeptr = ERR52;
2450     goto FAILED;
2451     }
2452    
2453     /* There is at least one situation where code goes backwards: this is the
2454     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2455     the class is simply eliminated. However, it is created first, so we have to
2456     allow memory for it. Therefore, don't ever reduce the length at this point.
2457     */
2458    
2459     if (code < last_code) code = last_code;
2460 ph10 202
2461     /* Paranoid check for integer overflow */
2462    
2463     if (OFLOW_MAX - *lengthptr < code - last_code)
2464     {
2465     *errorcodeptr = ERR20;
2466     goto FAILED;
2467     }
2468    
2469 nigel 93 *lengthptr += code - last_code;
2470     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2471    
2472     /* If "previous" is set and it is not at the start of the work space, move
2473     it back to there, in order to avoid filling up the work space. Otherwise,
2474     if "previous" is NULL, reset the current code pointer to the start. */
2475    
2476     if (previous != NULL)
2477     {
2478     if (previous > orig_code)
2479     {
2480     memmove(orig_code, previous, code - previous);
2481     code -= previous - orig_code;
2482     previous = orig_code;
2483     }
2484     }
2485     else code = orig_code;
2486    
2487     /* Remember where this code item starts so we can pick up the length
2488     next time round. */
2489    
2490     last_code = code;
2491     }
2492    
2493     /* In the real compile phase, just check the workspace used by the forward
2494     reference list. */
2495    
2496     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2497     {
2498     *errorcodeptr = ERR52;
2499     goto FAILED;
2500     }
2501    
2502 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2503    
2504     if (inescq && c != 0)
2505     {
2506     if (c == '\\' && ptr[1] == 'E')
2507     {
2508     inescq = FALSE;
2509     ptr++;
2510     continue;
2511     }
2512     else
2513     {
2514     if (previous_callout != NULL)
2515     {
2516 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2517     complete_callout(previous_callout, ptr, cd);
2518 nigel 77 previous_callout = NULL;
2519     }
2520     if ((options & PCRE_AUTO_CALLOUT) != 0)
2521     {
2522     previous_callout = code;
2523     code = auto_callout(code, ptr, cd);
2524     }
2525     goto NORMAL_CHAR;
2526     }
2527     }
2528    
2529     /* Fill in length of a previous callout, except when the next thing is
2530     a quantifier. */
2531    
2532     is_quantifier = c == '*' || c == '+' || c == '?' ||
2533     (c == '{' && is_counted_repeat(ptr+1));
2534    
2535     if (!is_quantifier && previous_callout != NULL &&
2536     after_manual_callout-- <= 0)
2537     {
2538 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2539     complete_callout(previous_callout, ptr, cd);
2540 nigel 77 previous_callout = NULL;
2541     }
2542    
2543     /* In extended mode, skip white space and comments */
2544    
2545     if ((options & PCRE_EXTENDED) != 0)
2546     {
2547     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2548     if (c == '#')
2549     {
2550 nigel 93 while (*(++ptr) != 0)
2551 nigel 91 {
2552 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2553 nigel 91 }
2554 nigel 93 if (*ptr != 0) continue;
2555    
2556 nigel 91 /* Else fall through to handle end of string */
2557     c = 0;
2558 nigel 77 }
2559     }
2560    
2561     /* No auto callout for quantifiers. */
2562    
2563     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2564     {
2565     previous_callout = code;
2566     code = auto_callout(code, ptr, cd);
2567     }
2568    
2569     switch(c)
2570     {
2571 nigel 93 /* ===================================================================*/
2572     case 0: /* The branch terminates at string end */
2573     case '|': /* or | or ) */
2574 nigel 77 case ')':
2575     *firstbyteptr = firstbyte;
2576     *reqbyteptr = reqbyte;
2577     *codeptr = code;
2578     *ptrptr = ptr;
2579 nigel 93 if (lengthptr != NULL)
2580     {
2581 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2582     {
2583     *errorcodeptr = ERR20;
2584     goto FAILED;
2585     }
2586 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2587     DPRINTF((">> end branch\n"));
2588     }
2589 nigel 77 return TRUE;
2590    
2591 nigel 93
2592     /* ===================================================================*/
2593 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2594     the setting of any following char as a first character. */
2595    
2596     case '^':
2597     if ((options & PCRE_MULTILINE) != 0)
2598     {
2599     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2600     }
2601     previous = NULL;
2602     *code++ = OP_CIRC;
2603     break;
2604    
2605     case '$':
2606     previous = NULL;
2607     *code++ = OP_DOLL;
2608     break;
2609    
2610     /* There can never be a first char if '.' is first, whatever happens about
2611     repeats. The value of reqbyte doesn't change either. */
2612    
2613     case '.':
2614     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2615     zerofirstbyte = firstbyte;
2616     zeroreqbyte = reqbyte;
2617     previous = code;
2618     *code++ = OP_ANY;
2619     break;
2620    
2621 nigel 93
2622     /* ===================================================================*/
2623 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2624     32-byte bitmap of the permitted characters, except in the special case
2625     where there is only one such character. For negated classes, we build the
2626     map as usual, then invert it at the end. However, we use a different opcode
2627     so that data characters > 255 can be handled correctly.
2628 nigel 77
2629     If the class contains characters outside the 0-255 range, a different
2630     opcode is compiled. It may optionally have a bit map for characters < 256,
2631     but those above are explicitly listed afterwards. A flag byte tells
2632     whether the bitmap is present, and whether this is a negated class or not.
2633     */
2634    
2635     case '[':
2636     previous = code;
2637    
2638     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2639     they are encountered at the top level, so we'll do that too. */
2640    
2641     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2642 ph10 295 check_posix_syntax(ptr, &tempptr))
2643 nigel 77 {
2644     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2645     goto FAILED;
2646     }
2647    
2648 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2649 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2650 ph10 205 skip them too. This makes for compatibility with Perl. */
2651 ph10 208
2652 ph10 205 negate_class = FALSE;
2653     for (;;)
2654 nigel 77 {
2655     c = *(++ptr);
2656 ph10 205 if (c == '\\')
2657     {
2658 ph10 208 if (ptr[1] == 'E') ptr++;
2659 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2660 ph10 208 else break;
2661 ph10 205 }
2662     else if (!negate_class && c == '^')
2663     negate_class = TRUE;
2664     else break;
2665 ph10 208 }
2666 nigel 77
2667 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2668     negation flag at the end, so that support for characters > 255 works
2669 ph10 264 correctly (they are all included in the class). */
2670    
2671     should_flip_negation = FALSE;
2672    
2673 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2674 nigel 93 of just a single character (as long as it's < 256). However, for higher
2675     valued UTF-8 characters, we don't yet do any optimization. */
2676 nigel 77
2677     class_charcount = 0;
2678     class_lastchar = -1;
2679    
2680 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2681     temporary bit of memory, in case the class contains only 1 character (less
2682     than 256), because in that case the compiled code doesn't use the bit map.
2683     */
2684    
2685     memset(classbits, 0, 32 * sizeof(uschar));
2686    
2687 nigel 77 #ifdef SUPPORT_UTF8
2688     class_utf8 = FALSE; /* No chars >= 256 */
2689 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2690 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2691 nigel 77 #endif
2692    
2693     /* Process characters until ] is reached. By writing this as a "do" it
2694 nigel 93 means that an initial ] is taken as a data character. At the start of the
2695     loop, c contains the first byte of the character. */
2696 nigel 77
2697 nigel 93 if (c != 0) do
2698 nigel 77 {
2699 nigel 93 const uschar *oldptr;
2700    
2701 nigel 77 #ifdef SUPPORT_UTF8
2702     if (utf8 && c > 127)
2703     { /* Braces are required because the */
2704     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2705     }
2706 ph10 309
2707 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2708 ph10 309 data and reset the pointer. This is so that very large classes that
2709 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2710 ph10 309 (which is on the stack). */
2711    
2712 ph10 300 if (lengthptr != NULL)
2713     {
2714     *lengthptr += class_utf8data - class_utf8data_base;
2715 ph10 309 class_utf8data = class_utf8data_base;
2716     }
2717    
2718 nigel 77 #endif
2719    
2720     /* Inside \Q...\E everything is literal except \E */
2721    
2722     if (inescq)
2723     {
2724 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2725 nigel 77 {
2726 nigel 93 inescq = FALSE; /* Reset literal state */
2727     ptr++; /* Skip the 'E' */
2728     continue; /* Carry on with next */
2729 nigel 77 }
2730 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2731 nigel 77 }
2732    
2733     /* Handle POSIX class names. Perl allows a negation extension of the
2734     form [:^name:]. A square bracket that doesn't match the syntax is
2735     treated as a literal. We also recognize the POSIX constructions
2736     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2737     5.6 and 5.8 do. */
2738    
2739     if (c == '[' &&
2740     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2741 ph10 295 check_posix_syntax(ptr, &tempptr))
2742 nigel 77 {
2743     BOOL local_negate = FALSE;
2744 nigel 87 int posix_class, taboffset, tabopt;
2745 nigel 77 register const uschar *cbits = cd->cbits;
2746 nigel 87 uschar pbits[32];
2747 nigel 77
2748     if (ptr[1] != ':')
2749     {
2750     *errorcodeptr = ERR31;
2751     goto FAILED;
2752     }
2753    
2754     ptr += 2;
2755     if (*ptr == '^')
2756     {
2757     local_negate = TRUE;
2758 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2759 nigel 77 ptr++;
2760     }
2761    
2762     posix_class = check_posix_name(ptr, tempptr - ptr);
2763     if (posix_class < 0)
2764     {
2765     *errorcodeptr = ERR30;
2766     goto FAILED;
2767     }
2768    
2769     /* If matching is caseless, upper and lower are converted to
2770     alpha. This relies on the fact that the class table starts with
2771     alpha, lower, upper as the first 3 entries. */
2772    
2773     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2774     posix_class = 0;
2775    
2776 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2777     because we may be adding and subtracting from it, and we don't want to
2778     subtract bits that may be in the main map already. At the end we or the
2779     result into the bit map that is being built. */
2780 nigel 77
2781     posix_class *= 3;
2782 nigel 87
2783     /* Copy in the first table (always present) */
2784    
2785     memcpy(pbits, cbits + posix_class_maps[posix_class],
2786     32 * sizeof(uschar));
2787    
2788     /* If there is a second table, add or remove it as required. */
2789    
2790     taboffset = posix_class_maps[posix_class + 1];
2791     tabopt = posix_class_maps[posix_class + 2];
2792    
2793     if (taboffset >= 0)
2794 nigel 77 {
2795 nigel 87 if (tabopt >= 0)
2796     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2797 nigel 77 else
2798 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2799 nigel 77 }
2800    
2801 nigel 87 /* Now see if we need to remove any special characters. An option
2802     value of 1 removes vertical space and 2 removes underscore. */
2803    
2804     if (tabopt < 0) tabopt = -tabopt;
2805     if (tabopt == 1) pbits[1] &= ~0x3c;
2806     else if (tabopt == 2) pbits[11] &= 0x7f;
2807    
2808     /* Add the POSIX table or its complement into the main table that is
2809     being built and we are done. */
2810    
2811     if (local_negate)
2812     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2813     else
2814     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2815    
2816 nigel 77 ptr = tempptr + 1;
2817     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2818     continue; /* End of POSIX syntax handling */
2819     }
2820    
2821     /* Backslash may introduce a single character, or it may introduce one
2822 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2823     case. Inside a class (and only there) it is treated as backspace.
2824     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2825 ph10 205 to 'or' into the one we are building. We assume they have more than one
2826 nigel 77 character in them, so set class_charcount bigger than one. */
2827    
2828     if (c == '\\')
2829     {
2830 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2831     if (*errorcodeptr != 0) goto FAILED;
2832 nigel 77
2833 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2834 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2835 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2836 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2837     {
2838     if (ptr[1] == '\\' && ptr[2] == 'E')
2839     {
2840     ptr += 2; /* avoid empty string */
2841     }
2842     else inescq = TRUE;
2843     continue;
2844     }
2845 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2846 nigel 77
2847     if (c < 0)
2848     {
2849     register const uschar *cbits = cd->cbits;
2850     class_charcount += 2; /* Greater than 1 is what matters */
2851 nigel 93
2852     /* Save time by not doing this in the pre-compile phase. */
2853    
2854     if (lengthptr == NULL) switch (-c)
2855 nigel 77 {
2856     case ESC_d:
2857     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2858     continue;
2859    
2860     case ESC_D:
2861 ph10 286 should_flip_negation = TRUE;
2862 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2863     continue;
2864    
2865     case ESC_w:
2866     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2867     continue;
2868    
2869     case ESC_W:
2870 ph10 286 should_flip_negation = TRUE;
2871 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2872     continue;
2873    
2874     case ESC_s:
2875     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2876     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2877     continue;
2878    
2879     case ESC_S:
2880 ph10 286 should_flip_negation = TRUE;
2881 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2882     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2883     continue;
2884    
2885 nigel 93 default: /* Not recognized; fall through */
2886     break; /* Need "default" setting to stop compiler warning. */
2887     }
2888    
2889     /* In the pre-compile phase, just do the recognition. */
2890    
2891     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2892     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2893 ph10 180
2894 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2895     they use extra memory. */
2896 ph10 180
2897 ph10 178 if (-c == ESC_h)
2898     {
2899     SETBIT(classbits, 0x09); /* HT */
2900     SETBIT(classbits, 0x20); /* SPACE */
2901 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2902 ph10 178 #ifdef SUPPORT_UTF8
2903     if (utf8)
2904 ph10 180 {
2905 ph10 178 class_utf8 = TRUE;
2906     *class_utf8data++ = XCL_SINGLE;
2907 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2908 ph10 178 *class_utf8data++ = XCL_SINGLE;
2909 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2910     *class_utf8data++ = XCL_RANGE;
2911     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2912     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2913 ph10 178 *class_utf8data++ = XCL_SINGLE;
2914 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2915 ph10 178 *class_utf8data++ = XCL_SINGLE;
2916 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2917 ph10 178 *class_utf8data++ = XCL_SINGLE;
2918 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2919     }
2920     #endif
2921     continue;
2922     }
2923 nigel 93
2924 ph10 178 if (-c == ESC_H)
2925     {
2926     for (c = 0; c < 32; c++)
2927     {
2928     int x = 0xff;
2929     switch (c)
2930 ph10 180 {
2931 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2932     case 0x20/8: x ^= 1 << (0x20%8); break;
2933     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2934     default: break;
2935     }
2936     classbits[c] |= x;
2937 ph10 180 }
2938    
2939 ph10 178 #ifdef SUPPORT_UTF8
2940     if (utf8)
2941 ph10 180 {
2942 ph10 178 class_utf8 = TRUE;
2943 ph10 180 *class_utf8data++ = XCL_RANGE;
2944     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2945     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2946     *class_utf8data++ = XCL_RANGE;
2947     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2948     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2949     *class_utf8data++ = XCL_RANGE;
2950     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2951     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2952     *class_utf8data++ = XCL_RANGE;
2953     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2954     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2955     *class_utf8data++ = XCL_RANGE;
2956     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2957     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2958     *class_utf8data++ = XCL_RANGE;
2959     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2960     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2961     *class_utf8data++ = XCL_RANGE;
2962     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2963     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2964     }
2965     #endif
2966     continue;
2967     }
2968 ph10 178
2969     if (-c == ESC_v)
2970     {
2971     SETBIT(classbits, 0x0a); /* LF */
2972     SETBIT(classbits, 0x0b); /* VT */
2973 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2974     SETBIT(classbits, 0x0d); /* CR */
2975     SETBIT(classbits, 0x85); /* NEL */
2976 ph10 178 #ifdef SUPPORT_UTF8
2977     if (utf8)
2978 ph10 180 {
2979 ph10 178 class_utf8 = TRUE;
2980 ph10 180 *class_utf8data++ = XCL_RANGE;
2981     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2982     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2983     }
2984     #endif
2985     continue;
2986     }
2987 ph10 178
2988     if (-c == ESC_V)
2989     {
2990     for (c = 0; c < 32; c++)
2991     {
2992     int x = 0xff;
2993     switch (c)
2994 ph10 180 {
2995 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2996     x ^= 1 << (0x0b%8);
2997     x ^= 1 << (0x0c%8);
2998 ph10 180 x ^= 1 << (0x0d%8);
2999 ph10 178 break;
3000     case 0x85/8: x ^= 1 << (0x85%8); break;
3001     default: break;
3002     }
3003     classbits[c] |= x;
3004 ph10 180 }
3005    
3006 ph10 178 #ifdef SUPPORT_UTF8
3007     if (utf8)
3008 ph10 180 {
3009 ph10 178 class_utf8 = TRUE;
3010 ph10 180 *class_utf8data++ = XCL_RANGE;
3011     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3012     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3013     *class_utf8data++ = XCL_RANGE;
3014     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3015     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3016     }
3017     #endif
3018     continue;
3019     }
3020 ph10 178
3021 nigel 93 /* We need to deal with \P and \p in both phases. */
3022    
3023 nigel 77 #ifdef SUPPORT_UCP
3024 nigel 93 if (-c == ESC_p || -c == ESC_P)
3025     {
3026     BOOL negated;
3027     int pdata;
3028     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3029     if (ptype < 0) goto FAILED;
3030     class_utf8 = TRUE;
3031     *class_utf8data++ = ((-c == ESC_p) != negated)?
3032     XCL_PROP : XCL_NOTPROP;
3033     *class_utf8data++ = ptype;
3034     *class_utf8data++ = pdata;
3035     class_charcount -= 2; /* Not a < 256 character */
3036 nigel 77 continue;
3037 nigel 93 }
3038 nigel 77 #endif
3039 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3040     strict mode. By default, for compatibility with Perl, they are
3041     treated as literals. */
3042 nigel 77
3043 nigel 93 if ((options & PCRE_EXTRA) != 0)
3044     {
3045     *errorcodeptr = ERR7;
3046     goto FAILED;
3047     }
3048 nigel 77
3049 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3050     c = *ptr; /* Get the final character and fall through */
3051 nigel 77 }
3052    
3053     /* Fall through if we have a single character (c >= 0). This may be
3054 nigel 93 greater than 256 in UTF-8 mode. */
3055 nigel 77
3056     } /* End of backslash handling */
3057    
3058     /* A single character may be followed by '-' to form a range. However,
3059     Perl does not permit ']' to be the end of the range. A '-' character
3060 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3061     entirely. The code for handling \Q and \E is messy. */
3062 nigel 77
3063 nigel 93 CHECK_RANGE:
3064     while (ptr[1] == '\\' && ptr[2] == 'E')
3065 nigel 77 {
3066 nigel 93 inescq = FALSE;
3067     ptr += 2;
3068     }
3069    
3070     oldptr = ptr;
3071 ph10 231
3072 ph10 230 /* Remember \r or \n */
3073 ph10 231
3074     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3075    
3076 ph10 230 /* Check for range */
3077 nigel 93
3078     if (!inescq && ptr[1] == '-')
3079     {
3080 nigel 77 int d;
3081     ptr += 2;
3082 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3083 nigel 77
3084 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3085     mode. */
3086    
3087     while (*ptr == '\\' && ptr[1] == 'Q')
3088     {
3089     ptr += 2;
3090     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3091     inescq = TRUE;
3092     break;
3093     }
3094    
3095     if (*ptr == 0 || (!inescq && *ptr == ']'))
3096     {
3097     ptr = oldptr;
3098     goto LONE_SINGLE_CHARACTER;
3099     }
3100    
3101 nigel 77 #ifdef SUPPORT_UTF8
3102     if (utf8)
3103     { /* Braces are required because the */
3104     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3105     }
3106     else
3107     #endif
3108     d = *ptr; /* Not UTF-8 mode */
3109    
3110     /* The second part of a range can be a single-character escape, but
3111     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3112     in such circumstances. */
3113    
3114 nigel 93 if (!inescq && d == '\\')
3115 nigel 77 {
3116 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3117     if (*errorcodeptr != 0) goto FAILED;
3118 nigel 77
3119 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3120 nigel 93 special means the '-' was literal */
3121 nigel 77
3122     if (d < 0)
3123     {
3124     if (d == -ESC_b) d = '\b';
3125 nigel 93 else if (d == -ESC_X) d = 'X';
3126     else if (d == -ESC_R) d = 'R'; else
3127 nigel 77 {
3128 nigel 93 ptr = oldptr;
3129 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3130     }
3131     }
3132     }
3133    
3134 nigel 93 /* Check that the two values are in the correct order. Optimize
3135     one-character ranges */
3136 nigel 77
3137 nigel 93 if (d < c)
3138     {
3139     *errorcodeptr = ERR8;
3140     goto FAILED;
3141     }
3142    
3143 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3144    
3145 ph10 230 /* Remember \r or \n */
3146 ph10 231
3147     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3148    
3149 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3150     matching, we have to use an XCLASS with extra data items. Caseless
3151     matching for characters > 127 is available only if UCP support is
3152     available. */
3153    
3154     #ifdef SUPPORT_UTF8
3155     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3156     {
3157     class_utf8 = TRUE;
3158    
3159     /* With UCP support, we can find the other case equivalents of
3160     the relevant characters. There may be several ranges. Optimize how
3161     they fit with the basic range. */
3162    
3163     #ifdef SUPPORT_UCP
3164     if ((options & PCRE_CASELESS) != 0)
3165     {
3166 nigel 93 unsigned int occ, ocd;
3167     unsigned int cc = c;
3168     unsigned int origd = d;
3169 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3170     {
3171 ph10 180 if (occ >= (unsigned int)c &&
3172     ocd <= (unsigned int)d)
3173 ph10 176 continue; /* Skip embedded ranges */
3174 nigel 77
3175 ph10 180 if (occ < (unsigned int)c &&
3176 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3177 nigel 77 { /* if there is overlap, */
3178     c = occ; /* noting that if occ < c */
3179     continue; /* we can't have ocd > d */
3180     } /* because a subrange is */
3181 ph10 180 if (ocd > (unsigned int)d &&
3182 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3183 nigel 77 { /* the basic range. */
3184     d = ocd;
3185     continue;
3186     }
3187    
3188     if (occ == ocd)
3189     {
3190     *class_utf8data++ = XCL_SINGLE;
3191     }
3192     else
3193     {
3194     *class_utf8data++ = XCL_RANGE;
3195     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3196     }
3197     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3198     }
3199     }
3200     #endif /* SUPPORT_UCP */
3201    
3202     /* Now record the original range, possibly modified for UCP caseless
3203     overlapping ranges. */
3204    
3205     *class_utf8data++ = XCL_RANGE;
3206     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3207     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3208    
3209     /* With UCP support, we are done. Without UCP support, there is no
3210     caseless matching for UTF-8 characters > 127; we can use the bit map
3211     for the smaller ones. */
3212    
3213     #ifdef SUPPORT_UCP
3214     continue; /* With next character in the class */
3215     #else
3216     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3217    
3218     /* Adjust upper limit and fall through to set up the map */
3219    
3220     d = 127;
3221    
3222     #endif /* SUPPORT_UCP */
3223     }
3224     #endif /* SUPPORT_UTF8 */
3225    
3226     /* We use the bit map for all cases when not in UTF-8 mode; else
3227     ranges that lie entirely within 0-127 when there is UCP support; else
3228     for partial ranges without UCP support. */
3229    
3230 nigel 93 class_charcount += d - c + 1;
3231     class_lastchar = d;
3232    
3233     /* We can save a bit of time by skipping this in the pre-compile. */
3234    
3235     if (lengthptr == NULL) for (; c <= d; c++)
3236 nigel 77 {
3237     classbits[c/8] |= (1 << (c&7));
3238     if ((options & PCRE_CASELESS) != 0)
3239     {
3240     int uc = cd->fcc[c]; /* flip case */
3241     classbits[uc/8] |= (1 << (uc&7));
3242     }
3243     }
3244    
3245     continue; /* Go get the next char in the class */
3246     }
3247    
3248     /* Handle a lone single character - we can get here for a normal
3249     non-escape char, or after \ that introduces a single character or for an
3250     apparent range that isn't. */
3251    
3252     LONE_SINGLE_CHARACTER:
3253 ph10 231
3254 nigel 77 /* Handle a character that cannot go in the bit map */
3255    
3256     #ifdef SUPPORT_UTF8
3257     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3258     {
3259     class_utf8 = TRUE;
3260     *class_utf8data++ = XCL_SINGLE;
3261     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3262    
3263     #ifdef SUPPORT_UCP
3264     if ((options & PCRE_CASELESS) != 0)
3265     {
3266 nigel 93 unsigned int othercase;
3267     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3268 nigel 77 {
3269     *class_utf8data++ = XCL_SINGLE;
3270     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3271     }
3272     }
3273     #endif /* SUPPORT_UCP */
3274    
3275     }
3276     else
3277     #endif /* SUPPORT_UTF8 */
3278    
3279     /* Handle a single-byte character */
3280     {
3281     classbits[c/8] |= (1 << (c&7));
3282     if ((options & PCRE_CASELESS) != 0)
3283     {
3284     c = cd->fcc[c]; /* flip case */
3285     classbits[c/8] |= (1 << (c&7));
3286     }
3287     class_charcount++;
3288     class_lastchar = c;
3289     }
3290     }
3291    
3292 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3293 nigel 77
3294 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3295 nigel 77
3296 nigel 93 if (c == 0) /* Missing terminating ']' */
3297     {
3298     *errorcodeptr = ERR6;
3299     goto FAILED;
3300     }
3301 ph10 231
3302    
3303 ph10 230 /* This code has been disabled because it would mean that \s counts as
3304     an explicit \r or \n reference, and that's not really what is wanted. Now
3305     we set the flag only if there is a literal "\r" or "\n" in the class. */
3306 ph10 227
3307 ph10 230 #if 0
3308 ph10 226 /* Remember whether \r or \n are in this class */
3309 ph10 227
3310 ph10 226 if (negate_class)
3311     {
3312 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3313 ph10 226 }
3314     else
3315     {
3316 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3317 ph10 227 }
3318 ph10 230 #endif
3319 ph10 227
3320 ph10 231
3321 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3322 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3323     use of \p or \P, in other words, no use of any XCLASS features, we can
3324     optimize.
3325    
3326 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3327     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3328     operate on single-bytes only. This is an historical hangover. Maybe one day
3329     we can tidy these opcodes to handle multi-byte characters.
3330 nigel 77
3331     The optimization throws away the bit map. We turn the item into a
3332     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3333     that OP_NOT does not support multibyte characters. In the positive case, it
3334     can cause firstbyte to be set. Otherwise, there can be no first char if
3335     this item is first, whatever repeat count may follow. In the case of
3336     reqbyte, save the previous value for reinstating. */
3337    
3338     #ifdef SUPPORT_UTF8
3339 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3340 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3341 nigel 77 #else
3342     if (class_charcount == 1)
3343     #endif
3344     {
3345     zeroreqbyte = reqbyte;
3346    
3347     /* The OP_NOT opcode works on one-byte characters only. */
3348    
3349     if (negate_class)
3350     {
3351     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3352     zerofirstbyte = firstbyte;
3353     *code++ = OP_NOT;
3354     *code++ = class_lastchar;
3355     break;
3356     }
3357    
3358     /* For a single, positive character, get the value into mcbuffer, and
3359     then we can handle this with the normal one-character code. */
3360    
3361     #ifdef SUPPORT_UTF8
3362     if (utf8 && class_lastchar > 127)
3363     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3364     else
3365     #endif
3366     {
3367     mcbuffer[0] = class_lastchar;
3368     mclength = 1;
3369     }
3370     goto ONE_CHAR;
3371     } /* End of 1-char optimization */
3372    
3373     /* The general case - not the one-char optimization. If this is the first
3374     thing in the branch, there can be no first char setting, whatever the
3375     repeat count. Any reqbyte setting must remain unchanged after any kind of
3376     repeat. */
3377    
3378     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3379     zerofirstbyte = firstbyte;
3380     zeroreqbyte = reqbyte;
3381    
3382     /* If there are characters with values > 255, we have to compile an
3383 ph10 286 extended class, with its own opcode, unless there was a negated special
3384     such as \S in the class, because in that case all characters > 255 are in
3385     the class, so any that were explicitly given as well can be ignored. If
3386 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3387     characters < 256, we can omit the bitmap in the actual compiled code. */
3388 nigel 77
3389     #ifdef SUPPORT_UTF8
3390 ph10 264 if (class_utf8 && !should_flip_negation)
3391 nigel 77 {
3392     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3393     *code++ = OP_XCLASS;
3394     code += LINK_SIZE;
3395     *code = negate_class? XCL_NOT : 0;
3396    
3397 nigel 93 /* If the map is required, move up the extra data to make room for it;
3398     otherwise just move the code pointer to the end of the extra data. */
3399 nigel 77
3400     if (class_charcount > 0)
3401     {
3402     *code++ |= XCL_MAP;
3403 nigel 93 memmove(code + 32, code, class_utf8data - code);
3404 nigel 77 memcpy(code, classbits, 32);
3405 nigel 93 code = class_utf8data + 32;
3406 nigel 77 }
3407 nigel 93 else code = class_utf8data;
3408 nigel 77
3409     /* Now fill in the complete length of the item */
3410    
3411     PUT(previous, 1, code - previous);
3412     break; /* End of class handling */
3413     }
3414     #endif
3415    
3416 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3417     OP_NCLASS, depending on whether the whole class was negated and whether
3418     there were negative specials such as \S in the class. Then copy the 32-byte
3419 ph10 264 map into the code vector, negating it if necessary. */
3420 ph10 286
3421 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3422 nigel 77 if (negate_class)
3423     {
3424 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3425     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3426 nigel 77 }
3427     else
3428     {
3429     memcpy(code, classbits, 32);
3430     }
3431     code += 32;
3432     break;
3433    
3434 nigel 93
3435     /* ===================================================================*/
3436 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3437     has been tested above. */
3438    
3439     case '{':
3440     if (!is_quantifier) goto NORMAL_CHAR;
3441     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3442     if (*errorcodeptr != 0) goto FAILED;
3443     goto REPEAT;
3444    
3445     case '*':
3446     repeat_min = 0;
3447     repeat_max = -1;
3448     goto REPEAT;
3449    
3450     case '+':
3451     repeat_min = 1;
3452     repeat_max = -1;
3453     goto REPEAT;
3454    
3455     case '?':
3456     repeat_min = 0;
3457     repeat_max = 1;
3458    
3459     REPEAT:
3460     if (previous == NULL)
3461     {
3462     *errorcodeptr = ERR9;
3463     goto FAILED;
3464     }
3465    
3466     if (repeat_min == 0)
3467     {
3468     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3469     reqbyte = zeroreqbyte; /* Ditto */
3470     }
3471    
3472     /* Remember whether this is a variable length repeat */
3473    
3474     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3475    
3476     op_type = 0; /* Default single-char op codes */
3477     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3478    
3479     /* Save start of previous item, in case we have to move it up to make space
3480     for an inserted OP_ONCE for the additional '+' extension. */
3481    
3482     tempcode = previous;
3483    
3484     /* If the next character is '+', we have a possessive quantifier. This
3485     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3486     If the next character is '?' this is a minimizing repeat, by default,
3487     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3488     repeat type to the non-default. */
3489    
3490     if (ptr[1] == '+')
3491     {
3492     repeat_type = 0; /* Force greedy */
3493     possessive_quantifier = TRUE;
3494     ptr++;
3495     }
3496     else if (ptr[1] == '?')
3497     {
3498     repeat_type = greedy_non_default;
3499     ptr++;
3500     }
3501     else repeat_type = greedy_default;
3502    
3503     /* If previous was a character match, abolish the item and generate a
3504     repeat item instead. If a char item has a minimum of more than one, ensure
3505     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3506     the first thing in a branch because the x will have gone into firstbyte
3507     instead. */
3508    
3509     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3510     {
3511     /* Deal with UTF-8 characters that take up more than one byte. It's
3512     easier to write this out separately than try to macrify it. Use c to
3513     hold the length of the character in bytes, plus 0x80 to flag that it's a
3514     length rather than a small character. */
3515    
3516     #ifdef SUPPORT_UTF8
3517     if (utf8 && (code[-1] & 0x80) != 0)
3518     {
3519     uschar *lastchar = code - 1;
3520     while((*lastchar & 0xc0) == 0x80) lastchar--;
3521     c = code - lastchar; /* Length of UTF-8 character */
3522     memcpy(utf8_char, lastchar, c); /* Save the char */
3523     c |= 0x80; /* Flag c as a length */
3524     }
3525     else
3526     #endif
3527    
3528     /* Handle the case of a single byte - either with no UTF8 support, or
3529     with UTF-8 disabled, or for a UTF-8 character < 128. */
3530    
3531     {
3532     c = code[-1];
3533     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3534     }
3535    
3536 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3537     the line is something that cannot possibly match this character. If so,
3538     automatically possessifying this item gains some performance in the case
3539     where the match fails. */
3540    
3541     if (!possessive_quantifier &&
3542     repeat_max < 0 &&
3543     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3544     options, cd))
3545     {
3546     repeat_type = 0; /* Force greedy */
3547     possessive_quantifier = TRUE;
3548     }
3549    
3550 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3551     }
3552    
3553     /* If previous was a single negated character ([^a] or similar), we use
3554     one of the special opcodes, replacing it. The code is shared with single-
3555     character repeats by setting opt_type to add a suitable offset into
3556 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3557     currently used only for single-byte chars. */
3558 nigel 77
3559     else if (*previous == OP_NOT)
3560     {
3561     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3562     c = previous[1];
3563 nigel 93 if (!possessive_quantifier &&
3564     repeat_max < 0 &&
3565     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3566     {
3567     repeat_type = 0; /* Force greedy */
3568     possessive_quantifier = TRUE;
3569     }
3570 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3571     }
3572    
3573     /* If previous was a character type match (\d or similar), abolish it and
3574     create a suitable repeat item. The code is shared with single-character
3575     repeats by setting op_type to add a suitable offset into repeat_type. Note
3576     that the Unicode property types will be present only when SUPPORT_UCP is
3577     defined, but we don't wrap the little bits of code here because it just
3578     makes it horribly messy. */
3579    
3580     else if (*previous < OP_EODN)
3581     {
3582     uschar *oldcode;
3583 nigel 87 int prop_type, prop_value;
3584 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3585     c = *previous;
3586    
3587 nigel 93 if (!possessive_quantifier &&
3588     repeat_max < 0 &&
3589     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3590     {
3591     repeat_type = 0; /* Force greedy */
3592     possessive_quantifier = TRUE;
3593     }
3594    
3595 nigel 77 OUTPUT_SINGLE_REPEAT:
3596 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3597     {
3598     prop_type = previous[1];
3599     prop_value = previous[2];
3600     }
3601     else prop_type = prop_value = -1;
3602 nigel 77
3603     oldcode = code;
3604     code = previous; /* Usually overwrite previous item */
3605    
3606     /* If the maximum is zero then the minimum must also be zero; Perl allows
3607     this case, so we do too - by simply omitting the item altogether. */
3608    
3609     if (repeat_max == 0) goto END_REPEAT;
3610    
3611     /* All real repeats make it impossible to handle partial matching (maybe
3612     one day we will be able to remove this restriction). */
3613    
3614 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3615 nigel 77
3616     /* Combine the op_type with the repeat_type */
3617    
3618     repeat_type += op_type;
3619    
3620     /* A minimum of zero is handled either as the special case * or ?, or as
3621     an UPTO, with the maximum given. */
3622    
3623     if (repeat_min == 0)
3624     {
3625     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3626     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3627     else
3628     {
3629     *code++ = OP_UPTO + repeat_type;
3630     PUT2INC(code, 0, repeat_max);
3631     }
3632     }
3633    
3634     /* A repeat minimum of 1 is optimized into some special cases. If the
3635 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3636 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3637     one less than the maximum. */
3638    
3639     else if (repeat_min == 1)
3640     {
3641     if (repeat_max == -1)
3642     *code++ = OP_PLUS + repeat_type;
3643     else
3644     {
3645     code = oldcode; /* leave previous item in place */
3646     if (repeat_max == 1) goto END_REPEAT;
3647     *code++ = OP_UPTO + repeat_type;
3648     PUT2INC(code, 0, repeat_max - 1);
3649     }
3650     }
3651    
3652     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3653     handled as an EXACT followed by an UPTO. */
3654    
3655     else
3656     {
3657     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3658     PUT2INC(code, 0, repeat_min);
3659    
3660     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3661     we have to insert the character for the previous code. For a repeated
3662 nigel 87 Unicode property match, there are two extra bytes that define the
3663 nigel 77 required property. In UTF-8 mode, long characters have their length in
3664     c, with the 0x80 bit as a flag. */
3665    
3666     if (repeat_max < 0)
3667     {
3668     #ifdef SUPPORT_UTF8
3669     if (utf8 && c >= 128)
3670     {
3671     memcpy(code, utf8_char, c & 7);
3672     code += c & 7;
3673     }
3674     else
3675     #endif
3676     {
3677     *code++ = c;
3678 nigel 87 if (prop_type >= 0)
3679     {
3680     *code++ = prop_type;
3681     *code++ = prop_value;
3682     }
3683 nigel 77 }
3684     *code++ = OP_STAR + repeat_type;
3685     }
3686    
3687     /* Else insert an UPTO if the max is greater than the min, again
3688 nigel 93 preceded by the character, for the previously inserted code. If the
3689     UPTO is just for 1 instance, we can use QUERY instead. */
3690 nigel 77
3691     else if (repeat_max != repeat_min)
3692     {
3693     #ifdef SUPPORT_UTF8
3694     if (utf8 && c >= 128)
3695     {
3696     memcpy(code, utf8_char, c & 7);
3697     code += c & 7;
3698     }
3699     else
3700     #endif
3701     *code++ = c;
3702 nigel 87 if (prop_type >= 0)
3703     {
3704     *code++ = prop_type;
3705     *code++ = prop_value;
3706     }
3707 nigel 77 repeat_max -= repeat_min;
3708 nigel 93
3709     if (repeat_max == 1)
3710     {
3711     *code++ = OP_QUERY + repeat_type;
3712     }
3713     else
3714     {
3715     *code++ = OP_UPTO + repeat_type;
3716     PUT2INC(code, 0, repeat_max);
3717     }
3718 nigel 77 }
3719     }
3720    
3721     /* The character or character type itself comes last in all cases. */
3722    
3723     #ifdef SUPPORT_UTF8
3724     if (utf8 && c >= 128)
3725     {
3726     memcpy(code, utf8_char, c & 7);
3727     code += c & 7;
3728     }
3729     else
3730     #endif
3731     *code++ = c;
3732    
3733 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3734     define the required property. */
3735 nigel 77
3736     #ifdef SUPPORT_UCP
3737 nigel 87 if (prop_type >= 0)
3738     {
3739     *code++ = prop_type;
3740     *code++ = prop_value;
3741     }
3742 nigel 77 #endif
3743     }
3744    
3745     /* If previous was a character class or a back reference, we put the repeat
3746     stuff after it, but just skip the item if the repeat was {0,0}. */
3747    
3748     else if (*previous == OP_CLASS ||
3749     *previous == OP_NCLASS ||
3750     #ifdef SUPPORT_UTF8
3751     *previous == OP_XCLASS ||
3752     #endif
3753     *previous == OP_REF)
3754     {
3755     if (repeat_max == 0)
3756     {
3757     code = previous;
3758     goto END_REPEAT;
3759     }
3760    
3761     /* All real repeats make it impossible to handle partial matching (maybe
3762     one day we will be able to remove this restriction). */
3763    
3764 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3765 nigel 77
3766     if (repeat_min == 0 && repeat_max == -1)
3767     *code++ = OP_CRSTAR + repeat_type;
3768     else if (repeat_min == 1 && repeat_max == -1)
3769     *code++ = OP_CRPLUS + repeat_type;
3770     else if (repeat_min == 0 && repeat_max == 1)
3771     *code++ = OP_CRQUERY + repeat_type;
3772     else
3773     {
3774     *code++ = OP_CRRANGE + repeat_type;
3775     PUT2INC(code, 0, repeat_min);
3776     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3777     PUT2INC(code, 0, repeat_max);
3778     }
3779     }
3780    
3781     /* If previous was a bracket group, we may have to replicate it in certain
3782     cases. */
3783    
3784 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3785     *previous == OP_ONCE || *previous == OP_COND)
3786 nigel 77 {
3787     register int i;
3788     int ketoffset = 0;
3789     int len = code - previous;
3790     uschar *bralink = NULL;
3791    
3792 nigel 93 /* Repeating a DEFINE group is pointless */
3793    
3794     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3795     {
3796     *errorcodeptr = ERR55;
3797     goto FAILED;
3798     }
3799    
3800 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3801     by scanning through from the start, and compute the offset back to it
3802     from the current code pointer. There may be an OP_OPT setting following
3803     the final KET, so we can't find the end just by going back from the code
3804     pointer. */
3805    
3806     if (repeat_max == -1)
3807     {
3808     register uschar *ket = previous;
3809     do ket += GET(ket, 1); while (*ket != OP_KET);
3810     ketoffset = code - ket;
3811     }
3812    
3813     /* The case of a zero minimum is special because of the need to stick
3814     OP_BRAZERO in front of it, and because the group appears once in the
3815     data, whereas in other cases it appears the minimum number of times. For
3816     this reason, it is simplest to treat this case separately, as otherwise
3817     the code gets far too messy. There are several special subcases when the
3818     minimum is zero. */
3819    
3820     if (repeat_min == 0)
3821     {
3822     /* If the maximum is also zero, we just omit the group from the output
3823     altogether. */
3824    
3825     if (repeat_max == 0)
3826     {
3827     code = previous;
3828     goto END_REPEAT;
3829     }
3830    
3831     /* If the maximum is 1 or unlimited, we just have to stick in the
3832     BRAZERO and do no more at this point. However, we do need to adjust
3833     any OP_RECURSE calls inside the group that refer to the group itself or
3834 nigel 93 any internal or forward referenced group, because the offset is from
3835     the start of the whole regex. Temporarily terminate the pattern while
3836     doing this. */
3837 nigel 77
3838     if (repeat_max <= 1)
3839     {
3840     *code = OP_END;
3841 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3842 nigel 77 memmove(previous+1, previous, len);
3843     code++;
3844     *previous++ = OP_BRAZERO + repeat_type;
3845     }
3846    
3847     /* If the maximum is greater than 1 and limited, we have to replicate
3848     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3849     The first one has to be handled carefully because it's the original
3850     copy, which has to be moved up. The remainder can be handled by code
3851     that is common with the non-zero minimum case below. We have to
3852     adjust the value of repeat_max, since one less copy is required. Once
3853     again, we may have to adjust any OP_RECURSE calls inside the group. */
3854    
3855     else
3856     {
3857     int offset;
3858     *code = OP_END;
3859 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3860 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3861     code += 2 + LINK_SIZE;
3862     *previous++ = OP_BRAZERO + repeat_type;
3863     *previous++ = OP_BRA;
3864    
3865     /* We chain together the bracket offset fields that have to be
3866     filled in later when the ends of the brackets are reached. */
3867    
3868     offset = (bralink == NULL)? 0 : previous - bralink;
3869     bralink = previous;
3870     PUTINC(previous, 0, offset);
3871     }
3872    
3873     repeat_max--;
3874     }
3875    
3876     /* If the minimum is greater than zero, replicate the group as many
3877     times as necessary, and adjust the maximum to the number of subsequent
3878     copies that we need. If we set a first char from the group, and didn't
3879 nigel 93 set a required char, copy the latter from the former. If there are any
3880     forward reference subroutine calls in the group, there will be entries on
3881     the workspace list; replicate these with an appropriate increment. */
3882 nigel 77
3883     else
3884     {
3885     if (repeat_min > 1)
3886     {
3887 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3888 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3889     potential integer overflow. */
3890 nigel 93
3891     if (lengthptr != NULL)
3892 ph10 202 {
3893     int delta = (repeat_min - 1)*length_prevgroup;
3894     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3895     (double)INT_MAX ||
3896     OFLOW_MAX - *lengthptr < delta)
3897     {
3898     *errorcodeptr = ERR20;
3899     goto FAILED;
3900     }
3901     *lengthptr += delta;
3902     }
3903 nigel 93
3904     /* This is compiling for real */
3905    
3906     else
3907 nigel 77 {
3908 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3909     for (i = 1; i < repeat_min; i++)
3910     {
3911     uschar *hc;
3912     uschar *this_hwm = cd->hwm;
3913     memcpy(code, previous, len);
3914     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3915     {
3916     PUT(cd->hwm, 0, GET(hc, 0) + len);
3917     cd->hwm += LINK_SIZE;
3918     }
3919     save_hwm = this_hwm;
3920     code += len;
3921     }
3922 nigel 77 }
3923     }
3924 nigel 93
3925 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3926     }
3927    
3928     /* This code is common to both the zero and non-zero minimum cases. If
3929     the maximum is limited, it replicates the group in a nested fashion,
3930     remembering the bracket starts on a stack. In the case of a zero minimum,
3931     the first one was set up above. In all cases the repeat_max now specifies
3932 nigel 93 the number of additional copies needed. Again, we must remember to
3933     replicate entries on the forward reference list. */
3934 nigel 77
3935     if (repeat_max >= 0)
3936     {
3937 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3938     just adjust the length as if we had. For each repetition we must add 1
3939     to the length for BRAZERO and for all but the last repetition we must
3940 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3941     paranoid checks to avoid integer overflow. */
3942 nigel 93
3943     if (lengthptr != NULL && repeat_max > 0)
3944 ph10 202 {
3945     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3946     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3947     if ((double)repeat_max *
3948     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3949     > (double)INT_MAX ||
3950     OFLOW_MAX - *lengthptr < delta)
3951     {
3952     *errorcodeptr = ERR20;
3953     goto FAILED;
3954     }
3955     *lengthptr += delta;
3956     }
3957 nigel 93
3958     /* This is compiling for real */
3959    
3960     else for (i = repeat_max - 1; i >= 0; i--)
3961 nigel 77 {
3962 nigel 93 uschar *hc;
3963     uschar *this_hwm = cd->hwm;
3964    
3965 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3966    
3967     /* All but the final copy start a new nesting, maintaining the
3968     chain of brackets outstanding. */
3969    
3970     if (i != 0)
3971     {
3972     int offset;
3973     *code++ = OP_BRA;
3974     offset = (bralink == NULL)? 0 : code - bralink;
3975     bralink = code;
3976     PUTINC(code, 0, offset);
3977     }
3978    
3979     memcpy(code, previous, len);
3980 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3981     {
3982     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3983     cd->hwm += LINK_SIZE;
3984     }
3985     save_hwm = this_hwm;
3986 nigel 77 code += len;
3987     }
3988    
3989     /* Now chain through the pending brackets, and fill in their length
3990     fields (which are holding the chain links pro tem). */
3991    
3992     while (bralink != NULL)
3993     {
3994     int oldlinkoffset;
3995     int offset = code - bralink + 1;
3996     uschar *bra = code - offset;
3997     oldlinkoffset = GET(bra, 1);
3998     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3999     *code++ = OP_KET;
4000     PUTINC(code, 0, offset);
4001     PUT(bra, 1, offset);
4002     }
4003     }
4004    
4005     /* If the maximum is unlimited, set a repeater in the final copy. We
4006     can't just offset backwards from the current code point, because we
4007     don't know if there's been an options resetting after the ket. The
4008 nigel 93 correct offset was computed above.
4009 nigel 77
4010 nigel 93 Then, when we are doing the actual compile phase, check to see whether
4011     this group is a non-atomic one that could match an empty string. If so,
4012     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4013     that runtime checking can be done. [This check is also applied to
4014     atomic groups at runtime, but in a different way.] */
4015    
4016     else
4017     {
4018     uschar *ketcode = code - ketoffset;
4019     uschar *bracode = ketcode - GET(ketcode, 1);
4020     *ketcode = OP_KETRMAX + repeat_type;
4021     if (lengthptr == NULL && *bracode != OP_ONCE)
4022     {
4023     uschar *scode = bracode;
4024     do
4025     {
4026     if (could_be_empty_branch(scode, ketcode, utf8))
4027     {
4028     *bracode += OP_SBRA - OP_BRA;
4029     break;
4030     }
4031     scode += GET(scode, 1);
4032     }
4033     while (*scode == OP_ALT);
4034     }
4035     }
4036 nigel 77 }
4037    
4038     /* Else there's some kind of shambles */
4039    
4040     else
4041     {
4042     *errorcodeptr = ERR11;
4043     goto FAILED;
4044     }
4045    
4046 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4047     tests above succeeded, possessive_quantifier is TRUE. For some of the
4048     simpler opcodes, there is a special alternative opcode for this. For
4049     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4050     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4051     but the special opcodes can optimize it a bit. The repeated item starts at
4052     tempcode, not at previous, which might be the first part of a string whose
4053     (former) last char we repeated.
4054 nigel 77
4055 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4056     an 'upto' may follow. We skip over an 'exact' item, and then test the
4057     length of what remains before proceeding. */
4058    
4059 nigel 77 if (possessive_quantifier)
4060     {
4061 nigel 93 int len;
4062     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4063     *tempcode == OP_NOTEXACT)
4064 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4065 ph10 286 ((*tempcode == OP_TYPEEXACT &&
4066     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4067 nigel 93 len = code - tempcode;
4068     if (len > 0) switch (*tempcode)
4069     {
4070     case OP_STAR: *tempcode = OP_POSSTAR; break;
4071     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4072     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4073     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4074    
4075     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4076     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4077     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4078     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4079    
4080     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4081     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4082     case OP_NOTQUERY: *tempcode =