/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 285 - (hide annotations) (download)
Wed Dec 12 17:03:50 2007 UTC (6 years, 4 months ago) by ph10
Original Path: code/trunk/pcre_compile.c
File MIME type: text/plain
File size: 195678 byte(s)
Fix bad compiled code for things like /\pL{2}+/ in which a possessive 
quantifier with a fixed limit was applied to a character property.

</
1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a (eight bits per byte, least significant bit first). Both arguments
and the whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) address the intended bit. Note that b is evaluated twice, so
it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
161 ph10 210 static verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
171     static int verbcount = sizeof(verbs)/sizeof(verbitem);
172    
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
/* POSIX class names, NUL-separated; the final name is terminated by the
string literal's own NUL. */

static const char posix_names[] =
  "alpha\0"
  "lower\0"
  "upper\0"
  "alnum\0"
  "ascii\0"
  "blank\0"
  "cntrl\0"
  "digit\0"
  "graph\0"
  "print\0"
  "punct\0"
  "space\0"
  "word\0"
  "xdigit";
184 nigel 77
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification: XSTRING(x) macro-expands x first and then turns
the result into a string literal; STRING(x) stringifies x exactly as written. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298     "\\g is not followed by a braced name or an optionally braced non-zero number\0"
299     "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
300     "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 269 "digit expected after (?+";
306 nigel 77
307    
308     /* Table to identify digits and hex digits. This is used when compiling
309     patterns. Note that the tables in chartables are dependent on the locale, and
310     may mark arbitrary characters as digits - but the PCRE compiling code expects
311     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312     a private table here. It costs 256 bytes, but it is a lot faster than doing
313     character value tests (at least in some simple cases I timed), and in some
314     applications one wants PCRE to compile efficiently as well as match
315     efficiently.
316    
317     For convenience, we use the same bit definitions as in chartables:
318    
319     0x04 decimal digit
320     0x08 hexadecimal digit
321    
322     Then we can use ctype_digit and ctype_xdigit in the code. */
323    
/* The tables below are laid out sixteen entries per row; the label on each row
is the hexadecimal code of its first entry. In digitab, 0x0c marks a decimal
digit (which is also a hex digit) and 0x08 marks a hex-only digit. */

#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */

static const unsigned char digitab[] = {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,  /* 0 - ? */
  /* 40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* @ - O */
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* ` - o */
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};

#else           /* This is the "abnormal" case, for EBCDIC systems */

static const unsigned char digitab[] = {
  /* 00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* a - f */
  /* 90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  /* A - F */
  /* D0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00   /* 0 - 9 */
};

static const unsigned char ebcdic_chartab[] = {  /* chartable partial dup */
  /* 00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* A0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* B0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* C0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* D0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* E0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* F0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
};

#endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502     in a table. A non-zero result is something that can be returned immediately.
503 nigel 77 Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 nigel 93 /* \g must be followed by a number, either plain or braced. If positive, it
535     is an absolute backreference. If negative, it is a relative backreference.
536 ph10 172 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537     reference to a named group. This is part of Perl's movement towards a
538     unified syntax for back references. As this is synonymous with \k{name}, we
539 ph10 171 fudge it up by pretending it really was \k. */
540 nigel 93
541     case 'g':
542     if (ptr[1] == '{')
543     {
544 ph10 171 const uschar *p;
545     for (p = ptr+2; *p != 0 && *p != '}'; p++)
546     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 ph10 172 if (*p != 0 && *p != '}')
548 ph10 171 {
549     c = -ESC_k;
550     break;
551 ph10 172 }
552 nigel 93 braced = TRUE;
553     ptr++;
554     }
555     else braced = FALSE;
556    
557     if (ptr[1] == '-')
558     {
559     negated = TRUE;
560     ptr++;
561     }
562     else negated = FALSE;
563    
564     c = 0;
565     while ((digitab[ptr[1]] & ctype_digit) != 0)
566     c = c * 10 + *(++ptr) - '0';
567 ph10 220
568 ph10 213 if (c < 0)
569     {
570     *errorcodeptr = ERR61;
571     break;
572 ph10 220 }
573 nigel 93
574     if (c == 0 || (braced && *(++ptr) != '}'))
575     {
576     *errorcodeptr = ERR57;
577 ph10 213 break;
578 nigel 93 }
579    
580     if (negated)
581     {
582     if (c > bracount)
583     {
584     *errorcodeptr = ERR15;
585 ph10 213 break;
586 nigel 93 }
587     c = bracount - (c - 1);
588     }
589    
590     c = -(ESC_REF + c);
591     break;
592    
593 nigel 77 /* The handling of escape sequences consisting of a string of digits
594     starting with one that is not zero is not straightforward. By experiment,
595     the way Perl works seems to be as follows:
596    
597     Outside a character class, the digits are read as a decimal number. If the
598     number is less than 10, or if there are that many previous extracting
599     left brackets, then it is a back reference. Otherwise, up to three octal
600     digits are read to form an escaped byte. Thus \123 is likely to be octal
601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602     value is greater than 377, the least significant 8 bits are taken. Inside a
603     character class, \ followed by a digit is always an octal number. */
604    
605     case '1': case '2': case '3': case '4': case '5':
606     case '6': case '7': case '8': case '9':
607    
608     if (!isclass)
609     {
610     oldptr = ptr;
611     c -= '0';
612     while ((digitab[ptr[1]] & ctype_digit) != 0)
613     c = c * 10 + *(++ptr) - '0';
614 ph10 213 if (c < 0)
615     {
616     *errorcodeptr = ERR61;
617 ph10 220 break;
618     }
619 nigel 77 if (c < 10 || c <= bracount)
620     {
621     c = -(ESC_REF + c);
622     break;
623     }
624     ptr = oldptr; /* Put the pointer back and fall through */
625     }
626    
627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628     generates a binary zero byte and treats the digit as a following literal.
629     Thus we have to pull back the pointer by one. */
630    
631     if ((c = *ptr) >= '8')
632     {
633     ptr--;
634     c = 0;
635     break;
636     }
637    
638     /* \0 always starts an octal number, but we may drop through to here with a
639 nigel 91 larger first octal digit. The original code used just to take the least
640     significant 8 bits of octal numbers (I think this is what early Perls used
641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642     than 3 octal digits. */
643 nigel 77
644     case '0':
645     c -= '0';
646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647     c = c * 8 + *(++ptr) - '0';
648 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
649 nigel 77 break;
650    
651 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653     treated as a data character. */
654 nigel 77
655     case 'x':
656 nigel 87 if (ptr[1] == '{')
657 nigel 77 {
658     const uschar *pt = ptr + 2;
659 nigel 87 int count = 0;
660    
661 nigel 77 c = 0;
662     while ((digitab[*pt] & ctype_xdigit) != 0)
663     {
664 nigel 87 register int cc = *pt++;
665     if (c == 0 && cc == '0') continue; /* Leading zeroes */
666 nigel 77 count++;
667 nigel 87
668 ph10 97 #ifndef EBCDIC /* ASCII coding */
669 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 ph10 97 #else /* EBCDIC coding */
672 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
674 nigel 77 #endif
675     }
676 nigel 87
677 nigel 77 if (*pt == '}')
678     {
679 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
680 nigel 77 ptr = pt;
681     break;
682     }
683 nigel 87
684 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
685     recognize this construct; fall through to the normal \x handling. */
686     }
687    
688 nigel 87 /* Read just a single-byte hex-defined char */
689 nigel 77
690     c = 0;
691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
692     {
693     int cc; /* Some compilers don't like ++ */
694     cc = *(++ptr); /* in initializers */
695 ph10 97 #ifndef EBCDIC /* ASCII coding */
696 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 ph10 97 #else /* EBCDIC coding */
699 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
700     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
701     #endif
702     }
703     break;
704    
705 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706     This coding is ASCII-specific, but then the whole concept of \cx is
707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
708 nigel 77
709     case 'c':
710     c = *(++ptr);
711     if (c == 0)
712     {
713     *errorcodeptr = ERR2;
714 ph10 213 break;
715 nigel 77 }
716    
717 ph10 97 #ifndef EBCDIC /* ASCII coding */
718 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
719     c ^= 0x40;
720 ph10 97 #else /* EBCDIC coding */
721 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
722     c ^= 0xC0;
723     #endif
724     break;
725    
726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
728     otherwise, for Perl compatibility, it is a literal. This code looks a bit
729     odd, but there used to be some cases other than the default, and there may
730     be again in future, so I haven't "optimized" it. */
731 nigel 77
732     default:
733     if ((options & PCRE_EXTRA) != 0) switch(c)
734     {
735     default:
736     *errorcodeptr = ERR3;
737     break;
738     }
739     break;
740     }
741     }
742    
743     *ptrptr = ptr;
744     return c;
745     }
746    
747    
748    
749     #ifdef SUPPORT_UCP
750     /*************************************************
751     * Handle \P and \p *
752     *************************************************/
753    
754     /* This function is called after \P or \p has been encountered, provided that
755     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756     pointing at the P or p. On exit, it is pointing at the final character of the
757     escape sequence.
758    
759     Argument:
760     ptrptr points to the pattern position pointer
761     negptr points to a boolean that is set TRUE for negation else FALSE
762 nigel 87 dptr points to an int that is set to the detailed property value
763 nigel 77 errorcodeptr points to the error code variable
764    
765 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
766 nigel 77 */
767    
768     static int
769 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
770 nigel 77 {
771     int c, i, bot, top;
772     const uschar *ptr = *ptrptr;
773 nigel 87 char name[32];
774 nigel 77
775     c = *(++ptr);
776     if (c == 0) goto ERROR_RETURN;
777    
778     *negptr = FALSE;
779    
780 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
781     negation. */
782 nigel 77
783     if (c == '{')
784     {
785     if (ptr[1] == '^')
786     {
787     *negptr = TRUE;
788     ptr++;
789     }
790 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
791 nigel 77 {
792     c = *(++ptr);
793     if (c == 0) goto ERROR_RETURN;
794     if (c == '}') break;
795     name[i] = c;
796     }
797 nigel 87 if (c !='}') goto ERROR_RETURN;
798 nigel 77 name[i] = 0;
799     }
800    
801     /* Otherwise there is just one following character */
802    
803     else
804     {
805     name[0] = c;
806     name[1] = 0;
807     }
808    
809     *ptrptr = ptr;
810    
811     /* Search for a recognized property name using binary chop */
812    
813     bot = 0;
814     top = _pcre_utt_size;
815    
816     while (bot < top)
817     {
818 nigel 87 i = (bot + top) >> 1;
819 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
820 nigel 87 if (c == 0)
821     {
822     *dptr = _pcre_utt[i].value;
823     return _pcre_utt[i].type;
824     }
825 nigel 77 if (c > 0) bot = i + 1; else top = i;
826     }
827    
828     *errorcodeptr = ERR47;
829     *ptrptr = ptr;
830     return -1;
831    
832     ERROR_RETURN:
833     *errorcodeptr = ERR46;
834     *ptrptr = ptr;
835     return -1;
836     }
837     #endif
838    
839    
840    
841    
842     /*************************************************
843     * Check for counted repeat *
844     *************************************************/
845    
846     /* This function is called when a '{' is encountered in a place where it might
847     start a quantifier. It looks ahead to see if it really is a quantifier or not.
848     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849     where the ddds are digits.
850    
851     Arguments:
852     p pointer to the first char after '{'
853    
854     Returns: TRUE or FALSE
855     */
856    
857     static BOOL
858     is_counted_repeat(const uschar *p)
859     {
860     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861     while ((digitab[*p] & ctype_digit) != 0) p++;
862     if (*p == '}') return TRUE;
863    
864     if (*p++ != ',') return FALSE;
865     if (*p == '}') return TRUE;
866    
867     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868     while ((digitab[*p] & ctype_digit) != 0) p++;
869    
870     return (*p == '}');
871     }
872    
873    
874    
875     /*************************************************
876     * Read repeat counts *
877     *************************************************/
878    
879     /* Read an item of the form {n,m} and return the values. This is called only
880     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881     so the syntax is guaranteed to be correct, but we need to check the values.
882    
883     Arguments:
884     p pointer to first char after '{'
885     minp pointer to int for min
886     maxp pointer to int for max
887     returned as -1 if no max
888     errorcodeptr points to error code variable
889    
890     Returns: pointer to '}' on success;
891     current ptr on error, with errorcodeptr set non-zero
892     */
893    
894     static const uschar *
895     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
896     {
897     int min = 0;
898     int max = -1;
899    
900 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
901     an integer overflow. */
902    
903 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 nigel 81 if (min < 0 || min > 65535)
905     {
906     *errorcodeptr = ERR5;
907     return p;
908     }
909 nigel 77
910 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
911     Also, max must not be less than min. */
912    
913 nigel 77 if (*p == '}') max = min; else
914     {
915     if (*(++p) != '}')
916     {
917     max = 0;
918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 nigel 81 if (max < 0 || max > 65535)
920     {
921     *errorcodeptr = ERR5;
922     return p;
923     }
924 nigel 77 if (max < min)
925     {
926     *errorcodeptr = ERR4;
927     return p;
928     }
929     }
930     }
931    
932 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
933     '}'. */
934 nigel 77
935 nigel 81 *minp = min;
936     *maxp = max;
937 nigel 77 return p;
938     }
939    
940    
941    
942     /*************************************************
943 nigel 93 * Find forward referenced subpattern *
944 nigel 91 *************************************************/
945    
946 nigel 93 /* This function scans along a pattern's text looking for capturing
947     subpatterns, and counting them. If it finds a named pattern that matches the
948     name it is given, it returns its number. Alternatively, if the name is NULL, it
949     returns when it reaches a given numbered subpattern. This is used for forward
950     references to subpatterns. We know that if (?P< is encountered, the name will
951     be terminated by '>' because that is checked in the first pass.
952 nigel 91
953     Arguments:
954 nigel 93 ptr current position in the pattern
955     count current count of capturing parens so far encountered
956     name name to seek, or NULL if seeking a numbered subpattern
957     lorn name length, or subpattern number if name is NULL
958     xmode TRUE if we are in /x mode
959 nigel 91
960     Returns: the number of the named subpattern, or -1 if not found
961     */
962    
963     static int
964 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
965     BOOL xmode)
966 nigel 91 {
967     const uschar *thisname;
968 nigel 93
969 nigel 91 for (; *ptr != 0; ptr++)
970     {
971 nigel 93 int term;
972    
973     /* Skip over backslashed characters and also entire \Q...\E */
974    
975     if (*ptr == '\\')
976     {
977     if (*(++ptr) == 0) return -1;
978     if (*ptr == 'Q') for (;;)
979     {
980     while (*(++ptr) != 0 && *ptr != '\\');
981     if (*ptr == 0) return -1;
982     if (*(++ptr) == 'E') break;
983     }
984     continue;
985     }
986    
987     /* Skip over character classes */
988    
989     if (*ptr == '[')
990     {
991     while (*(++ptr) != ']')
992     {
993 ph10 220 if (*ptr == 0) return -1;
994 nigel 93 if (*ptr == '\\')
995     {
996     if (*(++ptr) == 0) return -1;
997     if (*ptr == 'Q') for (;;)
998     {
999     while (*(++ptr) != 0 && *ptr != '\\');
1000     if (*ptr == 0) return -1;
1001     if (*(++ptr) == 'E') break;
1002     }
1003     continue;
1004     }
1005     }
1006     continue;
1007     }
1008    
1009     /* Skip comments in /x mode */
1010    
1011     if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017    
1018     /* An opening parens must now be a real metacharacter */
1019    
1020 nigel 91 if (*ptr != '(') continue;
1021 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1022 nigel 93 {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027    
1028     ptr += 2;
1029     if (*ptr == 'P') ptr++; /* Allow optional P */
1030    
1031     /* We have to disambiguate (?<! and (?<= from (?<name> */
1032    
1033     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034     *ptr != '\'')
1035     continue;
1036    
1037 nigel 91 count++;
1038 nigel 93
1039     if (name == NULL && count == lorn) return count;
1040     term = *ptr++;
1041     if (term == '<') term = '>';
1042 nigel 91 thisname = ptr;
1043 nigel 93 while (*ptr != term) ptr++;
1044     if (name != NULL && lorn == ptr - thisname &&
1045     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046 nigel 91 return count;
1047     }
1048 nigel 93
1049 nigel 91 return -1;
1050     }
1051    
1052    
1053    
1054     /*************************************************
1055 nigel 77 * Find first significant op code *
1056     *************************************************/
1057    
1058     /* This is called by several functions that scan a compiled expression looking
1059     for a fixed first character, or an anchoring op code etc. It skips over things
1060     that do not influence this. For some calls, a change of option is important.
1061     For some calls, it makes sense to skip negative forward and all backward
1062     assertions, and also the \b assertion; for others it does not.
1063    
1064     Arguments:
1065     code pointer to the start of the group
1066     options pointer to external options
1067     optbit the option bit whose changing is significant, or
1068     zero if none are
1069     skipassert TRUE if certain assertions are to be skipped
1070    
1071     Returns: pointer to the first significant opcode
1072     */
1073    
1074     static const uschar*
1075     first_significant_code(const uschar *code, int *options, int optbit,
1076     BOOL skipassert)
1077     {
1078     for (;;)
1079     {
1080     switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084     *options = (int)code[1];
1085     code += 2;
1086     break;
1087    
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095    
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100    
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103 nigel 93 case OP_RREF:
1104     case OP_DEF:
1105 nigel 77 code += _pcre_OP_lengths[*code];
1106     break;
1107    
1108     default:
1109     return code;
1110     }
1111     }
1112     /* Control never reaches here */
1113     }
1114    
1115    
1116    
1117    
1118     /*************************************************
1119     * Find the fixed length of a pattern *
1120     *************************************************/
1121    
1122     /* Scan a pattern and compute the fixed length of subject that will match it,
1123     if the length is fixed. This is needed for dealing with backward assertions.
1124     In UTF8 mode, the result is in characters rather than bytes.
1125    
1126     Arguments:
1127     code points to the start of the pattern (the bracket)
1128     options the compiling options
1129    
1130     Returns: the fixed length, or -1 if there is no fixed length,
1131     or -2 if \C was encountered
1132     */
1133    
1134     static int
1135     find_fixedlength(uschar *code, int options)
1136     {
1137     int length = -1;
1138    
1139     register int branchlength = 0;
1140     register uschar *cc = code + 1 + LINK_SIZE;
1141    
1142     /* Scan along the opcodes for this branch. If we get to the end of the
1143     branch, check the length against that of the other branches. */
1144    
1145     for (;;)
1146     {
1147     int d;
1148     register int op = *cc;
1149     switch (op)
1150     {
1151 nigel 93 case OP_CBRA:
1152 nigel 77 case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 nigel 77 if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161    
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165    
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172     else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177    
1178     /* Skip over assertive subpatterns */
1179    
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186    
1187     /* Skip over things that don't match chars */
1188    
1189     case OP_REVERSE:
1190     case OP_CREF:
1191 nigel 93 case OP_RREF:
1192     case OP_DEF:
1193 nigel 77 case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205    
1206     /* Handle literal characters */
1207    
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210 nigel 91 case OP_NOT:
1211 nigel 77 branchlength++;
1212     cc += 2;
1213     #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215     {
1216     while ((*cc & 0xc0) == 0x80) cc++;
1217     }
1218     #endif
1219     break;
1220    
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode. */
1223    
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227     #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229     {
1230     while((*cc & 0x80) == 0x80) cc++;
1231     }
1232     #endif
1233     break;
1234    
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238 nigel 77 cc += 4;
1239     break;
1240    
1241     /* Handle single-char matchers */
1242    
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245 nigel 87 cc += 2;
1246 nigel 77 /* Fall through */
1247    
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258    
1259     /* The single-byte matcher isn't allowed */
1260    
1261     case OP_ANYBYTE:
1262     return -2;
1263    
1264     /* Check a class for variable quantification */
1265    
1266     #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270     #endif
1271    
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275    
1276     switch (*cc)
1277     {
1278     case OP_CRSTAR:
1279     case OP_CRMINSTAR:
1280     case OP_CRQUERY:
1281     case OP_CRMINQUERY:
1282     return -1;
1283    
1284     case OP_CRRANGE:
1285     case OP_CRMINRANGE:
1286     if (GET2(cc,1) != GET2(cc,3)) return -1;
1287     branchlength += GET2(cc,1);
1288     cc += 5;
1289     break;
1290    
1291     default:
1292     branchlength++;
1293     }
1294     break;
1295    
1296     /* Anything else is variable length */
1297    
1298     default:
1299     return -1;
1300     }
1301     }
1302     /* Control never gets here */
1303     }
1304    
1305    
1306    
1307    
1308     /*************************************************
1309     * Scan compiled regex for numbered bracket *
1310     *************************************************/
1311    
1312     /* This little function scans through a compiled pattern until it finds a
1313     capturing bracket with the given number.
1314    
1315     Arguments:
1316     code points to start of expression
1317     utf8 TRUE in UTF-8 mode
1318     number the required bracket number
1319    
1320     Returns: pointer to the opcode for the bracket, or NULL if not found
1321     */
1322    
1323     static const uschar *
1324     find_bracket(const uschar *code, BOOL utf8, int number)
1325     {
1326     for (;;)
1327     {
1328     register int c = *code;
1329     if (c == OP_END) return NULL;
1330 nigel 91
1331     /* XCLASS is used for classes that cannot be represented just by a bit
1332     map. This includes negated single high-valued characters. The length in
1333     the table is zero; the actual length is stored in the compiled code. */
1334    
1335     if (c == OP_XCLASS) code += GET(code, 1);
1336    
1337 nigel 93 /* Handle capturing bracket */
1338 nigel 91
1339 nigel 93 else if (c == OP_CBRA)
1340 nigel 77 {
1341 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1342 nigel 77 if (n == number) return (uschar *)code;
1343 nigel 93 code += _pcre_OP_lengths[c];
1344 nigel 77 }
1345 nigel 91
1346 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1347     repeated character types, we have to test for \p and \P, which have an extra
1348 ph10 218 two bytes of parameters. */
1349 nigel 91
1350 nigel 77 else
1351     {
1352 ph10 218 switch(c)
1353     {
1354     case OP_TYPESTAR:
1355     case OP_TYPEMINSTAR:
1356     case OP_TYPEPLUS:
1357     case OP_TYPEMINPLUS:
1358     case OP_TYPEQUERY:
1359     case OP_TYPEMINQUERY:
1360     case OP_TYPEPOSSTAR:
1361     case OP_TYPEPOSPLUS:
1362     case OP_TYPEPOSQUERY:
1363     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364 ph10 220 break;
1365 ph10 221
1366     case OP_TYPEUPTO:
1367     case OP_TYPEMINUPTO:
1368     case OP_TYPEEXACT:
1369     case OP_TYPEPOSUPTO:
1370     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371     break;
1372 ph10 220 }
1373    
1374 ph10 218 /* Add in the fixed length from the table */
1375 ph10 220
1376 nigel 77 code += _pcre_OP_lengths[c];
1377 ph10 220
1378 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379     a multi-byte character. The length in the table is a minimum, so we have to
1380     arrange to skip the extra bytes. */
1381 ph10 220
1382 ph10 107 #ifdef SUPPORT_UTF8
1383 nigel 77 if (utf8) switch(c)
1384     {
1385     case OP_CHAR:
1386     case OP_CHARNC:
1387     case OP_EXACT:
1388     case OP_UPTO:
1389     case OP_MINUPTO:
1390 nigel 93 case OP_POSUPTO:
1391 nigel 77 case OP_STAR:
1392     case OP_MINSTAR:
1393 nigel 93 case OP_POSSTAR:
1394 nigel 77 case OP_PLUS:
1395     case OP_MINPLUS:
1396 nigel 93 case OP_POSPLUS:
1397 nigel 77 case OP_QUERY:
1398     case OP_MINQUERY:
1399 nigel 93 case OP_POSQUERY:
1400     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401 nigel 77 break;
1402     }
1403 ph10 111 #endif
1404 nigel 77 }
1405     }
1406     }
1407    
1408    
1409    
1410     /*************************************************
1411     * Scan compiled regex for recursion reference *
1412     *************************************************/
1413    
1414     /* This little function scans through a compiled pattern until it finds an
1415     instance of OP_RECURSE.
1416    
1417     Arguments:
1418     code points to start of expression
1419     utf8 TRUE in UTF-8 mode
1420    
1421     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_recurse(const uschar *code, BOOL utf8)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91 if (c == OP_RECURSE) return code;
1432 ph10 220
1433 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1434     map. This includes negated single high-valued characters. The length in
1435     the table is zero; the actual length is stored in the compiled code. */
1436    
1437     if (c == OP_XCLASS) code += GET(code, 1);
1438    
1439 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1440     repeated character types, we have to test for \p and \P, which have an extra
1441 ph10 218 two bytes of parameters. */
1442 nigel 91
1443 nigel 77 else
1444     {
1445 ph10 218 switch(c)
1446     {
1447     case OP_TYPESTAR:
1448     case OP_TYPEMINSTAR:
1449     case OP_TYPEPLUS:
1450     case OP_TYPEMINPLUS:
1451     case OP_TYPEQUERY:
1452     case OP_TYPEMINQUERY:
1453     case OP_TYPEPOSSTAR:
1454     case OP_TYPEPOSPLUS:
1455     case OP_TYPEPOSQUERY:
1456     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457 ph10 220 break;
1458 ph10 221
1459     case OP_TYPEPOSUPTO:
1460     case OP_TYPEUPTO:
1461     case OP_TYPEMINUPTO:
1462     case OP_TYPEEXACT:
1463     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464     break;
1465 ph10 220 }
1466    
1467 ph10 218 /* Add in the fixed length from the table */
1468    
1469 nigel 77 code += _pcre_OP_lengths[c];
1470 ph10 220
1471 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474 ph10 220
1475 ph10 107 #ifdef SUPPORT_UTF8
1476 nigel 77 if (utf8) switch(c)
1477     {
1478     case OP_CHAR:
1479     case OP_CHARNC:
1480     case OP_EXACT:
1481     case OP_UPTO:
1482     case OP_MINUPTO:
1483 nigel 93 case OP_POSUPTO:
1484 nigel 77 case OP_STAR:
1485     case OP_MINSTAR:
1486 nigel 93 case OP_POSSTAR:
1487 nigel 77 case OP_PLUS:
1488     case OP_MINPLUS:
1489 nigel 93 case OP_POSPLUS:
1490 nigel 77 case OP_QUERY:
1491     case OP_MINQUERY:
1492 nigel 93 case OP_POSQUERY:
1493     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494 nigel 77 break;
1495     }
1496 ph10 111 #endif
1497 nigel 77 }
1498     }
1499     }
1500    
1501    
1502    
1503     /*************************************************
1504     * Scan compiled branch for non-emptiness *
1505     *************************************************/
1506    
1507     /* This function scans through a branch of a compiled pattern to see whether it
1508 nigel 93 can match the empty string or not. It is called from could_be_empty()
1509     below and from compile_branch() when checking for an unlimited repeat of a
1510     group that can match nothing. Note that first_significant_code() skips over
1511 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1512     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1513     bracket whose current branch will already have been scanned.
1514 nigel 77
1515     Arguments:
1516     code points to start of search
1517     endcode points to where to stop
1518     utf8 TRUE if in UTF8 mode
1519    
1520     Returns: TRUE if what is matched could be empty
1521     */
1522    
1523     static BOOL
1524     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1525     {
1526     register int c;
1527 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 nigel 77 code < endcode;
1529     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530     {
1531     const uschar *ccode;
1532    
1533     c = *code;
1534 ph10 282
1535     /* Skip over forward assertions; the other assertions are skipped by
1536     first_significant_code() with a TRUE final argument. */
1537    
1538     if (c == OP_ASSERT)
1539     {
1540     do code += GET(code, 1); while (*code == OP_ALT);
1541     c = *code;
1542     continue;
1543     }
1544 ph10 172
1545 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1546 nigel 77
1547 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1548     {
1549 ph10 172 code += _pcre_OP_lengths[c];
1550 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1551     c = *code;
1552     continue;
1553     }
1554    
1555     /* For other groups, scan the branches. */
1556 ph10 172
1557 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1558 nigel 77 {
1559     BOOL empty_branch;
1560     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1561    
1562     /* Scan a closed bracket */
1563    
1564     empty_branch = FALSE;
1565     do
1566     {
1567     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1568     empty_branch = TRUE;
1569     code += GET(code, 1);
1570     }
1571     while (*code == OP_ALT);
1572     if (!empty_branch) return FALSE; /* All branches are non-empty */
1573 ph10 172 c = *code;
1574 nigel 93 continue;
1575 nigel 77 }
1576    
1577 nigel 93 /* Handle the other opcodes */
1578    
1579     switch (c)
1580 nigel 77 {
1581 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1582     cannot be represented just by a bit map. This includes negated single
1583     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1584 ph10 220 actual length is stored in the compiled code, so we must update "code"
1585 ph10 216 here. */
1586 nigel 77
1587     #ifdef SUPPORT_UTF8
1588     case OP_XCLASS:
1589 ph10 216 ccode = code += GET(code, 1);
1590 nigel 77 goto CHECK_CLASS_REPEAT;
1591     #endif
1592    
1593     case OP_CLASS:
1594     case OP_NCLASS:
1595     ccode = code + 33;
1596    
1597     #ifdef SUPPORT_UTF8
1598     CHECK_CLASS_REPEAT:
1599     #endif
1600    
1601     switch (*ccode)
1602     {
1603     case OP_CRSTAR: /* These could be empty; continue */
1604     case OP_CRMINSTAR:
1605     case OP_CRQUERY:
1606     case OP_CRMINQUERY:
1607     break;
1608    
1609     default: /* Non-repeat => class must match */
1610     case OP_CRPLUS: /* These repeats aren't empty */
1611     case OP_CRMINPLUS:
1612     return FALSE;
1613    
1614     case OP_CRRANGE:
1615     case OP_CRMINRANGE:
1616     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1617     break;
1618     }
1619     break;
1620    
1621     /* Opcodes that must match a character */
1622    
1623     case OP_PROP:
1624     case OP_NOTPROP:
1625     case OP_EXTUNI:
1626     case OP_NOT_DIGIT:
1627     case OP_DIGIT:
1628     case OP_NOT_WHITESPACE:
1629     case OP_WHITESPACE:
1630     case OP_NOT_WORDCHAR:
1631     case OP_WORDCHAR:
1632     case OP_ANY:
1633     case OP_ANYBYTE:
1634     case OP_CHAR:
1635     case OP_CHARNC:
1636     case OP_NOT:
1637     case OP_PLUS:
1638     case OP_MINPLUS:
1639 nigel 93 case OP_POSPLUS:
1640 nigel 77 case OP_EXACT:
1641     case OP_NOTPLUS:
1642     case OP_NOTMINPLUS:
1643 nigel 93 case OP_NOTPOSPLUS:
1644 nigel 77 case OP_NOTEXACT:
1645     case OP_TYPEPLUS:
1646     case OP_TYPEMINPLUS:
1647 nigel 93 case OP_TYPEPOSPLUS:
1648 nigel 77 case OP_TYPEEXACT:
1649     return FALSE;
1650 ph10 227
1651     /* These are going to continue, as they may be empty, but we have to
1652     fudge the length for the \p and \P cases. */
1653    
1654 ph10 224 case OP_TYPESTAR:
1655     case OP_TYPEMINSTAR:
1656     case OP_TYPEPOSSTAR:
1657     case OP_TYPEQUERY:
1658     case OP_TYPEMINQUERY:
1659     case OP_TYPEPOSQUERY:
1660     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1661 ph10 227 break;
1662    
1663 ph10 224 /* Same for these */
1664 ph10 227
1665 ph10 224 case OP_TYPEUPTO:
1666     case OP_TYPEMINUPTO:
1667     case OP_TYPEPOSUPTO:
1668     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1669     break;
1670 nigel 77
1671     /* End of branch */
1672    
1673     case OP_KET:
1674     case OP_KETRMAX:
1675     case OP_KETRMIN:
1676     case OP_ALT:
1677     return TRUE;
1678    
1679 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1680     MINUPTO, and POSUPTO may be followed by a multibyte character */
1681 nigel 77
1682     #ifdef SUPPORT_UTF8
1683     case OP_STAR:
1684     case OP_MINSTAR:
1685 nigel 93 case OP_POSSTAR:
1686 nigel 77 case OP_QUERY:
1687     case OP_MINQUERY:
1688 nigel 93 case OP_POSQUERY:
1689 nigel 77 case OP_UPTO:
1690     case OP_MINUPTO:
1691 nigel 93 case OP_POSUPTO:
1692 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1693     break;
1694     #endif
1695     }
1696     }
1697    
1698     return TRUE;
1699     }
1700    
1701    
1702    
1703     /*************************************************
1704     * Scan compiled regex for non-emptiness *
1705     *************************************************/
1706    
1707     /* This function is called to check for left recursive calls. We want to check
1708     the current branch of the current pattern to see if it could match the empty
1709     string. If it could, we must look outwards for branches at other levels,
1710     stopping when we pass beyond the bracket which is the subject of the recursion.
1711    
1712     Arguments:
1713     code points to start of the recursion
1714     endcode points to where to stop (current RECURSE item)
1715     bcptr points to the chain of current (unclosed) branch starts
1716     utf8 TRUE if in UTF-8 mode
1717    
1718     Returns: TRUE if what is matched could be empty
1719     */
1720    
1721     static BOOL
1722     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1723     BOOL utf8)
1724     {
1725     while (bcptr != NULL && bcptr->current >= code)
1726     {
1727     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1728     bcptr = bcptr->outer;
1729     }
1730     return TRUE;
1731     }
1732    
1733    
1734    
1735     /*************************************************
1736     * Check for POSIX class syntax *
1737     *************************************************/
1738    
1739     /* This function is called when the sequence "[:" or "[." or "[=" is
1740     encountered in a character class. It checks whether this is followed by an
1741     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1742     ".]" or "=]".
1743    
1744     Argument:
1745     ptr pointer to the initial [
1746     endptr where to return the end pointer
1747     cd pointer to compile data
1748    
1749     Returns: TRUE or FALSE
1750     */
1751    
1752     static BOOL
1753     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1754     {
1755     int terminator; /* Don't combine these lines; the Solaris cc */
1756     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1757     if (*(++ptr) == '^') ptr++;
1758     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1759     if (*ptr == terminator && ptr[1] == ']')
1760     {
1761     *endptr = ptr;
1762     return TRUE;
1763     }
1764     return FALSE;
1765     }
1766    
1767    
1768    
1769    
1770     /*************************************************
1771     * Check POSIX class name *
1772     *************************************************/
1773    
1774     /* This function is called to check the name given in a POSIX-style class entry
1775     such as [:alnum:].
1776    
1777     Arguments:
1778     ptr points to the first letter
1779     len the length of the name
1780    
1781     Returns: a value representing the name, or -1 if unknown
1782     */
1783    
1784     static int
1785     check_posix_name(const uschar *ptr, int len)
1786     {
1787 ph10 240 const char *pn = posix_names;
1788 nigel 77 register int yield = 0;
1789     while (posix_name_lengths[yield] != 0)
1790     {
1791     if (len == posix_name_lengths[yield] &&
1792 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1793 ph10 243 pn += posix_name_lengths[yield] + 1;
1794 nigel 77 yield++;
1795     }
1796     return -1;
1797     }
1798    
1799    
1800     /*************************************************
1801     * Adjust OP_RECURSE items in repeated group *
1802     *************************************************/
1803    
1804     /* OP_RECURSE items contain an offset from the start of the regex to the group
1805     that is referenced. This means that groups can be replicated for fixed
1806     repetition simply by copying (because the recursion is allowed to refer to
1807     earlier groups that are outside the current group). However, when a group is
1808     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1809     it, after it has been compiled. This means that any OP_RECURSE items within it
1810     that refer to the group itself or any contained groups have to have their
1811 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1812     the partially compiled regex must be temporarily terminated with OP_END.
1813 nigel 77
1814 nigel 93 This function has been extended with the possibility of forward references for
1815     recursions and subroutine calls. It must also check the list of such references
1816     for the group we are dealing with. If it finds that one of the recursions in
1817     the current group is on this list, it adjusts the offset in the list, not the
1818     value in the reference (which is a group number).
1819    
1820 nigel 77 Arguments:
1821     group points to the start of the group
1822     adjust the amount by which the group is to be moved
1823     utf8 TRUE in UTF-8 mode
1824     cd contains pointers to tables etc.
1825 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1826 nigel 77
1827     Returns: nothing
1828     */
1829    
1830     static void
1831 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1832     uschar *save_hwm)
1833 nigel 77 {
1834     uschar *ptr = group;
1835 ph10 224
1836 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1837     {
1838 nigel 93 int offset;
1839     uschar *hc;
1840    
1841     /* See if this recursion is on the forward reference list. If so, adjust the
1842     reference. */
1843    
1844     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1845     {
1846     offset = GET(hc, 0);
1847     if (cd->start_code + offset == ptr + 1)
1848     {
1849     PUT(hc, 0, offset + adjust);
1850     break;
1851     }
1852     }
1853    
1854     /* Otherwise, adjust the recursion offset if it's after the start of this
1855     group. */
1856    
1857     if (hc >= cd->hwm)
1858     {
1859     offset = GET(ptr, 1);
1860     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1861     }
1862    
1863 nigel 77 ptr += 1 + LINK_SIZE;
1864     }
1865     }
1866    
1867    
1868    
1869     /*************************************************
1870     * Insert an automatic callout point *
1871     *************************************************/
1872    
1873     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1874     callout points before each pattern item.
1875    
1876     Arguments:
1877     code current code pointer
1878     ptr current pattern pointer
1879     cd pointers to tables etc
1880    
1881     Returns: new code pointer
1882     */
1883    
1884     static uschar *
1885     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1886     {
1887     *code++ = OP_CALLOUT;
1888     *code++ = 255;
1889     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1890     PUT(code, LINK_SIZE, 0); /* Default length */
1891     return code + 2*LINK_SIZE;
1892     }
1893    
1894    
1895    
1896     /*************************************************
1897     * Complete a callout item *
1898     *************************************************/
1899    
1900     /* A callout item contains the length of the next item in the pattern, which
1901     we can't fill in till after we have reached the relevant point. This is used
1902     for both automatic and manual callouts.
1903    
1904     Arguments:
1905     previous_callout points to previous callout item
1906     ptr current pattern pointer
1907     cd pointers to tables etc
1908    
1909     Returns: nothing
1910     */
1911    
1912     static void
1913     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1914     {
1915     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1916     PUT(previous_callout, 2 + LINK_SIZE, length);
1917     }
1918    
1919    
1920    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It skips characters that have no other case, then collects
the longest run of following characters whose other cases are consecutive
values. Each call returns one such run and updates the start value so that the
next call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = NOTACHAR;
unsigned int expected;

/* Find the first character in the range that has an other case */

while (c <= d)
  {
  first_other = _pcre_ucp_othercase(c);
  if (first_other != NOTACHAR) break;
  c++;
  }

if (c > d) return FALSE;

/* Extend the run for as long as the other-case values stay consecutive */

*ocptr = first_other;
expected = first_other + 1;
c++;

while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
1966    
1967    
1968 nigel 93
1969 nigel 77 /*************************************************
1970 nigel 93 * Check if auto-possessifying is possible *
1971     *************************************************/
1972    
1973     /* This function is called for unlimited repeats of certain items, to see
1974     whether the next thing could possibly match the repeated item. If not, it makes
1975     sense to automatically possessify the repeated item.
1976    
1977     Arguments:
1978     op_code the repeated op code
1979     this data for this item, depends on the opcode
1980     utf8 TRUE in UTF-8 mode
1981     utf8_char used for utf8 character bytes, NULL if not relevant
1982     ptr next character in pattern
1983     options options bits
1984     cd contains pointers to tables etc.
1985    
1986     Returns: TRUE if possessifying is wanted
1987     */
1988    
1989     static BOOL
1990     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1991     const uschar *ptr, int options, compile_data *cd)
1992     {
1993     int next;
1994    
1995     /* Skip whitespace and comments in extended mode */
1996    
1997     if ((options & PCRE_EXTENDED) != 0)
1998     {
1999     for (;;)
2000     {
2001     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2002     if (*ptr == '#')
2003     {
2004     while (*(++ptr) != 0)
2005     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2006     }
2007     else break;
2008     }
2009     }
2010    
2011     /* If the next item is one that we can handle, get its value. A non-negative
2012     value is a character, a negative value is an escape value. */
2013    
2014     if (*ptr == '\\')
2015     {
2016     int temperrorcode = 0;
2017     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2018     if (temperrorcode != 0) return FALSE;
2019     ptr++; /* Point after the escape sequence */
2020     }
2021    
2022     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2023     {
2024     #ifdef SUPPORT_UTF8
2025     if (utf8) { GETCHARINC(next, ptr); } else
2026     #endif
2027     next = *ptr++;
2028     }
2029    
2030     else return FALSE;
2031    
2032     /* Skip whitespace and comments in extended mode */
2033    
2034     if ((options & PCRE_EXTENDED) != 0)
2035     {
2036     for (;;)
2037     {
2038     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2039     if (*ptr == '#')
2040     {
2041     while (*(++ptr) != 0)
2042     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2043     }
2044     else break;
2045     }
2046     }
2047    
2048     /* If the next thing is itself optional, we have to give up. */
2049    
2050     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2051     return FALSE;
2052    
2053     /* Now compare the next item with the previous opcode. If the previous is a
2054     positive single character match, "item" either contains the character or, if
2055     "item" is greater than 127 in utf8 mode, the character's bytes are in
2056     utf8_char. */
2057    
2058    
2059     /* Handle cases when the next item is a character. */
2060    
2061     if (next >= 0) switch(op_code)
2062     {
2063     case OP_CHAR:
2064     #ifdef SUPPORT_UTF8
2065     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2066     #endif
2067     return item != next;
2068    
2069     /* For CHARNC (caseless character) we must check the other case. If we have
2070     Unicode property support, we can use it to test the other case of
2071     high-valued characters. */
2072    
2073     case OP_CHARNC:
2074     #ifdef SUPPORT_UTF8
2075     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2076     #endif
2077     if (item == next) return FALSE;
2078     #ifdef SUPPORT_UTF8
2079     if (utf8)
2080     {
2081     unsigned int othercase;
2082     if (next < 128) othercase = cd->fcc[next]; else
2083     #ifdef SUPPORT_UCP
2084     othercase = _pcre_ucp_othercase((unsigned int)next);
2085     #else
2086     othercase = NOTACHAR;
2087     #endif
2088     return (unsigned int)item != othercase;
2089     }
2090     else
2091     #endif /* SUPPORT_UTF8 */
2092     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2093    
2094     /* For OP_NOT, "item" must be a single-byte character. */
2095    
2096     case OP_NOT:
2097     if (next < 0) return FALSE; /* Not a character */
2098     if (item == next) return TRUE;
2099     if ((options & PCRE_CASELESS) == 0) return FALSE;
2100     #ifdef SUPPORT_UTF8
2101     if (utf8)
2102     {
2103     unsigned int othercase;
2104     if (next < 128) othercase = cd->fcc[next]; else
2105     #ifdef SUPPORT_UCP
2106     othercase = _pcre_ucp_othercase(next);
2107     #else
2108     othercase = NOTACHAR;
2109     #endif
2110     return (unsigned int)item == othercase;
2111     }
2112     else
2113     #endif /* SUPPORT_UTF8 */
2114     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2115    
2116     case OP_DIGIT:
2117     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2118    
2119     case OP_NOT_DIGIT:
2120     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2121    
2122     case OP_WHITESPACE:
2123     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2124    
2125     case OP_NOT_WHITESPACE:
2126     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2127    
2128     case OP_WORDCHAR:
2129     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2130    
2131     case OP_NOT_WORDCHAR:
2132     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2133    
2134 ph10 180 case OP_HSPACE:
2135     case OP_NOT_HSPACE:
2136     switch(next)
2137     {
2138     case 0x09:
2139     case 0x20:
2140     case 0xa0:
2141     case 0x1680:
2142     case 0x180e:
2143     case 0x2000:
2144     case 0x2001:
2145     case 0x2002:
2146     case 0x2003:
2147     case 0x2004:
2148     case 0x2005:
2149     case 0x2006:
2150     case 0x2007:
2151     case 0x2008:
2152     case 0x2009:
2153     case 0x200A:
2154     case 0x202f:
2155     case 0x205f:
2156     case 0x3000:
2157     return op_code != OP_HSPACE;
2158     default:
2159     return op_code == OP_HSPACE;
2160     }
2161    
2162     case OP_VSPACE:
2163     case OP_NOT_VSPACE:
2164     switch(next)
2165     {
2166     case 0x0a:
2167     case 0x0b:
2168     case 0x0c:
2169     case 0x0d:
2170     case 0x85:
2171     case 0x2028:
2172     case 0x2029:
2173     return op_code != OP_VSPACE;
2174     default:
2175     return op_code == OP_VSPACE;
2176     }
2177    
2178 nigel 93 default:
2179     return FALSE;
2180     }
2181    
2182    
2183     /* Handle the case when the next item is \d, \s, etc. */
2184    
2185     switch(op_code)
2186     {
2187     case OP_CHAR:
2188     case OP_CHARNC:
2189     #ifdef SUPPORT_UTF8
2190     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2191     #endif
2192     switch(-next)
2193     {
2194     case ESC_d:
2195     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2196    
2197     case ESC_D:
2198     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2199    
2200     case ESC_s:
2201     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2202    
2203     case ESC_S:
2204     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2205    
2206     case ESC_w:
2207     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2208    
2209     case ESC_W:
2210     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2211 ph10 182
2212 ph10 180 case ESC_h:
2213     case ESC_H:
2214     switch(item)
2215     {
2216     case 0x09:
2217     case 0x20:
2218     case 0xa0:
2219     case 0x1680:
2220     case 0x180e:
2221     case 0x2000:
2222     case 0x2001:
2223     case 0x2002:
2224     case 0x2003:
2225     case 0x2004:
2226     case 0x2005:
2227     case 0x2006:
2228     case 0x2007:
2229     case 0x2008:
2230     case 0x2009:
2231     case 0x200A:
2232     case 0x202f:
2233     case 0x205f:
2234     case 0x3000:
2235     return -next != ESC_h;
2236     default:
2237     return -next == ESC_h;
2238 ph10 182 }
2239    
2240 ph10 180 case ESC_v:
2241     case ESC_V:
2242     switch(item)
2243     {
2244     case 0x0a:
2245     case 0x0b:
2246     case 0x0c:
2247     case 0x0d:
2248     case 0x85:
2249     case 0x2028:
2250     case 0x2029:
2251     return -next != ESC_v;
2252     default:
2253     return -next == ESC_v;
2254 ph10 182 }
2255 nigel 93
2256     default:
2257     return FALSE;
2258     }
2259    
2260     case OP_DIGIT:
2261 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2262     next == -ESC_h || next == -ESC_v;
2263 nigel 93
2264     case OP_NOT_DIGIT:
2265     return next == -ESC_d;
2266    
2267     case OP_WHITESPACE:
2268     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2269    
2270     case OP_NOT_WHITESPACE:
2271 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2272 nigel 93
2273 ph10 180 case OP_HSPACE:
2274     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2275    
2276     case OP_NOT_HSPACE:
2277     return next == -ESC_h;
2278 ph10 182
2279 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2280 ph10 182 case OP_VSPACE:
2281 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2282    
2283     case OP_NOT_VSPACE:
2284 ph10 182 return next == -ESC_v;
2285 ph10 180
2286 nigel 93 case OP_WORDCHAR:
2287 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2288 nigel 93
2289     case OP_NOT_WORDCHAR:
2290     return next == -ESC_w || next == -ESC_d;
2291 ph10 182
2292 nigel 93 default:
2293     return FALSE;
2294     }
2295    
2296     /* Control does not reach here */
2297     }
2298    
2299    
2300    
2301     /*************************************************
2302 nigel 77 * Compile one branch *
2303     *************************************************/
2304    
2305 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2306 nigel 77 changed during the branch, the pointer is used to change the external options
2307 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2308     to find out the amount of memory needed, as well as during the real compile
2309     phase. The value of lengthptr distinguishes the two phases.
2310 nigel 77
2311     Arguments:
2312     optionsptr pointer to the option bits
2313     codeptr points to the pointer to the current code point
2314     ptrptr points to the current pattern pointer
2315     errorcodeptr points to error code variable
2316     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2317     reqbyteptr set to the last literal character required, else < 0
2318     bcptr points to current branch chain
2319     cd contains pointers to tables etc.
2320 nigel 93 lengthptr NULL during the real compile phase
2321     points to length accumulator during pre-compile phase
2322 nigel 77
2323     Returns: TRUE on success
2324     FALSE, with *errorcodeptr set non-zero on error
2325     */
2326    
2327     static BOOL
2328 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2329     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2330     compile_data *cd, int *lengthptr)
2331 nigel 77 {
2332     int repeat_type, op_type;
2333     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2334     int bravalue = 0;
2335     int greedy_default, greedy_non_default;
2336     int firstbyte, reqbyte;
2337     int zeroreqbyte, zerofirstbyte;
2338     int req_caseopt, reqvary, tempreqvary;
2339     int options = *optionsptr;
2340     int after_manual_callout = 0;
2341 nigel 93 int length_prevgroup = 0;
2342 nigel 77 register int c;
2343     register uschar *code = *codeptr;
2344 nigel 93 uschar *last_code = code;
2345     uschar *orig_code = code;
2346 nigel 77 uschar *tempcode;
2347     BOOL inescq = FALSE;
2348     BOOL groupsetfirstbyte = FALSE;
2349     const uschar *ptr = *ptrptr;
2350     const uschar *tempptr;
2351     uschar *previous = NULL;
2352     uschar *previous_callout = NULL;
2353 nigel 93 uschar *save_hwm = NULL;
2354 nigel 77 uschar classbits[32];
2355    
2356     #ifdef SUPPORT_UTF8
2357     BOOL class_utf8;
2358     BOOL utf8 = (options & PCRE_UTF8) != 0;
2359     uschar *class_utf8data;
2360     uschar utf8_char[6];
2361     #else
2362     BOOL utf8 = FALSE;
2363 nigel 93 uschar *utf8_char = NULL;
2364 nigel 77 #endif
2365    
2366 nigel 93 #ifdef DEBUG
2367     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2368     #endif
2369    
2370 nigel 77 /* Set up the default and non-default settings for greediness */
2371    
2372     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2373     greedy_non_default = greedy_default ^ 1;
2374    
2375     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2376     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2377     matches a non-fixed char first char; reqbyte just remains unset if we never
2378     find one.
2379    
2380     When we hit a repeat whose minimum is zero, we may have to adjust these values
2381     to take the zero repeat into account. This is implemented by setting them to
2382     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2383     item types that can be repeated set these backoff variables appropriately. */
2384    
2385     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2386    
2387     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2388     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2389     value > 255. It is added into the firstbyte or reqbyte variables to record the
2390     case status of the value. This is used only for ASCII characters. */
2391    
2392     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2393    
2394     /* Switch on next character until the end of the branch */
2395    
2396     for (;; ptr++)
2397     {
2398     BOOL negate_class;
2399 ph10 264 BOOL should_flip_negation;
2400 nigel 77 BOOL possessive_quantifier;
2401     BOOL is_quantifier;
2402 nigel 93 BOOL is_recurse;
2403 ph10 180 BOOL reset_bracount;
2404 nigel 77 int class_charcount;
2405     int class_lastchar;
2406     int newoptions;
2407     int recno;
2408 ph10 172 int refsign;
2409 nigel 77 int skipbytes;
2410     int subreqbyte;
2411     int subfirstbyte;
2412 nigel 93 int terminator;
2413 nigel 77 int mclength;
2414     uschar mcbuffer[8];
2415    
2416 nigel 93 /* Get next byte in the pattern */
2417 nigel 77
2418     c = *ptr;
2419    
2420 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2421     previous cycle of this loop. */
2422    
2423     if (lengthptr != NULL)
2424     {
2425     #ifdef DEBUG
2426     if (code > cd->hwm) cd->hwm = code; /* High water info */
2427     #endif
2428     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2429     {
2430     *errorcodeptr = ERR52;
2431     goto FAILED;
2432     }
2433    
2434     /* There is at least one situation where code goes backwards: this is the
2435     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2436     the class is simply eliminated. However, it is created first, so we have to
2437     allow memory for it. Therefore, don't ever reduce the length at this point.
2438     */
2439    
2440     if (code < last_code) code = last_code;
2441 ph10 202
2442     /* Paranoid check for integer overflow */
2443    
2444     if (OFLOW_MAX - *lengthptr < code - last_code)
2445     {
2446     *errorcodeptr = ERR20;
2447     goto FAILED;
2448     }
2449    
2450 nigel 93 *lengthptr += code - last_code;
2451     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2452    
2453     /* If "previous" is set and it is not at the start of the work space, move
2454     it back to there, in order to avoid filling up the work space. Otherwise,
2455     if "previous" is NULL, reset the current code pointer to the start. */
2456    
2457     if (previous != NULL)
2458     {
2459     if (previous > orig_code)
2460     {
2461     memmove(orig_code, previous, code - previous);
2462     code -= previous - orig_code;
2463     previous = orig_code;
2464     }
2465     }
2466     else code = orig_code;
2467    
2468     /* Remember where this code item starts so we can pick up the length
2469     next time round. */
2470    
2471     last_code = code;
2472     }
2473    
2474     /* In the real compile phase, just check the workspace used by the forward
2475     reference list. */
2476    
2477     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2478     {
2479     *errorcodeptr = ERR52;
2480     goto FAILED;
2481     }
2482    
2483 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2484    
2485     if (inescq && c != 0)
2486     {
2487     if (c == '\\' && ptr[1] == 'E')
2488     {
2489     inescq = FALSE;
2490     ptr++;
2491     continue;
2492     }
2493     else
2494     {
2495     if (previous_callout != NULL)
2496     {
2497 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2498     complete_callout(previous_callout, ptr, cd);
2499 nigel 77 previous_callout = NULL;
2500     }
2501     if ((options & PCRE_AUTO_CALLOUT) != 0)
2502     {
2503     previous_callout = code;
2504     code = auto_callout(code, ptr, cd);
2505     }
2506     goto NORMAL_CHAR;
2507     }
2508     }
2509    
2510     /* Fill in length of a previous callout, except when the next thing is
2511     a quantifier. */
2512    
2513     is_quantifier = c == '*' || c == '+' || c == '?' ||
2514     (c == '{' && is_counted_repeat(ptr+1));
2515    
2516     if (!is_quantifier && previous_callout != NULL &&
2517     after_manual_callout-- <= 0)
2518     {
2519 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2520     complete_callout(previous_callout, ptr, cd);
2521 nigel 77 previous_callout = NULL;
2522     }
2523    
2524     /* In extended mode, skip white space and comments */
2525    
2526     if ((options & PCRE_EXTENDED) != 0)
2527     {
2528     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2529     if (c == '#')
2530     {
2531 nigel 93 while (*(++ptr) != 0)
2532 nigel 91 {
2533 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2534 nigel 91 }
2535 nigel 93 if (*ptr != 0) continue;
2536    
2537 nigel 91 /* Else fall through to handle end of string */
2538     c = 0;
2539 nigel 77 }
2540     }
2541    
2542     /* No auto callout for quantifiers. */
2543    
2544     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2545     {
2546     previous_callout = code;
2547     code = auto_callout(code, ptr, cd);
2548     }
2549    
2550     switch(c)
2551     {
2552 nigel 93 /* ===================================================================*/
2553     case 0: /* The branch terminates at string end */
2554     case '|': /* or | or ) */
2555 nigel 77 case ')':
2556     *firstbyteptr = firstbyte;
2557     *reqbyteptr = reqbyte;
2558     *codeptr = code;
2559     *ptrptr = ptr;
2560 nigel 93 if (lengthptr != NULL)
2561     {
2562 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2563     {
2564     *errorcodeptr = ERR20;
2565     goto FAILED;
2566     }
2567 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2568     DPRINTF((">> end branch\n"));
2569     }
2570 nigel 77 return TRUE;
2571    
2572 nigel 93
2573     /* ===================================================================*/
2574 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2575     the setting of any following char as a first character. */
2576    
2577     case '^':
2578     if ((options & PCRE_MULTILINE) != 0)
2579     {
2580     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2581     }
2582     previous = NULL;
2583     *code++ = OP_CIRC;
2584     break;
2585    
2586     case '$':
2587     previous = NULL;
2588     *code++ = OP_DOLL;
2589     break;
2590    
2591     /* There can never be a first char if '.' is first, whatever happens about
2592     repeats. The value of reqbyte doesn't change either. */
2593    
2594     case '.':
2595     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2596     zerofirstbyte = firstbyte;
2597     zeroreqbyte = reqbyte;
2598     previous = code;
2599     *code++ = OP_ANY;
2600     break;
2601    
2602 nigel 93
2603     /* ===================================================================*/
2604 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2605     32-byte bitmap of the permitted characters, except in the special case
2606     where there is only one such character. For negated classes, we build the
2607     map as usual, then invert it at the end. However, we use a different opcode
2608     so that data characters > 255 can be handled correctly.
2609 nigel 77
2610     If the class contains characters outside the 0-255 range, a different
2611     opcode is compiled. It may optionally have a bit map for characters < 256,
2612     but those above are explicitly listed afterwards. A flag byte tells
2613     whether the bitmap is present, and whether this is a negated class or not.
2614     */
2615    
2616     case '[':
2617     previous = code;
2618    
2619     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2620     they are encountered at the top level, so we'll do that too. */
2621    
2622     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2623     check_posix_syntax(ptr, &tempptr, cd))
2624     {
2625     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2626     goto FAILED;
2627     }
2628    
2629 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2630 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2631 ph10 205 skip them too. This makes for compatibility with Perl. */
2632 ph10 208
2633 ph10 205 negate_class = FALSE;
2634     for (;;)
2635 nigel 77 {
2636     c = *(++ptr);
2637 ph10 205 if (c == '\\')
2638     {
2639 ph10 208 if (ptr[1] == 'E') ptr++;
2640 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2641 ph10 208 else break;
2642 ph10 205 }
2643     else if (!negate_class && c == '^')
2644     negate_class = TRUE;
2645     else break;
2646 ph10 208 }
2647 nigel 77
2648 ph10 264 /* If a class contains a negative special such as \S, we need to flip the
2649     negation flag at the end, so that support for characters > 255 works
2650     correctly (they are all included in the class). */
2651    
2652     should_flip_negation = FALSE;
2653    
2654 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2655 nigel 93 of just a single character (as long as it's < 256). However, for higher
2656     valued UTF-8 characters, we don't yet do any optimization. */
2657 nigel 77
2658     class_charcount = 0;
2659     class_lastchar = -1;
2660    
2661 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2662     temporary bit of memory, in case the class contains only 1 character (less
2663     than 256), because in that case the compiled code doesn't use the bit map.
2664     */
2665    
2666     memset(classbits, 0, 32 * sizeof(uschar));
2667    
2668 nigel 77 #ifdef SUPPORT_UTF8
2669     class_utf8 = FALSE; /* No chars >= 256 */
2670 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2671 nigel 77 #endif
2672    
2673     /* Process characters until ] is reached. By writing this as a "do" it
2674 nigel 93 means that an initial ] is taken as a data character. At the start of the
2675     loop, c contains the first byte of the character. */
2676 nigel 77
2677 nigel 93 if (c != 0) do
2678 nigel 77 {
2679 nigel 93 const uschar *oldptr;
2680    
2681 nigel 77 #ifdef SUPPORT_UTF8
2682     if (utf8 && c > 127)
2683     { /* Braces are required because the */
2684     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2685     }
2686     #endif
2687    
2688     /* Inside \Q...\E everything is literal except \E */
2689    
2690     if (inescq)
2691     {
2692 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2693 nigel 77 {
2694 nigel 93 inescq = FALSE; /* Reset literal state */
2695     ptr++; /* Skip the 'E' */
2696     continue; /* Carry on with next */
2697 nigel 77 }
2698 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2699 nigel 77 }
2700    
2701     /* Handle POSIX class names. Perl allows a negation extension of the
2702     form [:^name:]. A square bracket that doesn't match the syntax is
2703     treated as a literal. We also recognize the POSIX constructions
2704     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2705     5.6 and 5.8 do. */
2706    
2707     if (c == '[' &&
2708     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2709     check_posix_syntax(ptr, &tempptr, cd))
2710     {
2711     BOOL local_negate = FALSE;
2712 nigel 87 int posix_class, taboffset, tabopt;
2713 nigel 77 register const uschar *cbits = cd->cbits;
2714 nigel 87 uschar pbits[32];
2715 nigel 77
2716     if (ptr[1] != ':')
2717     {
2718     *errorcodeptr = ERR31;
2719     goto FAILED;
2720     }
2721    
2722     ptr += 2;
2723     if (*ptr == '^')
2724     {
2725     local_negate = TRUE;
2726 ph10 265 should_flip_negation = TRUE; /* Note negative special */
2727 nigel 77 ptr++;
2728     }
2729    
2730     posix_class = check_posix_name(ptr, tempptr - ptr);
2731     if (posix_class < 0)
2732     {
2733     *errorcodeptr = ERR30;
2734     goto FAILED;
2735     }
2736    
2737     /* If matching is caseless, upper and lower are converted to
2738     alpha. This relies on the fact that the class table starts with
2739     alpha, lower, upper as the first 3 entries. */
2740    
2741     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2742     posix_class = 0;
2743    
2744 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2745     because we may be adding and subtracting from it, and we don't want to
2746     subtract bits that may be in the main map already. At the end we or the
2747     result into the bit map that is being built. */
2748 nigel 77
2749     posix_class *= 3;
2750 nigel 87
2751     /* Copy in the first table (always present) */
2752    
2753     memcpy(pbits, cbits + posix_class_maps[posix_class],
2754     32 * sizeof(uschar));
2755    
2756     /* If there is a second table, add or remove it as required. */
2757    
2758     taboffset = posix_class_maps[posix_class + 1];
2759     tabopt = posix_class_maps[posix_class + 2];
2760    
2761     if (taboffset >= 0)
2762 nigel 77 {
2763 nigel 87 if (tabopt >= 0)
2764     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2765 nigel 77 else
2766 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2767 nigel 77 }
2768    
2769 nigel 87 /* Now see if we need to remove any special characters. An option
2770     value of 1 removes vertical space and 2 removes underscore. */
2771    
2772     if (tabopt < 0) tabopt = -tabopt;
2773     if (tabopt == 1) pbits[1] &= ~0x3c;
2774     else if (tabopt == 2) pbits[11] &= 0x7f;
2775    
2776     /* Add the POSIX table or its complement into the main table that is
2777     being built and we are done. */
2778    
2779     if (local_negate)
2780     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2781     else
2782     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2783    
2784 nigel 77 ptr = tempptr + 1;
2785     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2786     continue; /* End of POSIX syntax handling */
2787     }
2788    
2789     /* Backslash may introduce a single character, or it may introduce one
2790 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2791     case. Inside a class (and only there) it is treated as backspace.
2792     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2793 ph10 205 to 'or' into the one we are building. We assume they have more than one
2794 nigel 77 character in them, so set class_charcount bigger than one. */
2795    
2796     if (c == '\\')
2797     {
2798 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2799     if (*errorcodeptr != 0) goto FAILED;
2800 nigel 77
2801 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2802 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2803 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2804 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2805     {
2806     if (ptr[1] == '\\' && ptr[2] == 'E')
2807     {
2808     ptr += 2; /* avoid empty string */
2809     }
2810     else inescq = TRUE;
2811     continue;
2812     }
2813 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2814 nigel 77
2815     if (c < 0)
2816     {
2817     register const uschar *cbits = cd->cbits;
2818     class_charcount += 2; /* Greater than 1 is what matters */
2819 nigel 93
2820     /* Save time by not doing this in the pre-compile phase. */
2821    
2822     if (lengthptr == NULL) switch (-c)
2823 nigel 77 {
2824     case ESC_d:
2825     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2826     continue;
2827    
2828     case ESC_D:
2829 ph10 264 should_flip_negation = TRUE;
2830 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2831     continue;
2832    
2833     case ESC_w:
2834     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2835     continue;
2836    
2837     case ESC_W:
2838 ph10 264 should_flip_negation = TRUE;
2839 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2840     continue;
2841    
2842     case ESC_s:
2843     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2844     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2845     continue;
2846    
2847     case ESC_S:
2848 ph10 264 should_flip_negation = TRUE;
2849 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2850     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2851     continue;
2852    
2853 nigel 93 default: /* Not recognized; fall through */
2854     break; /* Need "default" setting to stop compiler warning. */
2855     }
2856    
2857     /* In the pre-compile phase, just do the recognition. */
2858    
2859     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2860     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2861 ph10 180
2862 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2863     they use extra memory. */
2864 ph10 180
2865 ph10 178 if (-c == ESC_h)
2866     {
2867     SETBIT(classbits, 0x09); /* VT */
2868     SETBIT(classbits, 0x20); /* SPACE */
2869 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
2870 ph10 178 #ifdef SUPPORT_UTF8
2871     if (utf8)
2872 ph10 180 {
2873 ph10 178 class_utf8 = TRUE;
2874     *class_utf8data++ = XCL_SINGLE;
2875 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2876 ph10 178 *class_utf8data++ = XCL_SINGLE;
2877 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2878     *class_utf8data++ = XCL_RANGE;
2879     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2880     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2881 ph10 178 *class_utf8data++ = XCL_SINGLE;
2882 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2883 ph10 178 *class_utf8data++ = XCL_SINGLE;
2884 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2885 ph10 178 *class_utf8data++ = XCL_SINGLE;
2886 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2887     }
2888     #endif
2889     continue;
2890     }
2891 nigel 93
2892 ph10 178 if (-c == ESC_H)
2893     {
2894     for (c = 0; c < 32; c++)
2895     {
2896     int x = 0xff;
2897     switch (c)
2898 ph10 180 {
2899 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2900     case 0x20/8: x ^= 1 << (0x20%8); break;
2901     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2902     default: break;
2903     }
2904     classbits[c] |= x;
2905 ph10 180 }
2906    
2907 ph10 178 #ifdef SUPPORT_UTF8
2908     if (utf8)
2909 ph10 180 {
2910 ph10 178 class_utf8 = TRUE;
2911 ph10 180 *class_utf8data++ = XCL_RANGE;
2912     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2913     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2914     *class_utf8data++ = XCL_RANGE;
2915     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2916     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2917     *class_utf8data++ = XCL_RANGE;
2918     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2919     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2920     *class_utf8data++ = XCL_RANGE;
2921     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2922     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2923     *class_utf8data++ = XCL_RANGE;
2924     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2925     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2926     *class_utf8data++ = XCL_RANGE;
2927     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2928     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2929     *class_utf8data++ = XCL_RANGE;
2930     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2931     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2932     }
2933     #endif
2934     continue;
2935     }
2936 ph10 178
2937     if (-c == ESC_v)
2938     {
2939     SETBIT(classbits, 0x0a); /* LF */
2940     SETBIT(classbits, 0x0b); /* VT */
2941 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2942     SETBIT(classbits, 0x0d); /* CR */
2943     SETBIT(classbits, 0x85); /* NEL */
2944 ph10 178 #ifdef SUPPORT_UTF8
2945     if (utf8)
2946 ph10 180 {
2947 ph10 178 class_utf8 = TRUE;
2948 ph10 180 *class_utf8data++ = XCL_RANGE;
2949     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2950     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2951     }
2952     #endif
2953     continue;
2954     }
2955 ph10 178
2956     if (-c == ESC_V)
2957     {
2958     for (c = 0; c < 32; c++)
2959     {
2960     int x = 0xff;
2961     switch (c)
2962 ph10 180 {
2963 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
2964     x ^= 1 << (0x0b%8);
2965     x ^= 1 << (0x0c%8);
2966 ph10 180 x ^= 1 << (0x0d%8);
2967 ph10 178 break;
2968     case 0x85/8: x ^= 1 << (0x85%8); break;
2969     default: break;
2970     }
2971     classbits[c] |= x;
2972 ph10 180 }
2973    
2974 ph10 178 #ifdef SUPPORT_UTF8
2975     if (utf8)
2976 ph10 180 {
2977 ph10 178 class_utf8 = TRUE;
2978 ph10 180 *class_utf8data++ = XCL_RANGE;
2979     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2980     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2981     *class_utf8data++ = XCL_RANGE;
2982     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2983     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2984     }
2985     #endif
2986     continue;
2987     }
2988 ph10 178
2989 nigel 93 /* We need to deal with \P and \p in both phases. */
2990    
2991 nigel 77 #ifdef SUPPORT_UCP
2992 nigel 93 if (-c == ESC_p || -c == ESC_P)
2993     {
2994     BOOL negated;
2995     int pdata;
2996     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2997     if (ptype < 0) goto FAILED;
2998     class_utf8 = TRUE;
2999     *class_utf8data++ = ((-c == ESC_p) != negated)?
3000     XCL_PROP : XCL_NOTPROP;
3001     *class_utf8data++ = ptype;
3002     *class_utf8data++ = pdata;
3003     class_charcount -= 2; /* Not a < 256 character */
3004 nigel 77 continue;
3005 nigel 93 }
3006 nigel 77 #endif
3007 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3008     strict mode. By default, for compatibility with Perl, they are
3009     treated as literals. */
3010 nigel 77
3011 nigel 93 if ((options & PCRE_EXTRA) != 0)
3012     {
3013     *errorcodeptr = ERR7;
3014     goto FAILED;
3015     }
3016 nigel 77
3017 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3018     c = *ptr; /* Get the final character and fall through */
3019 nigel 77 }
3020    
3021     /* Fall through if we have a single character (c >= 0). This may be
3022 nigel 93 greater than 256 in UTF-8 mode. */
3023 nigel 77
3024     } /* End of backslash handling */
3025    
3026     /* A single character may be followed by '-' to form a range. However,
3027     Perl does not permit ']' to be the end of the range. A '-' character
3028 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3029     entirely. The code for handling \Q and \E is messy. */
3030 nigel 77
3031 nigel 93 CHECK_RANGE:
3032     while (ptr[1] == '\\' && ptr[2] == 'E')
3033 nigel 77 {
3034 nigel 93 inescq = FALSE;
3035     ptr += 2;
3036     }
3037    
3038     oldptr = ptr;
3039 ph10 231
3040 ph10 230 /* Remember \r or \n */
3041 ph10 231
3042     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3043    
3044 ph10 230 /* Check for range */
3045 nigel 93
3046     if (!inescq && ptr[1] == '-')
3047     {
3048 nigel 77 int d;
3049     ptr += 2;
3050 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3051 nigel 77
3052 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3053     mode. */
3054    
3055     while (*ptr == '\\' && ptr[1] == 'Q')
3056     {
3057     ptr += 2;
3058     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3059     inescq = TRUE;
3060     break;
3061     }
3062    
3063     if (*ptr == 0 || (!inescq && *ptr == ']'))
3064     {
3065     ptr = oldptr;
3066     goto LONE_SINGLE_CHARACTER;
3067     }
3068    
3069 nigel 77 #ifdef SUPPORT_UTF8
3070     if (utf8)
3071     { /* Braces are required because the */
3072     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3073     }
3074     else
3075     #endif
3076     d = *ptr; /* Not UTF-8 mode */
3077    
3078     /* The second part of a range can be a single-character escape, but
3079     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3080     in such circumstances. */
3081    
3082 nigel 93 if (!inescq && d == '\\')
3083 nigel 77 {
3084 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3085     if (*errorcodeptr != 0) goto FAILED;
3086 nigel 77
3087 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3088 nigel 93 special means the '-' was literal */
3089 nigel 77
3090     if (d < 0)
3091     {
3092     if (d == -ESC_b) d = '\b';
3093 nigel 93 else if (d == -ESC_X) d = 'X';
3094     else if (d == -ESC_R) d = 'R'; else
3095 nigel 77 {
3096 nigel 93 ptr = oldptr;
3097 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3098     }
3099     }
3100     }
3101    
3102 nigel 93 /* Check that the two values are in the correct order. Optimize
3103     one-character ranges */
3104 nigel 77
3105 nigel 93 if (d < c)
3106     {
3107     *errorcodeptr = ERR8;
3108     goto FAILED;
3109     }
3110    
3111 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3112    
3113 ph10 230 /* Remember \r or \n */
3114 ph10 231
3115     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3116    
3117 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3118     matching, we have to use an XCLASS with extra data items. Caseless
3119     matching for characters > 127 is available only if UCP support is
3120     available. */
3121    
3122     #ifdef SUPPORT_UTF8
3123     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3124     {
3125     class_utf8 = TRUE;
3126    
3127     /* With UCP support, we can find the other case equivalents of
3128     the relevant characters. There may be several ranges. Optimize how
3129     they fit with the basic range. */
3130    
3131     #ifdef SUPPORT_UCP
3132     if ((options & PCRE_CASELESS) != 0)
3133     {
3134 nigel 93 unsigned int occ, ocd;
3135     unsigned int cc = c;
3136     unsigned int origd = d;
3137 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3138     {
3139 ph10 180 if (occ >= (unsigned int)c &&
3140     ocd <= (unsigned int)d)
3141 ph10 176 continue; /* Skip embedded ranges */
3142 nigel 77
3143 ph10 180 if (occ < (unsigned int)c &&
3144 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3145 nigel 77 { /* if there is overlap, */
3146     c = occ; /* noting that if occ < c */
3147     continue; /* we can't have ocd > d */
3148     } /* because a subrange is */
3149 ph10 180 if (ocd > (unsigned int)d &&
3150 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3151 nigel 77 { /* the basic range. */
3152     d = ocd;
3153     continue;
3154     }
3155    
3156     if (occ == ocd)
3157     {
3158     *class_utf8data++ = XCL_SINGLE;
3159     }
3160     else
3161     {
3162     *class_utf8data++ = XCL_RANGE;
3163     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3164     }
3165     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3166     }
3167     }
3168     #endif /* SUPPORT_UCP */
3169    
3170     /* Now record the original range, possibly modified for UCP caseless
3171     overlapping ranges. */
3172    
3173     *class_utf8data++ = XCL_RANGE;
3174     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3175     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3176    
3177     /* With UCP support, we are done. Without UCP support, there is no
3178     caseless matching for UTF-8 characters > 127; we can use the bit map
3179     for the smaller ones. */
3180    
3181     #ifdef SUPPORT_UCP
3182     continue; /* With next character in the class */
3183     #else
3184     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3185    
3186     /* Adjust upper limit and fall through to set up the map */
3187    
3188     d = 127;
3189    
3190     #endif /* SUPPORT_UCP */
3191     }
3192     #endif /* SUPPORT_UTF8 */
3193    
3194     /* We use the bit map for all cases when not in UTF-8 mode; else
3195     ranges that lie entirely within 0-127 when there is UCP support; else
3196     for partial ranges without UCP support. */
3197    
3198 nigel 93 class_charcount += d - c + 1;
3199     class_lastchar = d;
3200    
3201     /* We can save a bit of time by skipping this in the pre-compile. */
3202    
3203     if (lengthptr == NULL) for (; c <= d; c++)
3204 nigel 77 {
3205     classbits[c/8] |= (1 << (c&7));
3206     if ((options & PCRE_CASELESS) != 0)
3207     {
3208     int uc = cd->fcc[c]; /* flip case */
3209     classbits[uc/8] |= (1 << (uc&7));
3210     }
3211     }
3212    
3213     continue; /* Go get the next char in the class */
3214     }
3215    
3216     /* Handle a lone single character - we can get here for a normal
3217     non-escape char, or after \ that introduces a single character or for an
3218     apparent range that isn't. */
3219    
3220     LONE_SINGLE_CHARACTER:
3221 ph10 231
3222 nigel 77 /* Handle a character that cannot go in the bit map */
3223    
3224     #ifdef SUPPORT_UTF8
3225     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3226     {
3227     class_utf8 = TRUE;
3228     *class_utf8data++ = XCL_SINGLE;
3229     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3230    
3231     #ifdef SUPPORT_UCP
3232     if ((options & PCRE_CASELESS) != 0)
3233     {
3234 nigel 93 unsigned int othercase;
3235     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3236 nigel 77 {
3237     *class_utf8data++ = XCL_SINGLE;
3238     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3239     }
3240     }
3241     #endif /* SUPPORT_UCP */
3242    
3243     }
3244     else
3245     #endif /* SUPPORT_UTF8 */
3246    
3247     /* Handle a single-byte character */
3248     {
3249     classbits[c/8] |= (1 << (c&7));
3250     if ((options & PCRE_CASELESS) != 0)
3251     {
3252     c = cd->fcc[c]; /* flip case */
3253     classbits[c/8] |= (1 << (c&7));
3254     }
3255     class_charcount++;
3256     class_lastchar = c;
3257     }
3258     }
3259    
3260 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3261 nigel 77
3262 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3263 nigel 77
3264 nigel 93 if (c == 0) /* Missing terminating ']' */
3265     {
3266     *errorcodeptr = ERR6;
3267     goto FAILED;
3268     }
3269 ph10 231
3270    
3271 ph10 230 /* This code has been disabled because it would mean that \s counts as
3272     an explicit \r or \n reference, and that's not really what is wanted. Now
3273     we set the flag only if there is a literal "\r" or "\n" in the class. */
3274 ph10 227
3275 ph10 230 #if 0
3276 ph10 226 /* Remember whether \r or \n are in this class */
3277 ph10 227
3278 ph10 226 if (negate_class)
3279     {
3280 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3281 ph10 226 }
3282     else
3283     {
3284 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3285 ph10 227 }
3286 ph10 230 #endif
3287 ph10 227
3288 ph10 231
3289 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3290 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3291     use of \p or \P, in other words, no use of any XCLASS features, we can
3292     optimize.
3293    
3294 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3295     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3296     operate on single-bytes only. This is an historical hangover. Maybe one day
3297     we can tidy these opcodes to handle multi-byte characters.
3298 nigel 77
3299     The optimization throws away the bit map. We turn the item into a
3300     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3301     that OP_NOT does not support multibyte characters. In the positive case, it
3302     can cause firstbyte to be set. Otherwise, there can be no first char if
3303     this item is first, whatever repeat count may follow. In the case of
3304     reqbyte, save the previous value for reinstating. */
3305    
3306     #ifdef SUPPORT_UTF8
3307 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3308 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3309 nigel 77 #else
3310     if (class_charcount == 1)
3311     #endif
3312     {
3313     zeroreqbyte = reqbyte;
3314    
3315     /* The OP_NOT opcode works on one-byte characters only. */
3316    
3317     if (negate_class)
3318     {
3319     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3320     zerofirstbyte = firstbyte;
3321     *code++ = OP_NOT;
3322     *code++ = class_lastchar;
3323     break;
3324     }
3325    
3326     /* For a single, positive character, get the value into mcbuffer, and
3327     then we can handle this with the normal one-character code. */
3328    
3329     #ifdef SUPPORT_UTF8
3330     if (utf8 && class_lastchar > 127)
3331     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3332     else
3333     #endif
3334     {
3335     mcbuffer[0] = class_lastchar;
3336     mclength = 1;
3337     }
3338     goto ONE_CHAR;
3339     } /* End of 1-char optimization */
3340    
3341     /* The general case - not the one-char optimization. If this is the first
3342     thing in the branch, there can be no first char setting, whatever the
3343     repeat count. Any reqbyte setting must remain unchanged after any kind of
3344     repeat. */
3345    
3346     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3347     zerofirstbyte = firstbyte;
3348     zeroreqbyte = reqbyte;
3349    
3350     /* If there are characters with values > 255, we have to compile an
3351 ph10 264 extended class, with its own opcode, unless there was a negated special
3352     such as \S in the class, because in that case all characters > 255 are in
3353     the class, so any that were explicitly given as well can be ignored. If
3354     (when there are explicit characters > 255 that must be listed) there are no
3355     characters < 256, we can omit the bitmap in the actual compiled code. */
3356 nigel 77
3357     #ifdef SUPPORT_UTF8
3358 ph10 264 if (class_utf8 && !should_flip_negation)
3359 nigel 77 {
3360     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3361     *code++ = OP_XCLASS;
3362     code += LINK_SIZE;
3363     *code = negate_class? XCL_NOT : 0;
3364    
3365 nigel 93 /* If the map is required, move up the extra data to make room for it;
3366     otherwise just move the code pointer to the end of the extra data. */
3367 nigel 77
3368     if (class_charcount > 0)
3369     {
3370     *code++ |= XCL_MAP;
3371 nigel 93 memmove(code + 32, code, class_utf8data - code);
3372 nigel 77 memcpy(code, classbits, 32);
3373 nigel 93 code = class_utf8data + 32;
3374 nigel 77 }
3375 nigel 93 else code = class_utf8data;
3376 nigel 77
3377     /* Now fill in the complete length of the item */
3378    
3379     PUT(previous, 1, code - previous);
3380     break; /* End of class handling */
3381     }
3382     #endif
3383    
3384 ph10 264 /* If there are no characters > 255, set the opcode to OP_CLASS or
3385     OP_NCLASS, depending on whether the whole class was negated and whether
3386     there were negative specials such as \S in the class. Then copy the 32-byte
3387     map into the code vector, negating it if necessary. */
3388    
3389     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3390 nigel 77 if (negate_class)
3391     {
3392 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3393     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3394 nigel 77 }
3395     else
3396     {
3397     memcpy(code, classbits, 32);
3398     }
3399     code += 32;
3400     break;
3401    
3402 nigel 93
3403     /* ===================================================================*/
3404 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3405     has been tested above. */
3406    
3407     case '{':
3408     if (!is_quantifier) goto NORMAL_CHAR;
3409     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3410     if (*errorcodeptr != 0) goto FAILED;
3411     goto REPEAT;
3412    
3413     case '*':
3414     repeat_min = 0;
3415     repeat_max = -1;
3416     goto REPEAT;
3417    
3418     case '+':
3419     repeat_min = 1;
3420     repeat_max = -1;
3421     goto REPEAT;
3422    
3423     case '?':
3424     repeat_min = 0;
3425     repeat_max = 1;
3426    
3427     REPEAT:
3428     if (previous == NULL)
3429     {
3430     *errorcodeptr = ERR9;
3431     goto FAILED;
3432     }
3433    
3434     if (repeat_min == 0)
3435     {
3436     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3437     reqbyte = zeroreqbyte; /* Ditto */
3438     }
3439    
3440     /* Remember whether this is a variable length repeat */
3441    
3442     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3443    
3444     op_type = 0; /* Default single-char op codes */
3445     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3446    
3447     /* Save start of previous item, in case we have to move it up to make space
3448     for an inserted OP_ONCE for the additional '+' extension. */
3449    
3450     tempcode = previous;
3451    
3452     /* If the next character is '+', we have a possessive quantifier. This
3453     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3454     If the next character is '?' this is a minimizing repeat, by default,
3455     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3456     repeat type to the non-default. */
3457    
3458     if (ptr[1] == '+')
3459     {
3460     repeat_type = 0; /* Force greedy */
3461     possessive_quantifier = TRUE;
3462     ptr++;
3463     }
3464     else if (ptr[1] == '?')
3465     {
3466     repeat_type = greedy_non_default;
3467     ptr++;
3468     }
3469     else repeat_type = greedy_default;
3470    
3471     /* If previous was a character match, abolish the item and generate a
3472     repeat item instead. If a char item has a minimum of more than one, ensure
3473     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3474     the first thing in a branch because the x will have gone into firstbyte
3475     instead. */
3476    
3477     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3478     {
3479     /* Deal with UTF-8 characters that take up more than one byte. It's
3480     easier to write this out separately than try to macrify it. Use c to
3481     hold the length of the character in bytes, plus 0x80 to flag that it's a
3482     length rather than a small character. */
3483    
3484     #ifdef SUPPORT_UTF8
3485     if (utf8 && (code[-1] & 0x80) != 0)
3486     {
3487     uschar *lastchar = code - 1;
3488     while((*lastchar & 0xc0) == 0x80) lastchar--;
3489     c = code - lastchar; /* Length of UTF-8 character */
3490     memcpy(utf8_char, lastchar, c); /* Save the char */
3491     c |= 0x80; /* Flag c as a length */
3492     }
3493     else
3494     #endif
3495    
3496     /* Handle the case of a single byte - either with no UTF8 support, or
3497     with UTF-8 disabled, or for a UTF-8 character < 128. */
3498    
3499     {
3500     c = code[-1];
3501     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3502     }
3503    
3504 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3505     the line is something that cannot possibly match this character. If so,
3506     automatically possessifying this item gains some performance in the case
3507     where the match fails. */
3508    
3509     if (!possessive_quantifier &&
3510     repeat_max < 0 &&
3511     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3512     options, cd))
3513     {
3514     repeat_type = 0; /* Force greedy */
3515     possessive_quantifier = TRUE;
3516     }
3517    
3518 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3519     }
3520    
3521     /* If previous was a single negated character ([^a] or similar), we use
3522     one of the special opcodes, replacing it. The code is shared with single-
3523     character repeats by setting op_type to add a suitable offset into
3524 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3525     currently used only for single-byte chars. */
3526 nigel 77
3527     else if (*previous == OP_NOT)
3528     {
3529     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3530     c = previous[1];
3531 nigel 93 if (!possessive_quantifier &&
3532     repeat_max < 0 &&
3533     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3534     {
3535     repeat_type = 0; /* Force greedy */
3536     possessive_quantifier = TRUE;
3537     }
3538 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3539     }
3540    
3541     /* If previous was a character type match (\d or similar), abolish it and
3542     create a suitable repeat item. The code is shared with single-character
3543     repeats by setting op_type to add a suitable offset into repeat_type. Note
3544     that the Unicode property types will be present only when SUPPORT_UCP is
3545     defined, but we don't wrap the little bits of code here because it just
3546     makes it horribly messy. */
3547    
3548     else if (*previous < OP_EODN)
3549     {
3550     uschar *oldcode;
3551 nigel 87 int prop_type, prop_value;
3552 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3553     c = *previous;
3554    
3555 nigel 93 if (!possessive_quantifier &&
3556     repeat_max < 0 &&
3557     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3558     {
3559     repeat_type = 0; /* Force greedy */
3560     possessive_quantifier = TRUE;
3561     }
3562    
3563 nigel 77 OUTPUT_SINGLE_REPEAT:
3564 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3565     {
3566     prop_type = previous[1];
3567     prop_value = previous[2];
3568     }
3569     else prop_type = prop_value = -1;
3570 nigel 77
3571     oldcode = code;
3572     code = previous; /* Usually overwrite previous item */
3573    
3574     /* If the maximum is zero then the minimum must also be zero; Perl allows
3575     this case, so we do too - by simply omitting the item altogether. */
3576    
3577     if (repeat_max == 0) goto END_REPEAT;
3578    
3579     /* All real repeats make it impossible to handle partial matching (maybe
3580     one day we will be able to remove this restriction). */
3581    
3582 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3583 nigel 77
3584     /* Combine the op_type with the repeat_type */
3585    
3586     repeat_type += op_type;
3587    
3588     /* A minimum of zero is handled either as the special case * or ?, or as
3589     an UPTO, with the maximum given. */
3590    
3591     if (repeat_min == 0)
3592     {
3593     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3594     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3595     else
3596     {
3597     *code++ = OP_UPTO + repeat_type;
3598     PUT2INC(code, 0, repeat_max);
3599     }
3600     }
3601    
3602     /* A repeat minimum of 1 is optimized into some special cases. If the
3603 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3604 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3605     one less than the maximum. */
3606    
3607     else if (repeat_min == 1)
3608     {
3609     if (repeat_max == -1)
3610     *code++ = OP_PLUS + repeat_type;
3611     else
3612     {
3613     code = oldcode; /* leave previous item in place */
3614     if (repeat_max == 1) goto END_REPEAT;
3615     *code++ = OP_UPTO + repeat_type;
3616     PUT2INC(code, 0, repeat_max - 1);
3617     }
3618     }
3619    
3620     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3621     handled as an EXACT followed by an UPTO. */
3622    
3623     else
3624     {
3625     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3626     PUT2INC(code, 0, repeat_min);
3627    
3628     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3629     we have to insert the character for the previous code. For a repeated
3630 nigel 87 Unicode property match, there are two extra bytes that define the
3631 nigel 77 required property. In UTF-8 mode, long characters have their length in
3632     c, with the 0x80 bit as a flag. */
3633    
3634     if (repeat_max < 0)
3635     {
3636     #ifdef SUPPORT_UTF8
3637     if (utf8 && c >= 128)
3638     {
3639     memcpy(code, utf8_char, c & 7);
3640     code += c & 7;
3641     }
3642     else
3643     #endif
3644     {
3645     *code++ = c;
3646 nigel 87 if (prop_type >= 0)
3647     {
3648     *code++ = prop_type;
3649     *code++ = prop_value;
3650     }
3651 nigel 77 }
3652     *code++ = OP_STAR + repeat_type;
3653     }
3654    
3655     /* Else insert an UPTO if the max is greater than the min, again
3656 nigel 93 preceded by the character, for the previously inserted code. If the
3657     UPTO is just for 1 instance, we can use QUERY instead. */
3658 nigel 77
3659     else if (repeat_max != repeat_min)
3660     {
3661     #ifdef SUPPORT_UTF8
3662     if (utf8 && c >= 128)
3663     {
3664     memcpy(code, utf8_char, c & 7);
3665     code += c & 7;
3666     }
3667     else
3668     #endif
3669     *code++ = c;
3670 nigel 87 if (prop_type >= 0)
3671     {
3672     *code++ = prop_type;
3673     *code++ = prop_value;
3674     }
3675 nigel 77 repeat_max -= repeat_min;
3676 nigel 93
3677     if (repeat_max == 1)
3678     {
3679     *code++ = OP_QUERY + repeat_type;
3680     }
3681     else
3682     {
3683     *code++ = OP_UPTO + repeat_type;
3684     PUT2INC(code, 0, repeat_max);
3685     }
3686 nigel 77 }
3687     }
3688    
3689     /* The character or character type itself comes last in all cases. */
3690    
3691     #ifdef SUPPORT_UTF8
3692     if (utf8 && c >= 128)
3693     {
3694     memcpy(code, utf8_char, c & 7);
3695     code += c & 7;
3696     }
3697     else
3698     #endif
3699     *code++ = c;
3700    
3701 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3702     define the required property. */
3703 nigel 77
3704     #ifdef SUPPORT_UCP
3705 nigel 87 if (prop_type >= 0)
3706     {
3707     *code++ = prop_type;
3708     *code++ = prop_value;
3709     }
3710 nigel 77 #endif
3711     }
3712    
3713     /* If previous was a character class or a back reference, we put the repeat
3714     stuff after it, but just skip the item if the repeat was {0,0}. */
3715    
3716     else if (*previous == OP_CLASS ||
3717     *previous == OP_NCLASS ||
3718     #ifdef SUPPORT_UTF8
3719     *previous == OP_XCLASS ||
3720     #endif
3721     *previous == OP_REF)
3722     {
3723     if (repeat_max == 0)
3724     {
3725     code = previous;
3726     goto END_REPEAT;
3727     }
3728    
3729     /* All real repeats make it impossible to handle partial matching (maybe
3730     one day we will be able to remove this restriction). */
3731    
3732 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3733 nigel 77
3734     if (repeat_min == 0 && repeat_max == -1)
3735     *code++ = OP_CRSTAR + repeat_type;
3736     else if (repeat_min == 1 && repeat_max == -1)
3737     *code++ = OP_CRPLUS + repeat_type;
3738     else if (repeat_min == 0 && repeat_max == 1)
3739     *code++ = OP_CRQUERY + repeat_type;
3740     else
3741     {
3742     *code++ = OP_CRRANGE + repeat_type;
3743     PUT2INC(code, 0, repeat_min);
3744     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3745     PUT2INC(code, 0, repeat_max);
3746     }
3747     }
3748    
3749     /* If previous was a bracket group, we may have to replicate it in certain
3750     cases. */
3751    
3752 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3753     *previous == OP_ONCE || *previous == OP_COND)
3754 nigel 77 {
3755     register int i;
3756     int ketoffset = 0;
3757     int len = code - previous;
3758     uschar *bralink = NULL;
3759    
3760 nigel 93 /* Repeating a DEFINE group is pointless */
3761    
3762     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3763     {
3764     *errorcodeptr = ERR55;
3765     goto FAILED;
3766     }
3767    
3768 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3769     by scanning through from the start, and compute the offset back to it
3770     from the current code pointer. There may be an OP_OPT setting following
3771     the final KET, so we can't find the end just by going back from the code
3772     pointer. */
3773    
3774     if (repeat_max == -1)
3775     {
3776     register uschar *ket = previous;
3777     do ket += GET(ket, 1); while (*ket != OP_KET);
3778     ketoffset = code - ket;
3779     }
3780    
3781     /* The case of a zero minimum is special because of the need to stick
3782     OP_BRAZERO in front of it, and because the group appears once in the
3783     data, whereas in other cases it appears the minimum number of times. For
3784     this reason, it is simplest to treat this case separately, as otherwise
3785     the code gets far too messy. There are several special subcases when the
3786     minimum is zero. */
3787    
3788     if (repeat_min == 0)
3789     {
3790     /* If the maximum is also zero, we just omit the group from the output
3791     altogether. */
3792    
3793     if (repeat_max == 0)
3794     {
3795     code = previous;
3796     goto END_REPEAT;
3797     }
3798    
3799     /* If the maximum is 1 or unlimited, we just have to stick in the
3800     BRAZERO and do no more at this point. However, we do need to adjust
3801     any OP_RECURSE calls inside the group that refer to the group itself or
3802 nigel 93 any internal or forward referenced group, because the offset is from
3803     the start of the whole regex. Temporarily terminate the pattern while
3804     doing this. */
3805 nigel 77
3806     if (repeat_max <= 1)
3807     {
3808     *code = OP_END;
3809 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3810 nigel 77 memmove(previous+1, previous, len);
3811     code++;
3812     *previous++ = OP_BRAZERO + repeat_type;
3813     }
3814    
3815     /* If the maximum is greater than 1 and limited, we have to replicate
3816     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3817     The first one has to be handled carefully because it's the original
3818     copy, which has to be moved up. The remainder can be handled by code
3819     that is common with the non-zero minimum case below. We have to
3820     adjust the value of repeat_max, since one less copy is required. Once
3821     again, we may have to adjust any OP_RECURSE calls inside the group. */
3822    
3823     else
3824     {
3825     int offset;
3826     *code = OP_END;
3827 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3828 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3829     code += 2 + LINK_SIZE;
3830     *previous++ = OP_BRAZERO + repeat_type;
3831     *previous++ = OP_BRA;
3832    
3833     /* We chain together the bracket offset fields that have to be
3834     filled in later when the ends of the brackets are reached. */
3835    
3836     offset = (bralink == NULL)? 0 : previous - bralink;
3837     bralink = previous;
3838     PUTINC(previous, 0, offset);
3839     }
3840    
3841     repeat_max--;
3842     }
3843    
3844     /* If the minimum is greater than zero, replicate the group as many
3845     times as necessary, and adjust the maximum to the number of subsequent
3846     copies that we need. If we set a first char from the group, and didn't
3847 nigel 93 set a required char, copy the latter from the former. If there are any
3848     forward reference subroutine calls in the group, there will be entries on
3849     the workspace list; replicate these with an appropriate increment. */
3850 nigel 77
3851     else
3852     {
3853     if (repeat_min > 1)
3854     {
3855 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3856 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3857     potential integer overflow. */
3858 nigel 93
3859     if (lengthptr != NULL)
3860 ph10 202 {
3861     int delta = (repeat_min - 1)*length_prevgroup;
3862     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3863     (double)INT_MAX ||
3864     OFLOW_MAX - *lengthptr < delta)
3865     {
3866     *errorcodeptr = ERR20;
3867     goto FAILED;
3868     }
3869     *lengthptr += delta;
3870     }
3871 nigel 93
3872     /* This is compiling for real */
3873    
3874     else
3875 nigel 77 {
3876 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3877     for (i = 1; i < repeat_min; i++)
3878     {
3879     uschar *hc;
3880     uschar *this_hwm = cd->hwm;
3881     memcpy(code, previous, len);
3882     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3883     {
3884     PUT(cd->hwm, 0, GET(hc, 0) + len);
3885     cd->hwm += LINK_SIZE;
3886     }
3887     save_hwm = this_hwm;
3888     code += len;
3889     }
3890 nigel 77 }
3891     }
3892 nigel 93
3893 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3894     }
3895    
3896     /* This code is common to both the zero and non-zero minimum cases. If
3897     the maximum is limited, it replicates the group in a nested fashion,
3898     remembering the bracket starts on a stack. In the case of a zero minimum,
3899     the first one was set up above. In all cases the repeat_max now specifies
3900 nigel 93 the number of additional copies needed. Again, we must remember to
3901     replicate entries on the forward reference list. */
3902 nigel 77
3903     if (repeat_max >= 0)
3904     {
3905 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3906     just adjust the length as if we had. For each repetition we must add 1
3907     to the length for BRAZERO and for all but the last repetition we must
3908 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3909     paranoid checks to avoid integer overflow. */
3910 nigel 93
3911     if (lengthptr != NULL && repeat_max > 0)
3912 ph10 202 {
3913     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3914     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3915     if ((double)repeat_max *
3916     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3917     > (double)INT_MAX ||
3918     OFLOW_MAX - *lengthptr < delta)
3919     {
3920     *errorcodeptr = ERR20;
3921     goto FAILED;
3922     }
3923     *lengthptr += delta;
3924     }
3925 nigel 93
3926     /* This is compiling for real */
3927    
3928     else for (i = repeat_max - 1; i >= 0; i--)
3929 nigel 77 {
3930 nigel 93 uschar *hc;
3931     uschar *this_hwm = cd->hwm;
3932    
3933 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3934    
3935     /* All but the final copy start a new nesting, maintaining the
3936     chain of brackets outstanding. */
3937    
3938     if (i != 0)
3939     {
3940     int offset;
3941     *code++ = OP_BRA;
3942     offset = (bralink == NULL)? 0 : code - bralink;
3943     bralink = code;
3944     PUTINC(code, 0, offset);
3945     }
3946    
3947     memcpy(code, previous, len);
3948 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3949     {
3950     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3951     cd->hwm += LINK_SIZE;
3952     }
3953     save_hwm = this_hwm;
3954 nigel 77 code += len;
3955     }
3956    
3957     /* Now chain through the pending brackets, and fill in their length
3958     fields (which are holding the chain links pro tem). */
3959    
3960     while (bralink != NULL)
3961     {
3962     int oldlinkoffset;
3963     int offset = code - bralink + 1;
3964     uschar *bra = code - offset;
3965     oldlinkoffset = GET(bra, 1);
3966     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3967     *code++ = OP_KET;
3968     PUTINC(code, 0, offset);
3969     PUT(bra, 1, offset);
3970     }
3971     }
3972    
3973     /* If the maximum is unlimited, set a repeater in the final copy. We
3974     can't just offset backwards from the current code point, because we
3975     don't know if there's been an options resetting after the ket. The
3976 nigel 93 correct offset was computed above.
3977 nigel 77
3978 nigel 93 Then, when we are doing the actual compile phase, check to see whether
3979     this group is a non-atomic one that could match an empty string. If so,
3980     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3981     that runtime checking can be done. [This check is also applied to
3982     atomic groups at runtime, but in a different way.] */
3983    
3984     else
3985     {
3986     uschar *ketcode = code - ketoffset;
3987     uschar *bracode = ketcode - GET(ketcode, 1);
3988     *ketcode = OP_KETRMAX + repeat_type;
3989     if (lengthptr == NULL && *bracode != OP_ONCE)
3990     {
3991     uschar *scode = bracode;
3992     do
3993     {
3994     if (could_be_empty_branch(scode, ketcode, utf8))
3995     {
3996     *bracode += OP_SBRA - OP_BRA;
3997     break;
3998     }
3999     scode += GET(scode, 1);
4000     }
4001     while (*scode == OP_ALT);
4002     }
4003     }
4004 nigel 77 }
4005    
4006     /* Else there's some kind of shambles */
4007    
4008     else
4009     {
4010     *errorcodeptr = ERR11;
4011     goto FAILED;
4012     }
4013    
4014 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4015     tests above succeeded, possessive_quantifier is TRUE. For some of the
4016     simpler opcodes, there is a special alternative opcode for this. For
4017     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4018     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4019     but the special opcodes can optimize it a bit. The repeated item starts at
4020     tempcode, not at previous, which might be the first part of a string whose
4021     (former) last char we repeated.
4022 nigel 77
4023 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4024     an 'upto' may follow. We skip over an 'exact' item, and then test the
4025     length of what remains before proceeding. */
4026    
4027 nigel 77 if (possessive_quantifier)
4028     {
4029 nigel 93 int len;
4030     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4031     *tempcode == OP_NOTEXACT)
4032 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4033     ((*tempcode == OP_TYPEEXACT &&
4034     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4035 nigel 93 len = code - tempcode;
4036     if (len > 0) switch (*tempcode)
4037     {
4038     case OP_STAR: *tempcode = OP_POSSTAR; break;
4039     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4040     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4041     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4042    
4043     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4044     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4045     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4046     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4047    
4048     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4049     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4050     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4051     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4052    
4053     default:
4054     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4055     code += 1 + LINK_SIZE;
4056     len += 1 + LINK_SIZE;
4057     tempcode[0] = OP_ONCE;
4058     *code++ = OP_KET;
4059     PUTINC(code, 0, len);
4060     PUT(tempcode, 1, len);
4061     break;
4062     }
4063 nigel 77 }
4064    
4065     /* In all cases we no longer have a previous item. We also set the
4066     "follows varying string" flag for subsequently encountered reqbytes if
4067     it isn't already set and we have just passed a varying length item. */
4068    
4069     END_REPEAT:
4070     previous = NULL;
4071     cd->req_varyopt |= reqvary;