/* Provenance: /[pcre]/code/trunk/pcre_compile.c, as shown by a ViewVC
   annotated listing ("Contents of /code/trunk/pcre_compile.c").

   Revision 391
   Tue Mar 17 21:16:01 2009 UTC, by ph10
   File MIME type: text/plain
   File size: 207878 byte(s)
   Log message: Add support for UTF-8 in EBCDIC environments. */

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
/* Short names for the objects that hold newline information and the bounds of
the pattern string. They are defined ahead of the inclusion of pcre_internal.h
below; presumably macros in that header expand them - confirm against the
header before moving these lines. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte-array bitmap a (byte b/8, bit b%8 within that byte).

Both arguments and the whole expansion are parenthesized so that an argument
such as "c + 1" or "p->map" is grouped as the caller wrote it, and so that the
macro can be used safely inside a larger expression. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. It is set a little below
INT_MAX to leave headroom for adding in group terminating bytes, so that those
additions do not have to be checked individually. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
/* This value specifies the size of the stack workspace that is used during the
first pre-compile phase, which determines how much memory is required. The
regex is partly compiled into this space, but the compiled parts are discarded
as soon as they can be, so that hopefully there will never be an overrun. The
code does, however, check for an overrun. The largest amount seen in use is
218, so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103     in UTF-8 mode. */
104    
105     static const short int escapes[] = {
106     0, 0,
107     0, 0,
108     0, 0,
109     0, 0,
110     0, 0,
111     CHAR_COLON, CHAR_SEMICOLON,
112     CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113     CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114     CHAR_COMMERCIAL_AT, -ESC_A,
115     -ESC_B, -ESC_C,
116     -ESC_D, -ESC_E,
117     0, -ESC_G,
118     -ESC_H, 0,
119     0, -ESC_K,
120     0, 0,
121     0, 0,
122     -ESC_P, -ESC_Q,
123     -ESC_R, -ESC_S,
124     0, 0,
125     -ESC_V, -ESC_W,
126     -ESC_X, 0,
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128     CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129     CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130     CHAR_GRAVE_ACCENT, 7,
131     -ESC_b, 0,
132     -ESC_d, ESC_e,
133     ESC_f, 0,
134     -ESC_h, 0,
135     0, -ESC_k,
136     0, 0,
137     ESC_n, 0,
138     -ESC_p, 0,
139     ESC_r, -ESC_s,
140     ESC_tee, 0,
141     -ESC_v, -ESC_w,
142     0, 0,
143     -ESC_z
144 nigel 77 };
145    
146 ph10 391 #else
147    
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150 nigel 77 static const short int escapes[] = {
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 391 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182     platforms. */
183 ph10 210
184     typedef struct verbitem {
185     int len;
186     int op;
187 ph10 211 } verbitem;
188 ph10 210
189 ph10 240 static const char verbnames[] =
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
198 ph10 327 static const verbitem verbs[] = {
199 ph10 240 { 6, OP_ACCEPT },
200     { 6, OP_COMMIT },
201     { 1, OP_FAIL },
202     { 4, OP_FAIL },
203     { 5, OP_PRUNE },
204     { 4, OP_SKIP },
205     { 4, OP_THEN }
206 ph10 210 };
207    
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
217 ph10 240 static const char posix_names[] =
218 ph10 391 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220     STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
223     static const uschar posix_name_lengths[] = {
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
236     static const int posix_class_maps[] = {
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
/* Stringizing helpers. STRING(a) turns its argument into a string literal
exactly as written; XSTRING(s) goes through one extra level of expansion, so a
macro argument is replaced by its value before being stringized. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
267 ph10 240 static const char error_texts[] =
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 345 "] is an invalid data character in JavaScript compatibility mode";
345 nigel 77
346    
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to recognize only 0-9 as decimal digits, and only 0-9, a-f, and A-F as
hexadecimal digits, when compiling. That is why there is a private table here.
It costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases that were timed), and in some applications one wants
PCRE to compile efficiently as well as match efficiently.

For convenience, the same bit definitions are used as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then ctype_digit and ctype_xdigit can be used in the code. The table is indexed
directly by character code. */

#ifndef EBCDIC

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  sp - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The final column of each row comment, where present, is the hexadecimal code of
the row's first entry. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
476    
477    
478     /* Definition to allow mutual recursion */
479    
480     static BOOL
481 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
483 nigel 77
484    
485    
486     /*************************************************
487 ph10 240 * Find an error text *
488     *************************************************/
489    
490 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
491     some of the text is of unknown length, we can't use a table of offsets.
492     Instead, just count through the strings. This is not a performance issue
493 ph10 240 because it happens only when there has been a compilation error.
494    
495     Argument: the error number
496     Returns: pointer to the error string
497     */
498    
499     static const char *
500     find_error_text(int n)
501     {
502     const char *s = error_texts;
503 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
504 ph10 240 return s;
505     }
506    
507    
508     /*************************************************
509 nigel 77 * Handle escapes *
510     *************************************************/
511    
512     /* This function is called when a \ has been encountered. It either returns a
513     positive value for a simple escape such as \n, or a negative value which
514 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
515     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517     ptr is pointing at the \. On exit, it is on the final character of the escape
518     sequence.
519 nigel 77
520     Arguments:
521     ptrptr points to the pattern position pointer
522     errorcodeptr points to the errorcode variable
523     bracount number of previous extracting brackets
524     options the options bits
525     isclass TRUE if inside a character class
526    
527     Returns: zero or positive => a data character
528     negative => a special escape sequence
529 ph10 213 on error, errorcodeptr is set
530 nigel 77 */
531    
532     static int
533     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534     int options, BOOL isclass)
535     {
536 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
537     const uschar *ptr = *ptrptr + 1;
538 nigel 77 int c, i;
539    
540 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541     ptr--; /* Set pointer back to the last byte */
542    
543 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
544    
545     if (c == 0) *errorcodeptr = ERR1;
546    
547 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548     in a table. A non-zero result is something that can be returned immediately.
549 nigel 77 Otherwise further processing may be required. */
550    
551 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554 nigel 77
555 ph10 97 #else /* EBCDIC coding */
556 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
558     #endif
559    
560     /* Escapes that need further processing, or are illegal. */
561    
562     else
563     {
564     const uschar *oldptr;
565 nigel 93 BOOL braced, negated;
566    
567 nigel 77 switch (c)
568     {
569     /* A number of Perl escapes are not handled by PCRE. We give an explicit
570     error. */
571    
572 ph10 391 case CHAR_l:
573     case CHAR_L:
574     case CHAR_N:
575     case CHAR_u:
576     case CHAR_U:
577 nigel 77 *errorcodeptr = ERR37;
578     break;
579    
580 ph10 333 /* \g must be followed by one of a number of specific things:
581 ph10 345
582 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
583     backreference. If negative, it is a relative backreference. This is a Perl
584     5.10 feature.
585 ph10 345
586 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587     is part of Perl's movement towards a unified syntax for back references. As
588     this is synonymous with \k{name}, we fudge it up by pretending it really
589     was \k.
590 ph10 345
591     (3) For Oniguruma compatibility we also support \g followed by a name or a
592     number either in angle brackets or in single quotes. However, these are
593     (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 ph10 333 the -ESC_g code (cf \k). */
595 nigel 93
596 ph10 391 case CHAR_g:
597     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 ph10 333 {
599     c = -ESC_g;
600 ph10 345 break;
601     }
602 ph10 333
603     /* Handle the Perl-compatible cases */
604 ph10 345
605 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 nigel 93 {
607 ph10 171 const uschar *p;
608 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 ph10 171 {
612     c = -ESC_k;
613     break;
614 ph10 172 }
615 nigel 93 braced = TRUE;
616     ptr++;
617     }
618     else braced = FALSE;
619    
620 ph10 391 if (ptr[1] == CHAR_MINUS)
621 nigel 93 {
622     negated = TRUE;
623     ptr++;
624     }
625     else negated = FALSE;
626    
627     c = 0;
628     while ((digitab[ptr[1]] & ctype_digit) != 0)
629 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
630 ph10 220
631 ph10 333 if (c < 0) /* Integer overflow */
632 ph10 213 {
633     *errorcodeptr = ERR61;
634     break;
635 ph10 220 }
636 ph10 345
637 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 nigel 93 {
639     *errorcodeptr = ERR57;
640 ph10 213 break;
641 nigel 93 }
642 ph10 345
643 ph10 333 if (c == 0)
644     {
645     *errorcodeptr = ERR58;
646     break;
647 ph10 345 }
648 nigel 93
649     if (negated)
650     {
651     if (c > bracount)
652     {
653     *errorcodeptr = ERR15;
654 ph10 213 break;
655 nigel 93 }
656     c = bracount - (c - 1);
657     }
658    
659     c = -(ESC_REF + c);
660     break;
661    
662 nigel 77 /* The handling of escape sequences consisting of a string of digits
663     starting with one that is not zero is not straightforward. By experiment,
664     the way Perl works seems to be as follows:
665    
666     Outside a character class, the digits are read as a decimal number. If the
667     number is less than 10, or if there are that many previous extracting
668     left brackets, then it is a back reference. Otherwise, up to three octal
669     digits are read to form an escaped byte. Thus \123 is likely to be octal
670     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671     value is greater than 377, the least significant 8 bits are taken. Inside a
672     character class, \ followed by a digit is always an octal number. */
673    
674 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676 nigel 77
677     if (!isclass)
678     {
679     oldptr = ptr;
680 ph10 391 c -= CHAR_0;
681 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
683 ph10 333 if (c < 0) /* Integer overflow */
684 ph10 213 {
685     *errorcodeptr = ERR61;
686 ph10 220 break;
687     }
688 nigel 77 if (c < 10 || c <= bracount)
689     {
690     c = -(ESC_REF + c);
691     break;
692     }
693     ptr = oldptr; /* Put the pointer back and fall through */
694     }
695    
696     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697     generates a binary zero byte and treats the digit as a following literal.
698     Thus we have to pull back the pointer by one. */
699    
700 ph10 391 if ((c = *ptr) >= CHAR_8)
701 nigel 77 {
702     ptr--;
703     c = 0;
704     break;
705     }
706    
707     /* \0 always starts an octal number, but we may drop through to here with a
708 nigel 91 larger first octal digit. The original code used just to take the least
709     significant 8 bits of octal numbers (I think this is what early Perls used
710     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711     than 3 octal digits. */
712 nigel 77
713 ph10 391 case CHAR_0:
714     c -= CHAR_0;
715     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716     c = c * 8 + *(++ptr) - CHAR_0;
717 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 nigel 77 break;
719    
720 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
721     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722     treated as a data character. */
723 nigel 77
724 ph10 391 case CHAR_x:
725     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 nigel 77 {
727     const uschar *pt = ptr + 2;
728 nigel 87 int count = 0;
729    
730 nigel 77 c = 0;
731     while ((digitab[*pt] & ctype_xdigit) != 0)
732     {
733 nigel 87 register int cc = *pt++;
734 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 nigel 77 count++;
736 nigel 87
737 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 ph10 97 #else /* EBCDIC coding */
741 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 nigel 77 #endif
744     }
745 nigel 87
746 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 nigel 77 {
748 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 nigel 77 ptr = pt;
750     break;
751     }
752 nigel 87
753 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
754     recognize this construct; fall through to the normal \x handling. */
755     }
756    
757 nigel 87 /* Read just a single-byte hex-defined char */
758 nigel 77
759     c = 0;
760     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761     {
762 ph10 391 int cc; /* Some compilers don't like */
763     cc = *(++ptr); /* ++ in initializers */
764     #ifndef EBCDIC /* ASCII/UTF-8 coding */
765     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 ph10 97 #else /* EBCDIC coding */
768 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 nigel 77 #endif
771     }
772     break;
773    
774 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775     This coding is ASCII-specific, but then the whole concept of \cx is
776     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777 nigel 77
778 ph10 391 case CHAR_c:
779 nigel 77 c = *(++ptr);
780     if (c == 0)
781     {
782     *errorcodeptr = ERR2;
783 ph10 213 break;
784 nigel 77 }
785    
786 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 nigel 77 c ^= 0x40;
789 ph10 97 #else /* EBCDIC coding */
790 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 nigel 77 c ^= 0xC0;
792     #endif
793     break;
794    
795     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797     otherwise, for Perl compatibility, it is a literal. This code looks a bit
798     odd, but there used to be some cases other than the default, and there may
799     be again in future, so I haven't "optimized" it. */
800 nigel 77
801     default:
802     if ((options & PCRE_EXTRA) != 0) switch(c)
803     {
804     default:
805     *errorcodeptr = ERR3;
806     break;
807     }
808     break;
809     }
810     }
811    
812     *ptrptr = ptr;
813     return c;
814     }
815    
816    
817    
818     #ifdef SUPPORT_UCP
819     /*************************************************
820     * Handle \P and \p *
821     *************************************************/
822    
823     /* This function is called after \P or \p has been encountered, provided that
824     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825     pointing at the P or p. On exit, it is pointing at the final character of the
826     escape sequence.
827    
828     Argument:
829     ptrptr points to the pattern position pointer
830     negptr points to a boolean that is set TRUE for negation else FALSE
831 nigel 87 dptr points to an int that is set to the detailed property value
832 nigel 77 errorcodeptr points to the error code variable
833    
834 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
835 nigel 77 */
836    
837     static int
838 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 nigel 77 {
840     int c, i, bot, top;
841     const uschar *ptr = *ptrptr;
842 nigel 87 char name[32];
843 nigel 77
844     c = *(++ptr);
845     if (c == 0) goto ERROR_RETURN;
846    
847     *negptr = FALSE;
848    
849 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850     negation. */
851 nigel 77
852 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
853 nigel 77 {
854 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 nigel 77 {
856     *negptr = TRUE;
857     ptr++;
858     }
859 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 nigel 77 {
861     c = *(++ptr);
862     if (c == 0) goto ERROR_RETURN;
863 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 nigel 77 name[i] = c;
865     }
866 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 nigel 77 name[i] = 0;
868     }
869    
870     /* Otherwise there is just one following character */
871    
872     else
873     {
874     name[0] = c;
875     name[1] = 0;
876     }
877    
878     *ptrptr = ptr;
879    
880     /* Search for a recognized property name using binary chop */
881    
882     bot = 0;
883     top = _pcre_utt_size;
884    
885     while (bot < top)
886     {
887 nigel 87 i = (bot + top) >> 1;
888 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 nigel 87 if (c == 0)
890     {
891     *dptr = _pcre_utt[i].value;
892     return _pcre_utt[i].type;
893     }
894 nigel 77 if (c > 0) bot = i + 1; else top = i;
895     }
896    
897     *errorcodeptr = ERR47;
898     *ptrptr = ptr;
899     return -1;
900    
901     ERROR_RETURN:
902     *errorcodeptr = ERR46;
903     *ptrptr = ptr;
904     return -1;
905     }
906     #endif
907    
908    
909    
910    
911     /*************************************************
912     * Check for counted repeat *
913     *************************************************/
914    
915     /* This function is called when a '{' is encountered in a place where it might
916     start a quantifier. It looks ahead to see if it really is a quantifier or not.
917     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918     where the ddds are digits.
919    
920     Arguments:
921     p pointer to the first char after '{'
922    
923     Returns: TRUE or FALSE
924     */
925    
926     static BOOL
927     is_counted_repeat(const uschar *p)
928     {
929     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930     while ((digitab[*p] & ctype_digit) != 0) p++;
931 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932 nigel 77
933 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
934     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 nigel 77
936     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937     while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 nigel 77 }
941    
942    
943    
944     /*************************************************
945     * Read repeat counts *
946     *************************************************/
947    
948     /* Read an item of the form {n,m} and return the values. This is called only
949     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950     so the syntax is guaranteed to be correct, but we need to check the values.
951    
952     Arguments:
953     p pointer to first char after '{'
954     minp pointer to int for min
955     maxp pointer to int for max
956     returned as -1 if no max
957     errorcodeptr points to error code variable
958    
959     Returns: pointer to '}' on success;
960     current ptr on error, with errorcodeptr set non-zero
961     */
962    
963     static const uschar *
964     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965     {
966     int min = 0;
967     int max = -1;
968    
969 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
970     an integer overflow. */
971    
972 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 nigel 81 if (min < 0 || min > 65535)
974     {
975     *errorcodeptr = ERR5;
976     return p;
977     }
978 nigel 77
979 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
980     Also, max must not be less than min. */
981    
982 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 nigel 77 {
984 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 nigel 77 {
986     max = 0;
987 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 nigel 81 if (max < 0 || max > 65535)
989     {
990     *errorcodeptr = ERR5;
991     return p;
992     }
993 nigel 77 if (max < min)
994     {
995     *errorcodeptr = ERR4;
996     return p;
997     }
998     }
999     }
1000    
1001 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1002     '}'. */
1003 nigel 77
1004 nigel 81 *minp = min;
1005     *maxp = max;
1006 nigel 77 return p;
1007     }
1008    
1009    
1010    
1011     /*************************************************
1012 nigel 93 * Find forward referenced subpattern *
1013 nigel 91 *************************************************/
1014    
1015 nigel 93 /* This function scans along a pattern's text looking for capturing
1016     subpatterns, and counting them. If it finds a named pattern that matches the
1017     name it is given, it returns its number. Alternatively, if the name is NULL, it
1018     returns when it reaches a given numbered subpattern. This is used for forward
1019     references to subpatterns. We know that if (?P< is encountered, the name will
1020     be terminated by '>' because that is checked in the first pass.
1021 nigel 91
1022     Arguments:
1023 nigel 93 ptr current position in the pattern
1024 ph10 345 cd compile background data
1025 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1026     lorn name length, or subpattern number if name is NULL
1027     xmode TRUE if we are in /x mode
1028 nigel 91
1029     Returns: the number of the named subpattern, or -1 if not found
1030     */
1031    
1032     static int
1033 ph10 341 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
1034 nigel 93 BOOL xmode)
1035 nigel 91 {
1036     const uschar *thisname;
1037 ph10 341 int count = cd->bracount;
1038 nigel 93
1039 nigel 91 for (; *ptr != 0; ptr++)
1040     {
1041 nigel 93 int term;
1042    
1043     /* Skip over backslashed characters and also entire \Q...\E */
1044    
1045 ph10 391 if (*ptr == CHAR_BACKSLASH)
1046 nigel 93 {
1047     if (*(++ptr) == 0) return -1;
1048 ph10 391 if (*ptr == CHAR_Q) for (;;)
1049 nigel 93 {
1050 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1051 nigel 93 if (*ptr == 0) return -1;
1052 ph10 391 if (*(++ptr) == CHAR_E) break;
1053 nigel 93 }
1054     continue;
1055     }
1056    
1057 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1058     are handled for real. If the first character is '^', skip it. Also, if the
1059     first few characters (either before or after ^) are \Q\E or \E we skip them
1060 ph10 391 too. This makes for compatibility with Perl. Note the use of STR macros to
1061     encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1062 nigel 93
1063 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1064 nigel 93 {
1065 ph10 340 BOOL negate_class = FALSE;
1066     for (;;)
1067     {
1068     int c = *(++ptr);
1069 ph10 391 if (c == CHAR_BACKSLASH)
1070 ph10 340 {
1071 ph10 391 if (ptr[1] == CHAR_E)
1072     ptr++;
1073     else if (strncmp((const char *)ptr+1,
1074     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1075     ptr += 3;
1076     else
1077     break;
1078 ph10 340 }
1079 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1080 ph10 340 negate_class = TRUE;
1081     else break;
1082     }
1083    
1084     /* If the next character is ']', it is a data character that must be
1085 ph10 341 skipped, except in JavaScript compatibility mode. */
1086 ph10 345
1087 ph10 391 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1088     (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1089 ph10 345 ptr++;
1090    
1091 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1092 nigel 93 {
1093 ph10 220 if (*ptr == 0) return -1;
1094 ph10 391 if (*ptr == CHAR_BACKSLASH)
1095 nigel 93 {
1096     if (*(++ptr) == 0) return -1;
1097 ph10 391 if (*ptr == CHAR_Q) for (;;)
1098 nigel 93 {
1099 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1100 nigel 93 if (*ptr == 0) return -1;
1101 ph10 391 if (*(++ptr) == CHAR_E) break;
1102 nigel 93 }
1103     continue;
1104     }
1105     }
1106     continue;
1107     }
1108    
1109     /* Skip comments in /x mode */
1110    
1111 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1112 nigel 93 {
1113 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1114 nigel 93 if (*ptr == 0) return -1;
1115     continue;
1116     }
1117    
1118     /* An opening parens must now be a real metacharacter */
1119    
1120 ph10 391 if (*ptr != CHAR_LEFT_PARENTHESIS) continue;
1121     if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1122 nigel 93 {
1123     count++;
1124     if (name == NULL && count == lorn) return count;
1125     continue;
1126     }
1127    
1128     ptr += 2;
1129 ph10 391 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1130 nigel 93
1131     /* We have to disambiguate (?<! and (?<= from (?<name> */
1132    
1133 ph10 391 if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||
1134     ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)
1135 nigel 93 continue;
1136    
1137 nigel 91 count++;
1138 nigel 93
1139     if (name == NULL && count == lorn) return count;
1140     term = *ptr++;
1141 ph10 391 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1142 nigel 91 thisname = ptr;
1143 nigel 93 while (*ptr != term) ptr++;
1144     if (name != NULL && lorn == ptr - thisname &&
1145     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1146 nigel 91 return count;
1147     }
1148 nigel 93
1149 nigel 91 return -1;
1150     }
1151    
1152    
1153    
1154     /*************************************************
1155 nigel 77 * Find first significant op code *
1156     *************************************************/
1157    
1158     /* This is called by several functions that scan a compiled expression looking
1159     for a fixed first character, or an anchoring op code etc. It skips over things
1160     that do not influence this. For some calls, a change of option is important.
1161     For some calls, it makes sense to skip negative forward and all backward
1162     assertions, and also the \b assertion; for others it does not.
1163    
1164     Arguments:
1165     code pointer to the start of the group
1166     options pointer to external options
1167     optbit the option bit whose changing is significant, or
1168     zero if none are
1169     skipassert TRUE if certain assertions are to be skipped
1170    
1171     Returns: pointer to the first significant opcode
1172     */
1173    
1174     static const uschar*
1175     first_significant_code(const uschar *code, int *options, int optbit,
1176     BOOL skipassert)
1177     {
1178     for (;;)
1179     {
1180     switch ((int)*code)
1181     {
1182     case OP_OPT:
1183     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1184     *options = (int)code[1];
1185     code += 2;
1186     break;
1187    
1188     case OP_ASSERT_NOT:
1189     case OP_ASSERTBACK:
1190     case OP_ASSERTBACK_NOT:
1191     if (!skipassert) return code;
1192     do code += GET(code, 1); while (*code == OP_ALT);
1193     code += _pcre_OP_lengths[*code];
1194     break;
1195    
1196     case OP_WORD_BOUNDARY:
1197     case OP_NOT_WORD_BOUNDARY:
1198     if (!skipassert) return code;
1199     /* Fall through */
1200    
1201     case OP_CALLOUT:
1202     case OP_CREF:
1203 nigel 93 case OP_RREF:
1204     case OP_DEF:
1205 nigel 77 code += _pcre_OP_lengths[*code];
1206     break;
1207    
1208     default:
1209     return code;
1210     }
1211     }
1212     /* Control never reaches here */
1213     }
1214    
1215    
1216    
1217    
1218     /*************************************************
1219     * Find the fixed length of a pattern *
1220     *************************************************/
1221    
1222     /* Scan a pattern and compute the fixed length of subject that will match it,
1223     if the length is fixed. This is needed for dealing with backward assertions.
1224     In UTF8 mode, the result is in characters rather than bytes.
1225    
1226     Arguments:
1227     code points to the start of the pattern (the bracket)
1228     options the compiling options
1229    
1230     Returns: the fixed length, or -1 if there is no fixed length,
1231     or -2 if \C was encountered
1232     */
1233    
1234     static int
1235     find_fixedlength(uschar *code, int options)
1236     {
1237     int length = -1;
1238    
1239     register int branchlength = 0;
1240     register uschar *cc = code + 1 + LINK_SIZE;
1241    
1242     /* Scan along the opcodes for this branch. If we get to the end of the
1243     branch, check the length against that of the other branches. */
1244    
1245     for (;;)
1246     {
1247     int d;
1248     register int op = *cc;
1249     switch (op)
1250     {
1251 nigel 93 case OP_CBRA:
1252 nigel 77 case OP_BRA:
1253     case OP_ONCE:
1254     case OP_COND:
1255 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1256 nigel 77 if (d < 0) return d;
1257     branchlength += d;
1258     do cc += GET(cc, 1); while (*cc == OP_ALT);
1259     cc += 1 + LINK_SIZE;
1260     break;
1261    
1262     /* Reached end of a branch; if it's a ket it is the end of a nested
1263     call. If it's ALT it is an alternation in a nested call. If it is
1264     END it's the end of the outer call. All can be handled by the same code. */
1265    
1266     case OP_ALT:
1267     case OP_KET:
1268     case OP_KETRMAX:
1269     case OP_KETRMIN:
1270     case OP_END:
1271     if (length < 0) length = branchlength;
1272     else if (length != branchlength) return -1;
1273     if (*cc != OP_ALT) return length;
1274     cc += 1 + LINK_SIZE;
1275     branchlength = 0;
1276     break;
1277    
1278     /* Skip over assertive subpatterns */
1279    
1280     case OP_ASSERT:
1281     case OP_ASSERT_NOT:
1282     case OP_ASSERTBACK:
1283     case OP_ASSERTBACK_NOT:
1284     do cc += GET(cc, 1); while (*cc == OP_ALT);
1285     /* Fall through */
1286    
1287     /* Skip over things that don't match chars */
1288    
1289     case OP_REVERSE:
1290     case OP_CREF:
1291 nigel 93 case OP_RREF:
1292     case OP_DEF:
1293 nigel 77 case OP_OPT:
1294     case OP_CALLOUT:
1295     case OP_SOD:
1296     case OP_SOM:
1297     case OP_EOD:
1298     case OP_EODN:
1299     case OP_CIRC:
1300     case OP_DOLL:
1301     case OP_NOT_WORD_BOUNDARY:
1302     case OP_WORD_BOUNDARY:
1303     cc += _pcre_OP_lengths[*cc];
1304     break;
1305    
1306     /* Handle literal characters */
1307    
1308     case OP_CHAR:
1309     case OP_CHARNC:
1310 nigel 91 case OP_NOT:
1311 nigel 77 branchlength++;
1312     cc += 2;
1313     #ifdef SUPPORT_UTF8
1314     if ((options & PCRE_UTF8) != 0)
1315     {
1316     while ((*cc & 0xc0) == 0x80) cc++;
1317     }
1318     #endif
1319     break;
1320    
1321     /* Handle exact repetitions. The count is already in characters, but we
1322     need to skip over a multibyte character in UTF8 mode. */
1323    
1324     case OP_EXACT:
1325     branchlength += GET2(cc,1);
1326     cc += 4;
1327     #ifdef SUPPORT_UTF8
1328     if ((options & PCRE_UTF8) != 0)
1329     {
1330     while((*cc & 0x80) == 0x80) cc++;
1331     }
1332     #endif
1333     break;
1334    
1335     case OP_TYPEEXACT:
1336     branchlength += GET2(cc,1);
1337 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1338 nigel 77 cc += 4;
1339     break;
1340    
1341     /* Handle single-char matchers */
1342    
1343     case OP_PROP:
1344     case OP_NOTPROP:
1345 nigel 87 cc += 2;
1346 nigel 77 /* Fall through */
1347    
1348     case OP_NOT_DIGIT:
1349     case OP_DIGIT:
1350     case OP_NOT_WHITESPACE:
1351     case OP_WHITESPACE:
1352     case OP_NOT_WORDCHAR:
1353     case OP_WORDCHAR:
1354     case OP_ANY:
1355 ph10 342 case OP_ALLANY:
1356 nigel 77 branchlength++;
1357     cc++;
1358     break;
1359    
1360     /* The single-byte matcher isn't allowed */
1361    
1362     case OP_ANYBYTE:
1363     return -2;
1364    
1365     /* Check a class for variable quantification */
1366    
1367     #ifdef SUPPORT_UTF8
1368     case OP_XCLASS:
1369     cc += GET(cc, 1) - 33;
1370     /* Fall through */
1371     #endif
1372    
1373     case OP_CLASS:
1374     case OP_NCLASS:
1375     cc += 33;
1376    
1377     switch (*cc)
1378     {
1379     case OP_CRSTAR:
1380     case OP_CRMINSTAR:
1381     case OP_CRQUERY:
1382     case OP_CRMINQUERY:
1383     return -1;
1384    
1385     case OP_CRRANGE:
1386     case OP_CRMINRANGE:
1387     if (GET2(cc,1) != GET2(cc,3)) return -1;
1388     branchlength += GET2(cc,1);
1389     cc += 5;
1390     break;
1391    
1392     default:
1393     branchlength++;
1394     }
1395     break;
1396    
1397     /* Anything else is variable length */
1398    
1399     default:
1400     return -1;
1401     }
1402     }
1403     /* Control never gets here */
1404     }
1405    
1406    
1407    
1408    
1409     /*************************************************
1410     * Scan compiled regex for numbered bracket *
1411     *************************************************/
1412    
1413     /* This little function scans through a compiled pattern until it finds a
1414     capturing bracket with the given number.
1415    
1416     Arguments:
1417     code points to start of expression
1418     utf8 TRUE in UTF-8 mode
1419     number the required bracket number
1420    
1421     Returns: pointer to the opcode for the bracket, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_bracket(const uschar *code, BOOL utf8, int number)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91
1432     /* XCLASS is used for classes that cannot be represented just by a bit
1433     map. This includes negated single high-valued characters. The length in
1434     the table is zero; the actual length is stored in the compiled code. */
1435    
1436     if (c == OP_XCLASS) code += GET(code, 1);
1437    
1438 nigel 93 /* Handle capturing bracket */
1439 nigel 91
1440 nigel 93 else if (c == OP_CBRA)
1441 nigel 77 {
1442 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1443 nigel 77 if (n == number) return (uschar *)code;
1444 nigel 93 code += _pcre_OP_lengths[c];
1445 nigel 77 }
1446 nigel 91
1447 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1448     repeated character types, we have to test for \p and \P, which have an extra
1449 ph10 218 two bytes of parameters. */
1450 nigel 91
1451 nigel 77 else
1452     {
1453 ph10 218 switch(c)
1454     {
1455     case OP_TYPESTAR:
1456     case OP_TYPEMINSTAR:
1457     case OP_TYPEPLUS:
1458     case OP_TYPEMINPLUS:
1459     case OP_TYPEQUERY:
1460     case OP_TYPEMINQUERY:
1461     case OP_TYPEPOSSTAR:
1462     case OP_TYPEPOSPLUS:
1463     case OP_TYPEPOSQUERY:
1464     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1465 ph10 220 break;
1466 ph10 221
1467     case OP_TYPEUPTO:
1468     case OP_TYPEMINUPTO:
1469     case OP_TYPEEXACT:
1470     case OP_TYPEPOSUPTO:
1471     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1472     break;
1473 ph10 220 }
1474    
1475 ph10 218 /* Add in the fixed length from the table */
1476 ph10 220
1477 nigel 77 code += _pcre_OP_lengths[c];
1478 ph10 220
1479 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1480     a multi-byte character. The length in the table is a minimum, so we have to
1481     arrange to skip the extra bytes. */
1482 ph10 220
1483 ph10 107 #ifdef SUPPORT_UTF8
1484 nigel 77 if (utf8) switch(c)
1485     {
1486     case OP_CHAR:
1487     case OP_CHARNC:
1488     case OP_EXACT:
1489     case OP_UPTO:
1490     case OP_MINUPTO:
1491 nigel 93 case OP_POSUPTO:
1492 nigel 77 case OP_STAR:
1493     case OP_MINSTAR:
1494 nigel 93 case OP_POSSTAR:
1495 nigel 77 case OP_PLUS:
1496     case OP_MINPLUS:
1497 nigel 93 case OP_POSPLUS:
1498 nigel 77 case OP_QUERY:
1499     case OP_MINQUERY:
1500 nigel 93 case OP_POSQUERY:
1501     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1502 nigel 77 break;
1503     }
1504 ph10 369 #else
1505     (void)(utf8); /* Keep compiler happy by referencing function argument */
1506 ph10 111 #endif
1507 nigel 77 }
1508     }
1509     }
1510    
1511    
1512    
1513     /*************************************************
1514     * Scan compiled regex for recursion reference *
1515     *************************************************/
1516    
1517     /* This little function scans through a compiled pattern until it finds an
1518     instance of OP_RECURSE.
1519    
1520     Arguments:
1521     code points to start of expression
1522     utf8 TRUE in UTF-8 mode
1523    
1524     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1525     */
1526    
1527     static const uschar *
1528     find_recurse(const uschar *code, BOOL utf8)
1529     {
1530     for (;;)
1531     {
1532     register int c = *code;
1533     if (c == OP_END) return NULL;
1534 nigel 91 if (c == OP_RECURSE) return code;
1535 ph10 220
1536 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1537     map. This includes negated single high-valued characters. The length in
1538     the table is zero; the actual length is stored in the compiled code. */
1539    
1540     if (c == OP_XCLASS) code += GET(code, 1);
1541    
1542 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1543     repeated character types, we have to test for \p and \P, which have an extra
1544 ph10 218 two bytes of parameters. */
1545 nigel 91
1546 nigel 77 else
1547     {
1548 ph10 218 switch(c)
1549     {
1550     case OP_TYPESTAR:
1551     case OP_TYPEMINSTAR:
1552     case OP_TYPEPLUS:
1553     case OP_TYPEMINPLUS:
1554     case OP_TYPEQUERY:
1555     case OP_TYPEMINQUERY:
1556     case OP_TYPEPOSSTAR:
1557     case OP_TYPEPOSPLUS:
1558     case OP_TYPEPOSQUERY:
1559     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1560 ph10 220 break;
1561 ph10 221
1562     case OP_TYPEPOSUPTO:
1563     case OP_TYPEUPTO:
1564     case OP_TYPEMINUPTO:
1565     case OP_TYPEEXACT:
1566     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1567     break;
1568 ph10 220 }
1569    
1570 ph10 218 /* Add in the fixed length from the table */
1571    
1572 nigel 77 code += _pcre_OP_lengths[c];
1573 ph10 220
1574 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1575     by a multi-byte character. The length in the table is a minimum, so we have
1576     to arrange to skip the extra bytes. */
1577 ph10 220
1578 ph10 107 #ifdef SUPPORT_UTF8
1579 nigel 77 if (utf8) switch(c)
1580     {
1581     case OP_CHAR:
1582     case OP_CHARNC:
1583     case OP_EXACT:
1584     case OP_UPTO:
1585     case OP_MINUPTO:
1586 nigel 93 case OP_POSUPTO:
1587 nigel 77 case OP_STAR:
1588     case OP_MINSTAR:
1589 nigel 93 case OP_POSSTAR:
1590 nigel 77 case OP_PLUS:
1591     case OP_MINPLUS:
1592 nigel 93 case OP_POSPLUS:
1593 nigel 77 case OP_QUERY:
1594     case OP_MINQUERY:
1595 nigel 93 case OP_POSQUERY:
1596     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1597 nigel 77 break;
1598     }
1599 ph10 369 #else
1600     (void)(utf8); /* Keep compiler happy by referencing function argument */
1601 ph10 111 #endif
1602 nigel 77 }
1603     }
1604     }
1605    
1606    
1607    
1608     /*************************************************
1609     * Scan compiled branch for non-emptiness *
1610     *************************************************/
1611    
1612     /* This function scans through a branch of a compiled pattern to see whether it
1613 nigel 93 can match the empty string or not. It is called from could_be_empty()
1614     below and from compile_branch() when checking for an unlimited repeat of a
1615     group that can match nothing. Note that first_significant_code() skips over
1616 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1617     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1618     bracket whose current branch will already have been scanned.
1619 nigel 77
1620     Arguments:
1621     code points to start of search
1622     endcode points to where to stop
1623     utf8 TRUE if in UTF8 mode
1624    
1625     Returns: TRUE if what is matched could be empty
1626     */
1627    
1628     static BOOL
1629     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1630     {
1631     register int c;
1632 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1633 nigel 77 code < endcode;
1634     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1635     {
1636     const uschar *ccode;
1637    
1638     c = *code;
1639 ph10 286
1640     /* Skip over forward assertions; the other assertions are skipped by
1641 ph10 282 first_significant_code() with a TRUE final argument. */
1642 ph10 286
1643 ph10 282 if (c == OP_ASSERT)
1644 ph10 286 {
1645 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1646     c = *code;
1647     continue;
1648 ph10 286 }
1649 ph10 172
1650 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1651 nigel 77
1652 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1653 ph10 170 {
1654 ph10 172 code += _pcre_OP_lengths[c];
1655 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1656     c = *code;
1657     continue;
1658     }
1659    
1660     /* For other groups, scan the branches. */
1661 ph10 172
1662 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1663 nigel 77 {
1664     BOOL empty_branch;
1665     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1666    
1667     /* Scan a closed bracket */
1668    
1669     empty_branch = FALSE;
1670     do
1671     {
1672     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1673     empty_branch = TRUE;
1674     code += GET(code, 1);
1675     }
1676     while (*code == OP_ALT);
1677     if (!empty_branch) return FALSE; /* All branches are non-empty */
1678 ph10 172 c = *code;
1679 nigel 93 continue;
1680 nigel 77 }
1681    
1682 nigel 93 /* Handle the other opcodes */
1683    
1684     switch (c)
1685 nigel 77 {
1686 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1687     cannot be represented just by a bit map. This includes negated single
1688     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1689 ph10 220 actual length is stored in the compiled code, so we must update "code"
1690 ph10 216 here. */
1691 nigel 77
1692     #ifdef SUPPORT_UTF8
1693     case OP_XCLASS:
1694 ph10 216 ccode = code += GET(code, 1);
1695 nigel 77 goto CHECK_CLASS_REPEAT;
1696     #endif
1697    
1698     case OP_CLASS:
1699     case OP_NCLASS:
1700     ccode = code + 33;
1701    
1702     #ifdef SUPPORT_UTF8
1703     CHECK_CLASS_REPEAT:
1704     #endif
1705    
1706     switch (*ccode)
1707     {
1708     case OP_CRSTAR: /* These could be empty; continue */
1709     case OP_CRMINSTAR:
1710     case OP_CRQUERY:
1711     case OP_CRMINQUERY:
1712     break;
1713    
1714     default: /* Non-repeat => class must match */
1715     case OP_CRPLUS: /* These repeats aren't empty */
1716     case OP_CRMINPLUS:
1717     return FALSE;
1718    
1719     case OP_CRRANGE:
1720     case OP_CRMINRANGE:
1721     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1722     break;
1723     }
1724     break;
1725    
1726     /* Opcodes that must match a character */
1727    
1728     case OP_PROP:
1729     case OP_NOTPROP:
1730     case OP_EXTUNI:
1731     case OP_NOT_DIGIT:
1732     case OP_DIGIT:
1733     case OP_NOT_WHITESPACE:
1734     case OP_WHITESPACE:
1735     case OP_NOT_WORDCHAR:
1736     case OP_WORDCHAR:
1737     case OP_ANY:
1738 ph10 345 case OP_ALLANY:
1739 nigel 77 case OP_ANYBYTE:
1740     case OP_CHAR:
1741     case OP_CHARNC:
1742     case OP_NOT:
1743     case OP_PLUS:
1744     case OP_MINPLUS:
1745 nigel 93 case OP_POSPLUS:
1746 nigel 77 case OP_EXACT:
1747     case OP_NOTPLUS:
1748     case OP_NOTMINPLUS:
1749 nigel 93 case OP_NOTPOSPLUS:
1750 nigel 77 case OP_NOTEXACT:
1751     case OP_TYPEPLUS:
1752     case OP_TYPEMINPLUS:
1753 nigel 93 case OP_TYPEPOSPLUS:
1754 nigel 77 case OP_TYPEEXACT:
1755     return FALSE;
1756 ph10 227
1757     /* These are going to continue, as they may be empty, but we have to
1758     fudge the length for the \p and \P cases. */
1759    
1760 ph10 224 case OP_TYPESTAR:
1761     case OP_TYPEMINSTAR:
1762     case OP_TYPEPOSSTAR:
1763     case OP_TYPEQUERY:
1764     case OP_TYPEMINQUERY:
1765     case OP_TYPEPOSQUERY:
1766     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1767 ph10 227 break;
1768    
1769 ph10 224 /* Same for these */
1770 ph10 227
1771 ph10 224 case OP_TYPEUPTO:
1772     case OP_TYPEMINUPTO:
1773     case OP_TYPEPOSUPTO:
1774     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1775     break;
1776 nigel 77
1777     /* End of branch */
1778    
1779     case OP_KET:
1780     case OP_KETRMAX:
1781     case OP_KETRMIN:
1782     case OP_ALT:
1783     return TRUE;
1784    
1785 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1786     MINUPTO, and POSUPTO may be followed by a multibyte character */
1787 nigel 77
1788     #ifdef SUPPORT_UTF8
1789     case OP_STAR:
1790     case OP_MINSTAR:
1791 nigel 93 case OP_POSSTAR:
1792 nigel 77 case OP_QUERY:
1793     case OP_MINQUERY:
1794 nigel 93 case OP_POSQUERY:
1795 nigel 77 case OP_UPTO:
1796     case OP_MINUPTO:
1797 nigel 93 case OP_POSUPTO:
1798 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1799     break;
1800     #endif
1801     }
1802     }
1803    
1804     return TRUE;
1805     }
1806    
1807    
1808    
1809     /*************************************************
1810     * Scan compiled regex for non-emptiness *
1811     *************************************************/
1812    
1813     /* This function is called to check for left recursive calls. We want to check
1814     the current branch of the current pattern to see if it could match the empty
1815     string. If it could, we must look outwards for branches at other levels,
1816     stopping when we pass beyond the bracket which is the subject of the recursion.
1817    
1818     Arguments:
1819     code points to start of the recursion
1820     endcode points to where to stop (current RECURSE item)
1821     bcptr points to the chain of current (unclosed) branch starts
1822     utf8 TRUE if in UTF-8 mode
1823    
1824     Returns: TRUE if what is matched could be empty
1825     */
1826    
1827     static BOOL
1828     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1829     BOOL utf8)
1830     {
1831     while (bcptr != NULL && bcptr->current >= code)
1832     {
1833     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1834     bcptr = bcptr->outer;
1835     }
1836     return TRUE;
1837     }
1838    
1839    
1840    
1841     /*************************************************
1842     * Check for POSIX class syntax *
1843     *************************************************/
1844    
1845     /* This function is called when the sequence "[:" or "[." or "[=" is
1846 ph10 295 encountered in a character class. It checks whether this is followed by a
1847 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1848 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1849 nigel 77
1850 ph10 298 Originally, this function only recognized a sequence of letters between the
1851     terminators, but it seems that Perl recognizes any sequence of characters,
1852     though of course unknown POSIX names are subsequently rejected. Perl gives an
1853     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1854     didn't consider this to be a POSIX class. Likewise for [:1234:].
1855 ph10 295
1856 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1857     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1858     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1859     below handles the special case of \], but does not try to do any other escape
1860     processing. This makes it different from Perl for cases such as [:l\ower:]
1861 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1862 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1863 ph10 295 I think.
1864    
1865     Arguments:
1866 nigel 77 ptr pointer to the initial [
1867     endptr where to return the end pointer
1868    
1869     Returns: TRUE or FALSE
1870     */
1871    
1872     static BOOL
1873 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1874 nigel 77 {
1875     int terminator; /* Don't combine these lines; the Solaris cc */
1876     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1877 ph10 295 for (++ptr; *ptr != 0; ptr++)
1878 nigel 77 {
1879 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
1880 ph10 298 {
1881 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
1882     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
1883 ph10 295 {
1884     *endptr = ptr;
1885     return TRUE;
1886 ph10 298 }
1887     }
1888     }
1889 nigel 77 return FALSE;
1890     }
1891    
1892    
1893    
1894    
1895     /*************************************************
1896     * Check POSIX class name *
1897     *************************************************/
1898    
1899     /* This function is called to check the name given in a POSIX-style class entry
1900     such as [:alnum:].
1901    
1902     Arguments:
1903     ptr points to the first letter
1904     len the length of the name
1905    
1906     Returns: a value representing the name, or -1 if unknown
1907     */
1908    
1909     static int
1910     check_posix_name(const uschar *ptr, int len)
1911     {
1912 ph10 240 const char *pn = posix_names;
1913 nigel 77 register int yield = 0;
1914     while (posix_name_lengths[yield] != 0)
1915     {
1916     if (len == posix_name_lengths[yield] &&
1917 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1918 ph10 243 pn += posix_name_lengths[yield] + 1;
1919 nigel 77 yield++;
1920     }
1921     return -1;
1922     }
1923    
1924    
1925     /*************************************************
1926     * Adjust OP_RECURSE items in repeated group *
1927     *************************************************/
1928    
1929     /* OP_RECURSE items contain an offset from the start of the regex to the group
1930     that is referenced. This means that groups can be replicated for fixed
1931     repetition simply by copying (because the recursion is allowed to refer to
1932     earlier groups that are outside the current group). However, when a group is
1933 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1934     inserted before it, after it has been compiled. This means that any OP_RECURSE
1935     items within it that refer to the group itself or any contained groups have to
1936     have their offsets adjusted. That one of the jobs of this function. Before it
1937     is called, the partially compiled regex must be temporarily terminated with
1938     OP_END.
1939 nigel 77
1940 nigel 93 This function has been extended with the possibility of forward references for
1941     recursions and subroutine calls. It must also check the list of such references
1942     for the group we are dealing with. If it finds that one of the recursions in
1943     the current group is on this list, it adjusts the offset in the list, not the
1944     value in the reference (which is a group number).
1945    
1946 nigel 77 Arguments:
1947     group points to the start of the group
1948     adjust the amount by which the group is to be moved
1949     utf8 TRUE in UTF-8 mode
1950     cd contains pointers to tables etc.
1951 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1952 nigel 77
1953     Returns: nothing
1954     */
1955    
1956     static void
1957 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1958     uschar *save_hwm)
1959 nigel 77 {
1960     uschar *ptr = group;
1961 ph10 224
1962 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1963     {
1964 nigel 93 int offset;
1965     uschar *hc;
1966    
1967     /* See if this recursion is on the forward reference list. If so, adjust the
1968     reference. */
1969 ph10 345
1970 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1971     {
1972     offset = GET(hc, 0);
1973     if (cd->start_code + offset == ptr + 1)
1974     {
1975     PUT(hc, 0, offset + adjust);
1976     break;
1977     }
1978     }
1979    
1980     /* Otherwise, adjust the recursion offset if it's after the start of this
1981     group. */
1982    
1983     if (hc >= cd->hwm)
1984     {
1985     offset = GET(ptr, 1);
1986     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1987     }
1988    
1989 nigel 77 ptr += 1 + LINK_SIZE;
1990     }
1991     }
1992    
1993    
1994    
1995     /*************************************************
1996     * Insert an automatic callout point *
1997     *************************************************/
1998    
1999     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2000     callout points before each pattern item.
2001    
2002     Arguments:
2003     code current code pointer
2004     ptr current pattern pointer
2005     cd pointers to tables etc
2006    
2007     Returns: new code pointer
2008     */
2009    
2010     static uschar *
2011     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2012     {
2013     *code++ = OP_CALLOUT;
2014     *code++ = 255;
2015     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2016     PUT(code, LINK_SIZE, 0); /* Default length */
2017     return code + 2*LINK_SIZE;
2018     }
2019    
2020    
2021    
2022     /*************************************************
2023     * Complete a callout item *
2024     *************************************************/
2025    
2026     /* A callout item contains the length of the next item in the pattern, which
2027     we can't fill in till after we have reached the relevant point. This is used
2028     for both automatic and manual callouts.
2029    
2030     Arguments:
2031     previous_callout points to previous callout item
2032     ptr current pattern pointer
2033     cd pointers to tables etc
2034    
2035     Returns: nothing
2036     */
2037    
2038     static void
2039     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2040     {
2041     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2042     PUT(previous_callout, 2 + LINK_SIZE, length);
2043     }
2044    
2045    
2046    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range, in UTF-8 mode
with UCP support. It scans upwards from the start for the first character whose
other case differs from itself, then extends for as long as the other-case
values of successive characters are themselves consecutive. Each call returns
one such other-case range and updates the start value so that the next call
carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = 0;
unsigned int expected;

/* Find the first character in the range that has a different other case. */

while (c <= d)
  {
  first_other = UCD_OTHERCASE(c);
  if (first_other != c) break;
  c++;
  }

if (c > d) return FALSE;       /* Nothing left in the range */

*ocptr = first_other;
expected = first_other + 1;

/* Extend while the other-case values keep going up by one. */

for (c++; c <= d && UCD_OTHERCASE(c) == expected; c++) expected++;

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2092    
2093    
2094 nigel 93
2095 nigel 77 /*************************************************
2096 nigel 93 * Check if auto-possessifying is possible *
2097     *************************************************/
2098    
2099     /* This function is called for unlimited repeats of certain items, to see
2100     whether the next thing could possibly match the repeated item. If not, it makes
2101     sense to automatically possessify the repeated item.
2102    
2103     Arguments:
2104     op_code the repeated op code
2105     this data for this item, depends on the opcode
2106     utf8 TRUE in UTF-8 mode
2107     utf8_char used for utf8 character bytes, NULL if not relevant
2108     ptr next character in pattern
2109     options options bits
2110     cd contains pointers to tables etc.
2111    
2112     Returns: TRUE if possessifying is wanted
2113     */
2114    
2115     static BOOL
2116     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2117     const uschar *ptr, int options, compile_data *cd)
2118     {
2119     int next;
2120    
2121     /* Skip whitespace and comments in extended mode */
2122    
2123     if ((options & PCRE_EXTENDED) != 0)
2124     {
2125     for (;;)
2126     {
2127     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2128 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2129 nigel 93 {
2130     while (*(++ptr) != 0)
2131     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2132     }
2133     else break;
2134     }
2135     }
2136    
2137     /* If the next item is one that we can handle, get its value. A non-negative
2138     value is a character, a negative value is an escape value. */
2139    
2140 ph10 391 if (*ptr == CHAR_BACKSLASH)
2141 nigel 93 {
2142     int temperrorcode = 0;
2143     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2144     if (temperrorcode != 0) return FALSE;
2145     ptr++; /* Point after the escape sequence */
2146     }
2147    
2148     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2149     {
2150     #ifdef SUPPORT_UTF8
2151     if (utf8) { GETCHARINC(next, ptr); } else
2152     #endif
2153     next = *ptr++;
2154     }
2155    
2156     else return FALSE;
2157    
2158     /* Skip whitespace and comments in extended mode */
2159    
2160     if ((options & PCRE_EXTENDED) != 0)
2161     {
2162     for (;;)
2163     {
2164     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2165 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2166 nigel 93 {
2167     while (*(++ptr) != 0)
2168     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2169     }
2170     else break;
2171     }
2172     }
2173    
2174     /* If the next thing is itself optional, we have to give up. */
2175    
2176 ph10 391 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2177     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2178     return FALSE;
2179 nigel 93
2180     /* Now compare the next item with the previous opcode. If the previous is a
2181     positive single character match, "item" either contains the character or, if
2182     "item" is greater than 127 in utf8 mode, the character's bytes are in
2183     utf8_char. */
2184    
2185    
2186     /* Handle cases when the next item is a character. */
2187    
2188     if (next >= 0) switch(op_code)
2189     {
2190     case OP_CHAR:
2191     #ifdef SUPPORT_UTF8
2192     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2193 ph10 369 #else
2194     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2195 nigel 93 #endif
2196     return item != next;
2197    
2198     /* For CHARNC (caseless character) we must check the other case. If we have
2199     Unicode property support, we can use it to test the other case of
2200     high-valued characters. */
2201    
2202     case OP_CHARNC:
2203     #ifdef SUPPORT_UTF8
2204     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2205     #endif
2206     if (item == next) return FALSE;
2207     #ifdef SUPPORT_UTF8
2208     if (utf8)
2209     {
2210     unsigned int othercase;
2211     if (next < 128) othercase = cd->fcc[next]; else
2212     #ifdef SUPPORT_UCP
2213 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2214 nigel 93 #else
2215     othercase = NOTACHAR;
2216     #endif
2217     return (unsigned int)item != othercase;
2218     }
2219     else
2220     #endif /* SUPPORT_UTF8 */
2221     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2222    
2223     /* For OP_NOT, "item" must be a single-byte character. */
2224    
2225     case OP_NOT:
2226     if (item == next) return TRUE;
2227     if ((options & PCRE_CASELESS) == 0) return FALSE;
2228     #ifdef SUPPORT_UTF8
2229     if (utf8)
2230     {
2231     unsigned int othercase;
2232     if (next < 128) othercase = cd->fcc[next]; else
2233     #ifdef SUPPORT_UCP
2234 ph10 349 othercase = UCD_OTHERCASE(next);
2235 nigel 93 #else
2236     othercase = NOTACHAR;
2237     #endif
2238     return (unsigned int)item == othercase;
2239     }
2240     else
2241     #endif /* SUPPORT_UTF8 */
2242     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2243    
2244     case OP_DIGIT:
2245     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2246    
2247     case OP_NOT_DIGIT:
2248     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2249    
2250     case OP_WHITESPACE:
2251     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2252    
2253     case OP_NOT_WHITESPACE:
2254     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2255    
2256     case OP_WORDCHAR:
2257     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2258    
2259     case OP_NOT_WORDCHAR:
2260     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2261    
2262 ph10 180 case OP_HSPACE:
2263     case OP_NOT_HSPACE:
2264     switch(next)
2265     {
2266     case 0x09:
2267     case 0x20:
2268     case 0xa0:
2269     case 0x1680:
2270     case 0x180e:
2271     case 0x2000:
2272     case 0x2001:
2273     case 0x2002:
2274     case 0x2003:
2275     case 0x2004:
2276     case 0x2005:
2277     case 0x2006:
2278     case 0x2007:
2279     case 0x2008:
2280     case 0x2009:
2281     case 0x200A:
2282     case 0x202f:
2283     case 0x205f:
2284     case 0x3000:
2285     return op_code != OP_HSPACE;
2286     default:
2287     return op_code == OP_HSPACE;
2288     }
2289    
2290     case OP_VSPACE:
2291     case OP_NOT_VSPACE:
2292     switch(next)
2293     {
2294     case 0x0a:
2295     case 0x0b:
2296     case 0x0c:
2297     case 0x0d:
2298     case 0x85:
2299     case 0x2028:
2300     case 0x2029:
2301     return op_code != OP_VSPACE;
2302     default:
2303     return op_code == OP_VSPACE;
2304     }
2305    
2306 nigel 93 default:
2307     return FALSE;
2308     }
2309    
2310    
2311     /* Handle the case when the next item is \d, \s, etc. */
2312    
2313     switch(op_code)
2314     {
2315     case OP_CHAR:
2316     case OP_CHARNC:
2317     #ifdef SUPPORT_UTF8
2318     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2319     #endif
2320     switch(-next)
2321     {
2322     case ESC_d:
2323     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2324    
2325     case ESC_D:
2326     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2327    
2328     case ESC_s:
2329     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2330    
2331     case ESC_S:
2332     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2333    
2334     case ESC_w:
2335     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2336    
2337     case ESC_W:
2338     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2339 ph10 182
2340 ph10 180 case ESC_h:
2341     case ESC_H:
2342     switch(item)
2343     {
2344     case 0x09:
2345     case 0x20:
2346     case 0xa0:
2347     case 0x1680:
2348     case 0x180e:
2349     case 0x2000:
2350     case 0x2001:
2351     case 0x2002:
2352     case 0x2003:
2353     case 0x2004:
2354     case 0x2005:
2355     case 0x2006:
2356     case 0x2007:
2357     case 0x2008:
2358     case 0x2009:
2359     case 0x200A:
2360     case 0x202f:
2361     case 0x205f:
2362     case 0x3000:
2363     return -next != ESC_h;
2364     default:
2365     return -next == ESC_h;
2366 ph10 182 }
2367    
2368 ph10 180 case ESC_v:
2369     case ESC_V:
2370     switch(item)
2371     {
2372     case 0x0a:
2373     case 0x0b:
2374     case 0x0c:
2375     case 0x0d:
2376     case 0x85:
2377     case 0x2028:
2378     case 0x2029:
2379     return -next != ESC_v;
2380     default:
2381     return -next == ESC_v;
2382 ph10 182 }
2383 nigel 93
2384     default:
2385     return FALSE;
2386     }
2387    
2388     case OP_DIGIT:
2389 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2390     next == -ESC_h || next == -ESC_v;
2391 nigel 93
2392     case OP_NOT_DIGIT:
2393     return next == -ESC_d;
2394    
2395     case OP_WHITESPACE:
2396     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2397    
2398     case OP_NOT_WHITESPACE:
2399 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2400 nigel 93
2401 ph10 180 case OP_HSPACE:
2402     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2403    
2404     case OP_NOT_HSPACE:
2405     return next == -ESC_h;
2406 ph10 182
2407 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2408 ph10 182 case OP_VSPACE:
2409 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2410    
2411     case OP_NOT_VSPACE:
2412 ph10 182 return next == -ESC_v;
2413 ph10 180
2414 nigel 93 case OP_WORDCHAR:
2415 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2416 nigel 93
2417     case OP_NOT_WORDCHAR:
2418     return next == -ESC_w || next == -ESC_d;
2419 ph10 182
2420 nigel 93 default:
2421     return FALSE;
2422     }
2423    
2424     /* Control does not reach here */
2425     }
2426    
2427    
2428    
2429     /*************************************************
2430 nigel 77 * Compile one branch *
2431     *************************************************/
2432    
2433 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2434 nigel 77 changed during the branch, the pointer is used to change the external options
2435 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2436     to find out the amount of memory needed, as well as during the real compile
2437     phase. The value of lengthptr distinguishes the two phases.
2438 nigel 77
2439     Arguments:
2440     optionsptr pointer to the option bits
2441     codeptr points to the pointer to the current code point
2442     ptrptr points to the current pattern pointer
2443     errorcodeptr points to error code variable
2444     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2445     reqbyteptr set to the last literal character required, else < 0
2446     bcptr points to current branch chain
2447     cd contains pointers to tables etc.
2448 nigel 93 lengthptr NULL during the real compile phase
2449     points to length accumulator during pre-compile phase
2450 nigel 77
2451     Returns: TRUE on success
2452     FALSE, with *errorcodeptr set non-zero on error
2453     */
2454    
2455     static BOOL
2456 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2457     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2458     compile_data *cd, int *lengthptr)
2459 nigel 77 {
2460     int repeat_type, op_type;
2461     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2462     int bravalue = 0;
2463     int greedy_default, greedy_non_default;
2464     int firstbyte, reqbyte;
2465     int zeroreqbyte, zerofirstbyte;
2466     int req_caseopt, reqvary, tempreqvary;
2467     int options = *optionsptr;
2468     int after_manual_callout = 0;
2469 nigel 93 int length_prevgroup = 0;
2470 nigel 77 register int c;
2471     register uschar *code = *codeptr;
2472 nigel 93 uschar *last_code = code;
2473     uschar *orig_code = code;
2474 nigel 77 uschar *tempcode;
2475     BOOL inescq = FALSE;
2476     BOOL groupsetfirstbyte = FALSE;
2477     const uschar *ptr = *ptrptr;
2478     const uschar *tempptr;
2479     uschar *previous = NULL;
2480     uschar *previous_callout = NULL;
2481 nigel 93 uschar *save_hwm = NULL;
2482 nigel 77 uschar classbits[32];
2483    
2484     #ifdef SUPPORT_UTF8
2485     BOOL class_utf8;
2486     BOOL utf8 = (options & PCRE_UTF8) != 0;
2487     uschar *class_utf8data;
2488 ph10 300 uschar *class_utf8data_base;
2489 nigel 77 uschar utf8_char[6];
2490     #else
2491     BOOL utf8 = FALSE;
2492 nigel 93 uschar *utf8_char = NULL;
2493 nigel 77 #endif
2494    
2495 nigel 93 #ifdef DEBUG
2496     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2497     #endif
2498    
2499 nigel 77 /* Set up the default and non-default settings for greediness */
2500    
2501     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2502     greedy_non_default = greedy_default ^ 1;
2503    
2504     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2505     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2506     matches a non-fixed char first char; reqbyte just remains unset if we never
2507     find one.
2508    
2509     When we hit a repeat whose minimum is zero, we may have to adjust these values
2510     to take the zero repeat into account. This is implemented by setting them to
2511     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2512     item types that can be repeated set these backoff variables appropriately. */
2513    
2514     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2515    
2516     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2517     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2518     value > 255. It is added into the firstbyte or reqbyte variables to record the
2519     case status of the value. This is used only for ASCII characters. */
2520    
2521     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2522    
2523     /* Switch on next character until the end of the branch */
2524    
2525     for (;; ptr++)
2526     {
2527     BOOL negate_class;
2528 ph10 286 BOOL should_flip_negation;
2529 nigel 77 BOOL possessive_quantifier;
2530     BOOL is_quantifier;
2531 nigel 93 BOOL is_recurse;
2532 ph10 180 BOOL reset_bracount;
2533 nigel 77 int class_charcount;
2534     int class_lastchar;
2535     int newoptions;
2536     int recno;
2537 ph10 172 int refsign;
2538 nigel 77 int skipbytes;
2539     int subreqbyte;
2540     int subfirstbyte;
2541 nigel 93 int terminator;
2542 nigel 77 int mclength;
2543     uschar mcbuffer[8];
2544    
2545 nigel 93 /* Get next byte in the pattern */
2546 nigel 77
2547     c = *ptr;
2548 ph10 345
2549 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2550     previous cycle of this loop. */
2551    
2552     if (lengthptr != NULL)
2553     {
2554     #ifdef DEBUG
2555     if (code > cd->hwm) cd->hwm = code; /* High water info */
2556     #endif
2557     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2558     {
2559     *errorcodeptr = ERR52;
2560     goto FAILED;
2561     }
2562    
2563     /* There is at least one situation where code goes backwards: this is the
2564     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2565     the class is simply eliminated. However, it is created first, so we have to
2566     allow memory for it. Therefore, don't ever reduce the length at this point.
2567     */
2568    
2569     if (code < last_code) code = last_code;
2570 ph10 202
2571     /* Paranoid check for integer overflow */
2572    
2573     if (OFLOW_MAX - *lengthptr < code - last_code)
2574     {
2575     *errorcodeptr = ERR20;
2576     goto FAILED;
2577     }
2578    
2579 nigel 93 *lengthptr += code - last_code;
2580     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2581    
2582     /* If "previous" is set and it is not at the start of the work space, move
2583     it back to there, in order to avoid filling up the work space. Otherwise,
2584     if "previous" is NULL, reset the current code pointer to the start. */
2585    
2586     if (previous != NULL)
2587     {
2588     if (previous > orig_code)
2589     {
2590     memmove(orig_code, previous, code - previous);
2591     code -= previous - orig_code;
2592     previous = orig_code;
2593     }
2594     }
2595     else code = orig_code;
2596    
2597     /* Remember where this code item starts so we can pick up the length
2598     next time round. */
2599    
2600     last_code = code;
2601     }
2602    
2603     /* In the real compile phase, just check the workspace used by the forward
2604     reference list. */
2605    
2606     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2607     {
2608     *errorcodeptr = ERR52;
2609     goto FAILED;
2610     }
2611    
2612 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2613    
2614     if (inescq && c != 0)
2615     {
2616 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2617 nigel 77 {
2618     inescq = FALSE;
2619     ptr++;
2620     continue;
2621     }
2622     else
2623     {
2624     if (previous_callout != NULL)
2625     {
2626 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2627     complete_callout(previous_callout, ptr, cd);
2628 nigel 77 previous_callout = NULL;
2629     }
2630     if ((options & PCRE_AUTO_CALLOUT) != 0)
2631     {
2632     previous_callout = code;
2633     code = auto_callout(code, ptr, cd);
2634     }
2635     goto NORMAL_CHAR;
2636     }
2637     }
2638    
2639     /* Fill in length of a previous callout, except when the next thing is
2640     a quantifier. */
2641    
2642 ph10 391 is_quantifier =
2643     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2644     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2645 nigel 77
2646     if (!is_quantifier && previous_callout != NULL &&
2647     after_manual_callout-- <= 0)
2648     {
2649 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2650     complete_callout(previous_callout, ptr, cd);
2651 nigel 77 previous_callout = NULL;
2652     }
2653    
2654     /* In extended mode, skip white space and comments */
2655    
2656     if ((options & PCRE_EXTENDED) != 0)
2657     {
2658     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2659 ph10 391 if (c == CHAR_NUMBER_SIGN)
2660 nigel 77 {
2661 nigel 93 while (*(++ptr) != 0)
2662 nigel 91 {
2663 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2664 nigel 91 }
2665 nigel 93 if (*ptr != 0) continue;
2666    
2667 nigel 91 /* Else fall through to handle end of string */
2668     c = 0;
2669 nigel 77 }
2670     }
2671    
2672     /* No auto callout for quantifiers. */
2673    
2674     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2675     {
2676     previous_callout = code;
2677     code = auto_callout(code, ptr, cd);
2678     }
2679    
2680     switch(c)
2681     {
2682 nigel 93 /* ===================================================================*/
2683     case 0: /* The branch terminates at string end */
2684 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2685     case CHAR_RIGHT_PARENTHESIS:
2686 nigel 77 *firstbyteptr = firstbyte;
2687     *reqbyteptr = reqbyte;
2688     *codeptr = code;
2689     *ptrptr = ptr;
2690 nigel 93 if (lengthptr != NULL)
2691     {
2692 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2693     {
2694     *errorcodeptr = ERR20;
2695     goto FAILED;
2696     }
2697 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2698     DPRINTF((">> end branch\n"));
2699     }
2700 nigel 77 return TRUE;
2701    
2702 nigel 93
2703     /* ===================================================================*/
2704 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2705     the setting of any following char as a first character. */
2706    
2707 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2708 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2709     {
2710     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2711     }
2712     previous = NULL;
2713     *code++ = OP_CIRC;
2714     break;
2715    
2716 ph10 391 case CHAR_DOLLAR_SIGN:
2717 nigel 77 previous = NULL;
2718     *code++ = OP_DOLL;
2719     break;
2720    
2721     /* There can never be a first char if '.' is first, whatever happens about
2722     repeats. The value of reqbyte doesn't change either. */
2723    
2724 ph10 391 case CHAR_DOT:
2725 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2726     zerofirstbyte = firstbyte;
2727     zeroreqbyte = reqbyte;
2728     previous = code;
2729 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2730 nigel 77 break;
2731    
2732 nigel 93
2733     /* ===================================================================*/
2734 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2735     32-byte bitmap of the permitted characters, except in the special case
2736     where there is only one such character. For negated classes, we build the
2737     map as usual, then invert it at the end. However, we use a different opcode
2738     so that data characters > 255 can be handled correctly.
2739 nigel 77
2740     If the class contains characters outside the 0-255 range, a different
2741     opcode is compiled. It may optionally have a bit map for characters < 256,
2742     but those above are explicitly listed afterwards. A flag byte tells
2743     whether the bitmap is present, and whether this is a negated class or not.
2744 ph10 345
2745 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2746     default (Perl) mode, it is treated as a data character. */
2747 ph10 345
2748 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2749 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2750     {
2751     *errorcodeptr = ERR64;
2752 ph10 345 goto FAILED;
2753 ph10 336 }
2754 ph10 345 goto NORMAL_CHAR;
2755 nigel 77
2756 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2757 nigel 77 previous = code;
2758    
2759     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2760     they are encountered at the top level, so we'll do that too. */
2761    
2762 ph10 391 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2763     ptr[1] == CHAR_EQUALS_SIGN) &&
2764 ph10 295 check_posix_syntax(ptr, &tempptr))
2765 nigel 77 {
2766 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2767 nigel 77 goto FAILED;
2768     }
2769    
2770 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2771 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2772 ph10 205 skip them too. This makes for compatibility with Perl. */
2773 ph10 208
2774 ph10 205 negate_class = FALSE;
2775     for (;;)
2776 nigel 77 {
2777     c = *(++ptr);
2778 ph10 391 if (c == CHAR_BACKSLASH)
2779 ph10 205 {
2780 ph10 391 if (ptr[1] == CHAR_E)
2781     ptr++;
2782     else if (strncmp((const char *)ptr+1,
2783     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2784     ptr += 3;
2785     else
2786     break;
2787 ph10 205 }
2788 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2789 ph10 205 negate_class = TRUE;
2790     else break;
2791 ph10 208 }
2792 ph10 345
2793     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2794     an initial ']' is taken as a data character -- the code below handles
2795 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2796     [^] must match any character, so generate OP_ALLANY. */
2797 ph10 345
2798 ph10 391 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2799     (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2800 ph10 341 {
2801     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2802     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2803     zerofirstbyte = firstbyte;
2804     break;
2805 ph10 345 }
2806 nigel 77
2807 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2808     negation flag at the end, so that support for characters > 255 works
2809 ph10 264 correctly (they are all included in the class). */
2810    
2811     should_flip_negation = FALSE;
2812    
2813 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2814 nigel 93 of just a single character (as long as it's < 256). However, for higher
2815     valued UTF-8 characters, we don't yet do any optimization. */
2816 nigel 77
2817     class_charcount = 0;
2818     class_lastchar = -1;
2819    
2820 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2821     temporary bit of memory, in case the class contains only 1 character (less
2822     than 256), because in that case the compiled code doesn't use the bit map.
2823     */
2824    
2825     memset(classbits, 0, 32 * sizeof(uschar));
2826    
2827 nigel 77 #ifdef SUPPORT_UTF8
2828     class_utf8 = FALSE; /* No chars >= 256 */
2829 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2830 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2831 nigel 77 #endif
2832    
2833     /* Process characters until ] is reached. By writing this as a "do" it
2834 nigel 93 means that an initial ] is taken as a data character. At the start of the
2835     loop, c contains the first byte of the character. */
2836 nigel 77
2837 nigel 93 if (c != 0) do
2838 nigel 77 {
2839 nigel 93 const uschar *oldptr;
2840    
2841 nigel 77 #ifdef SUPPORT_UTF8
2842     if (utf8 && c > 127)
2843     { /* Braces are required because the */
2844     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2845     }
2846 ph10 309
2847 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2848 ph10 309 data and reset the pointer. This is so that very large classes that
2849 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2850 ph10 309 (which is on the stack). */
2851    
2852 ph10 300 if (lengthptr != NULL)
2853     {
2854     *lengthptr += class_utf8data - class_utf8data_base;
2855 ph10 309 class_utf8data = class_utf8data_base;
2856     }
2857    
2858 nigel 77 #endif
2859    
2860     /* Inside \Q...\E everything is literal except \E */
2861    
2862     if (inescq)
2863     {
2864 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2865 nigel 77 {
2866 nigel 93 inescq = FALSE; /* Reset literal state */
2867     ptr++; /* Skip the 'E' */
2868     continue; /* Carry on with next */
2869 nigel 77 }
2870 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2871 nigel 77 }
2872    
2873     /* Handle POSIX class names. Perl allows a negation extension of the
2874     form [:^name:]. A square bracket that doesn't match the syntax is
2875     treated as a literal. We also recognize the POSIX constructions
2876     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2877     5.6 and 5.8 do. */
2878    
2879 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
2880     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2881     ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2882 nigel 77 {
2883     BOOL local_negate = FALSE;
2884 nigel 87 int posix_class, taboffset, tabopt;
2885 nigel 77 register const uschar *cbits = cd->cbits;
2886 nigel 87 uschar pbits[32];
2887 nigel 77
2888 ph10 391 if (ptr[1] != CHAR_COLON)
2889 nigel 77 {
2890     *errorcodeptr = ERR31;
2891     goto FAILED;
2892     }
2893    
2894     ptr += 2;
2895 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2896 nigel 77 {
2897     local_negate = TRUE;
2898 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2899 nigel 77 ptr++;
2900     }
2901    
2902     posix_class = check_posix_name(ptr, tempptr - ptr);
2903     if (posix_class < 0)
2904     {
2905     *errorcodeptr = ERR30;
2906     goto FAILED;
2907     }
2908    
2909     /* If matching is caseless, upper and lower are converted to
2910     alpha. This relies on the fact that the class table starts with
2911     alpha, lower, upper as the first 3 entries. */
2912    
2913     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2914     posix_class = 0;
2915    
2916 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2917     because we may be adding and subtracting from it, and we don't want to
2918     subtract bits that may be in the main map already. At the end we or the
2919     result into the bit map that is being built. */
2920 nigel 77
2921     posix_class *= 3;
2922 nigel 87
2923     /* Copy in the first table (always present) */
2924    
2925     memcpy(pbits, cbits + posix_class_maps[posix_class],
2926     32 * sizeof(uschar));
2927    
2928     /* If there is a second table, add or remove it as required. */
2929    
2930     taboffset = posix_class_maps[posix_class + 1];
2931     tabopt = posix_class_maps[posix_class + 2];
2932    
2933     if (taboffset >= 0)
2934 nigel 77 {
2935 nigel 87 if (tabopt >= 0)
2936     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2937 nigel 77 else
2938 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2939 nigel 77 }
2940    
2941 nigel 87 /* Now see if we need to remove any special characters. An option
2942     value of 1 removes vertical space and 2 removes underscore. */
2943    
2944     if (tabopt < 0) tabopt = -tabopt;
2945     if (tabopt == 1) pbits[1] &= ~0x3c;
2946     else if (tabopt == 2) pbits[11] &= 0x7f;
2947    
2948     /* Add the POSIX table or its complement into the main table that is
2949     being built and we are done. */
2950    
2951     if (local_negate)
2952     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2953     else
2954     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2955    
2956 nigel 77 ptr = tempptr + 1;
2957     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2958     continue; /* End of POSIX syntax handling */
2959     }
2960    
2961     /* Backslash may introduce a single character, or it may introduce one
2962 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2963     case. Inside a class (and only there) it is treated as backspace.
2964     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2965 ph10 205 to 'or' into the one we are building. We assume they have more than one
2966 nigel 77 character in them, so set class_charcount bigger than one. */
2967    
2968 ph10 391 if (c == CHAR_BACKSLASH)
2969 nigel 77 {
2970 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2971     if (*errorcodeptr != 0) goto FAILED;
2972 nigel 77
2973 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
2974     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
2975     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
2976 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2977     {
2978 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
2979 nigel 77 {
2980     ptr += 2; /* avoid empty string */
2981     }
2982     else inescq = TRUE;
2983     continue;
2984     }
2985 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2986 nigel 77
2987     if (c < 0)
2988     {
2989     register const uschar *cbits = cd->cbits;
2990     class_charcount += 2; /* Greater than 1 is what matters */
2991 nigel 93
2992     /* Save time by not doing this in the pre-compile phase. */
2993    
2994     if (lengthptr == NULL) switch (-c)
2995 nigel 77 {
2996     case ESC_d:
2997     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2998     continue;
2999    
3000     case ESC_D:
3001 ph10 286 should_flip_negation = TRUE;
3002 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3003     continue;
3004    
3005     case ESC_w:
3006     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3007     continue;
3008    
3009     case ESC_W:
3010 ph10 286 should_flip_negation = TRUE;
3011 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3012     continue;
3013    
3014     case ESC_s:
3015     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3016     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3017     continue;
3018    
3019     case ESC_S:
3020 ph10 286 should_flip_negation = TRUE;
3021 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3022     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3023     continue;
3024    
3025 nigel 93 default: /* Not recognized; fall through */
3026     break; /* Need "default" setting to stop compiler warning. */
3027     }
3028    
3029     /* In the pre-compile phase, just do the recognition. */
3030    
3031     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3032     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3033 ph10 180
3034 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3035     they use extra memory. */
3036 ph10 180
3037 ph10 178 if (-c == ESC_h)
3038     {
3039     SETBIT(classbits, 0x09); /* HT */
3040     SETBIT(classbits, 0x20); /* SPACE */
3041 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3042 ph10 178 #ifdef SUPPORT_UTF8
3043     if (utf8)
3044 ph10 180 {
3045 ph10 178 class_utf8 = TRUE;
3046     *class_utf8data++ = XCL_SINGLE;
3047 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3048 ph10 178 *class_utf8data++ = XCL_SINGLE;
3049 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3050     *class_utf8data++ = XCL_RANGE;
3051     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3052     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3053 ph10 178 *class_utf8data++ = XCL_SINGLE;
3054 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3055 ph10 178 *class_utf8data++ = XCL_SINGLE;
3056 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3057 ph10 178 *class_utf8data++ = XCL_SINGLE;
3058 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3059     }
3060     #endif
3061     continue;
3062     }
3063 nigel 93
3064 ph10 178 if (-c == ESC_H)
3065     {
3066     for (c = 0; c < 32; c++)
3067     {
3068     int x = 0xff;
3069     switch (c)
3070 ph10 180 {
3071 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3072     case 0x20/8: x ^= 1 << (0x20%8); break;
3073     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3074     default: break;
3075     }
3076     classbits[c] |= x;
3077 ph10 180 }
3078    
3079 ph10 178 #ifdef SUPPORT_UTF8
3080     if (utf8)
3081 ph10 180 {
3082 ph10 178 class_utf8 = TRUE;
3083 ph10 180 *class_utf8data++ = XCL_RANGE;
3084     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3085     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3086     *class_utf8data++ = XCL_RANGE;
3087     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3088     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3089     *class_utf8data++ = XCL_RANGE;
3090     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3091     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3092     *class_utf8data++ = XCL_RANGE;
3093     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3094     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3095     *class_utf8data++ = XCL_RANGE;
3096     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3097     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3098     *class_utf8data++ = XCL_RANGE;
3099     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3100     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3101     *class_utf8data++ = XCL_RANGE;
3102     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3103     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3104     }
3105     #endif
3106     continue;
3107     }
3108 ph10 178
3109     if (-c == ESC_v)
3110     {
3111     SETBIT(classbits, 0x0a); /* LF */
3112     SETBIT(classbits, 0x0b); /* VT */
3113 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3114     SETBIT(classbits, 0x0d); /* CR */
3115     SETBIT(classbits, 0x85); /* NEL */
3116 ph10 178 #ifdef SUPPORT_UTF8
3117     if (utf8)
3118 ph10 180 {
3119 ph10 178 class_utf8 = TRUE;
3120 ph10 180 *class_utf8data++ = XCL_RANGE;
3121     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3122     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3123     }
3124     #endif
3125     continue;
3126     }
3127 ph10 178
3128     if (-c == ESC_V)
3129     {
3130     for (c = 0; c < 32; c++)
3131     {
3132     int x = 0xff;
3133     switch (c)
3134 ph10 180 {
3135 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3136     x ^= 1 << (0x0b%8);
3137     x ^= 1 << (0x0c%8);
3138 ph10 180 x ^= 1 << (0x0d%8);
3139 ph10 178 break;
3140     case 0x85/8: x ^= 1 << (0x85%8); break;
3141     default: break;
3142     }
3143     classbits[c] |= x;
3144 ph10 180 }
3145    
3146 ph10 178 #ifdef SUPPORT_UTF8
3147     if (utf8)
3148 ph10 180 {
3149 ph10 178 class_utf8 = TRUE;
3150 ph10 180 *class_utf8data++ = XCL_RANGE;
3151     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3152     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3153     *class_utf8data++ = XCL_RANGE;
3154     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3155     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3156     }
3157     #endif
3158     continue;
3159     }
3160 ph10 178
3161 nigel 93 /* We need to deal with \P and \p in both phases. */
3162    
3163 nigel 77 #ifdef SUPPORT_UCP
3164 nigel 93 if (-c == ESC_p || -c == ESC_P)
3165     {
3166     BOOL negated;
3167     int pdata;
3168     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3169     if (ptype < 0) goto FAILED;
3170     class_utf8 = TRUE;
3171     *class_utf8data++ = ((-c == ESC_p) != negated)?
3172     XCL_PROP : XCL_NOTPROP;
3173     *class_utf8data++ = ptype;
3174     *class_utf8data++ = pdata;
3175     class_charcount -= 2; /* Not a < 256 character */
3176 nigel 77 continue;
3177 nigel 93 }
3178 nigel 77 #endif
3179 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3180     strict mode. By default, for compatibility with Perl, they are
3181     treated as literals. */
3182 nigel 77
3183 nigel 93 if ((options & PCRE_EXTRA) != 0)
3184     {
3185     *errorcodeptr = ERR7;
3186     goto FAILED;
3187     }
3188 nigel 77
3189 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3190     c = *ptr; /* Get the final character and fall through */
3191 nigel 77 }
3192    
3193     /* Fall through if we have a single character (c >= 0). This may be
3194 nigel 93 greater than 256 in UTF-8 mode. */
3195 nigel 77
3196     } /* End of backslash handling */
3197    
3198     /* A single character may be followed by '-' to form a range. However,
3199     Perl does not permit ']' to be the end of the range. A '-' character
3200 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3201     entirely. The code for handling \Q and \E is messy. */
3202 nigel 77
3203 nigel 93 CHECK_RANGE:
3204 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3205 nigel 77 {
3206 nigel 93 inescq = FALSE;
3207     ptr += 2;
3208     }
3209    
3210     oldptr = ptr;
3211 ph10 231
3212 ph10 230 /* Remember \r or \n */
3213 ph10 231
3214 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3215 ph10 231
3216 ph10 230 /* Check for range */
3217 nigel 93
3218 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3219 nigel 93 {
3220 nigel 77 int d;
3221     ptr += 2;
3222 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3223 nigel 77
3224 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3225     mode. */
3226    
3227 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3228 nigel 93 {
3229     ptr += 2;
3230 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3231     { ptr += 2; continue; }
3232 nigel 93 inescq = TRUE;
3233     break;
3234     }
3235    
3236 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3237 nigel 93 {
3238     ptr = oldptr;
3239     goto LONE_SINGLE_CHARACTER;
3240     }
3241    
3242 nigel 77 #ifdef SUPPORT_UTF8
3243     if (utf8)
3244     { /* Braces are required because the */
3245     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3246     }
3247     else
3248     #endif
3249     d = *ptr; /* Not UTF-8 mode */
3250    
3251     /* The second part of a range can be a single-character escape, but
3252     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3253     in such circumstances. */
3254    
3255 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3256 nigel 77 {
3257 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3258     if (*errorcodeptr != 0) goto FAILED;
3259 nigel 77
3260 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3261 nigel 93 special means the '-' was literal */
3262 nigel 77
3263     if (d < 0)
3264     {
3265 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3266     else if (d == -ESC_X) d = CHAR_X;
3267     else if (d == -ESC_R) d = CHAR_R; else
3268 nigel 77 {
3269 nigel 93 ptr = oldptr;
3270 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3271     }
3272     }
3273     }
3274    
3275 nigel 93 /* Check that the two values are in the correct order. Optimize
3276     one-character ranges */
3277 nigel 77
3278 nigel 93 if (d < c)
3279     {
3280     *errorcodeptr = ERR8;
3281     goto FAILED;
3282     }
3283    
3284 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3285    
3286 ph10 230 /* Remember \r or \n */
3287 ph10 231
3288 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3289 ph10 231
3290 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3291     matching, we have to use an XCLASS with extra data items. Caseless
3292     matching for characters > 127 is available only if UCP support is
3293     available. */
3294    
3295     #ifdef SUPPORT_UTF8
3296     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3297     {
3298     class_utf8 = TRUE;
3299    
3300     /* With UCP support, we can find the other case equivalents of
3301     the relevant characters. There may be several ranges. Optimize how
3302     they fit with the basic range. */
3303    
3304     #ifdef SUPPORT_UCP
3305     if ((options & PCRE_CASELESS) != 0)
3306     {
3307 nigel 93 unsigned int occ, ocd;
3308     unsigned int cc = c;
3309     unsigned int origd = d;
3310 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3311     {
3312 ph10 180 if (occ >= (unsigned int)c &&
3313     ocd <= (unsigned int)d)
3314 ph10 176 continue; /* Skip embedded ranges */
3315 nigel 77
3316 ph10 180 if (occ < (unsigned int)c &&
3317 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3318 nigel 77 { /* if there is overlap, */
3319     c = occ; /* noting that if occ < c */
3320     continue; /* we can't have ocd > d */
3321     } /* because a subrange is */
3322 ph10 180 if (ocd > (unsigned int)d &&
3323 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3324 nigel 77 { /* the basic range. */
3325     d = ocd;
3326     continue;
3327     }
3328    
3329     if (occ == ocd)
3330     {
3331     *class_utf8data++ = XCL_SINGLE;
3332     }
3333     else
3334     {
3335     *class_utf8data++ = XCL_RANGE;
3336     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3337     }
3338     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3339     }
3340     }
3341     #endif /* SUPPORT_UCP */
3342    
3343     /* Now record the original range, possibly modified for UCP caseless
3344     overlapping ranges. */
3345    
3346     *class_utf8data++ = XCL_RANGE;
3347     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3348     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3349    
3350     /* With UCP support, we are done. Without UCP support, there is no
3351     caseless matching for UTF-8 characters > 127; we can use the bit map
3352     for the smaller ones. */
3353    
3354     #ifdef SUPPORT_UCP
3355     continue; /* With next character in the class */
3356     #else
3357     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3358    
3359     /* Adjust upper limit and fall through to set up the map */
3360    
3361     d = 127;
3362    
3363     #endif /* SUPPORT_UCP */
3364     }
3365     #endif /* SUPPORT_UTF8 */
3366    
3367     /* We use the bit map for all cases when not in UTF-8 mode; else
3368     ranges that lie entirely within 0-127 when there is UCP support; else
3369     for partial ranges without UCP support. */
3370    
3371 nigel 93 class_charcount += d - c + 1;
3372     class_lastchar = d;
3373    
3374     /* We can save a bit of time by skipping this in the pre-compile. */
3375    
3376     if (lengthptr == NULL) for (; c <= d; c++)
3377 nigel 77 {
3378     classbits[c/8] |= (1 << (c&7));
3379     if ((options & PCRE_CASELESS) != 0)
3380     {
3381     int uc = cd->fcc[c]; /* flip case */
3382     classbits[uc/8] |= (1 << (uc&7));
3383     }
3384     }
3385    
3386     continue; /* Go get the next char in the class */
3387     }
3388    
3389     /* Handle a lone single character - we can get here for a normal
3390     non-escape char, or after \ that introduces a single character or for an
3391     apparent range that isn't. */
3392    
3393     LONE_SINGLE_CHARACTER:
3394 ph10 231
3395 nigel 77 /* Handle a character that cannot go in the bit map */
3396    
3397     #ifdef SUPPORT_UTF8
3398     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3399     {
3400     class_utf8 = TRUE;
3401     *class_utf8data++ = XCL_SINGLE;
3402     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3403    
3404     #ifdef SUPPORT_UCP
3405     if ((options & PCRE_CASELESS) != 0)
3406     {
3407 nigel 93 unsigned int othercase;
3408 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3409 nigel 77 {
3410     *class_utf8data++ = XCL_SINGLE;
3411     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3412     }
3413     }
3414     #endif /* SUPPORT_UCP */
3415    
3416     }
3417     else
3418     #endif /* SUPPORT_UTF8 */
3419    
3420     /* Handle a single-byte character */
3421     {
3422     classbits[c/8] |= (1 << (c&7));
3423     if ((options & PCRE_CASELESS) != 0)
3424     {
3425     c = cd->fcc[c]; /* flip case */
3426     classbits[c/8] |= (1 << (c&7));
3427     }
3428     class_charcount++;
3429     class_lastchar = c;
3430     }
3431     }
3432    
3433 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3434 nigel 77
3435 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3436 nigel 77
3437 nigel 93 if (c == 0) /* Missing terminating ']' */
3438     {
3439     *errorcodeptr = ERR6;
3440     goto FAILED;
3441     }
3442 ph10 231
3443    
3444 ph10 230 /* This code has been disabled because it would mean that \s counts as
3445     an explicit \r or \n reference, and that's not really what is wanted. Now
3446     we set the flag only if there is a literal "\r" or "\n" in the class. */
3447 ph10 227
3448 ph10 230 #if 0
3449 ph10 226 /* Remember whether \r or \n are in this class */
3450 ph10 227
3451 ph10 226 if (negate_class)
3452     {
3453 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3454 ph10 226 }
3455     else
3456     {
3457 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3458 ph10 227 }
3459 ph10 230 #endif
3460 ph10 227
3461 ph10 231
3462 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3463 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3464     use of \p or \P, in other words, no use of any XCLASS features, we can
3465     optimize.
3466    
3467 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3468     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3469     operate on single-bytes only. This is an historical hangover. Maybe one day
3470     we can tidy these opcodes to handle multi-byte characters.
3471 nigel 77
3472     The optimization throws away the bit map. We turn the item into a
3473     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3474     that OP_NOT does not support multibyte characters. In the positive case, it
3475     can cause firstbyte to be set. Otherwise, there can be no first char if
3476     this item is first, whatever repeat count may follow. In the case of
3477     reqbyte, save the previous value for reinstating. */
3478    
3479     #ifdef SUPPORT_UTF8
3480 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3481 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3482 nigel 77 #else
3483     if (class_charcount == 1)
3484     #endif
3485     {
3486     zeroreqbyte = reqbyte;
3487    
3488     /* The OP_NOT opcode works on one-byte characters only. */
3489    
3490     if (negate_class)
3491     {
3492     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3493     zerofirstbyte = firstbyte;
3494     *code++ = OP_NOT;
3495     *code++ = class_lastchar;
3496     break;
3497     }
3498    
3499     /* For a single, positive character, get the value into mcbuffer, and
3500     then we can handle this with the normal one-character code. */
3501    
3502     #ifdef SUPPORT_UTF8
3503     if (utf8 && class_lastchar > 127)
3504     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3505     else
3506     #endif
3507     {
3508     mcbuffer[0] = class_lastchar;
3509     mclength = 1;
3510     }
3511     goto ONE_CHAR;
3512     } /* End of 1-char optimization */
3513    
3514     /* The general case - not the one-char optimization. If this is the first
3515     thing in the branch, there can be no first char setting, whatever the
3516     repeat count. Any reqbyte setting must remain unchanged after any kind of
3517     repeat. */
3518    
3519     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3520     zerofirstbyte = firstbyte;
3521     zeroreqbyte = reqbyte;
3522    
3523     /* If there are characters with values > 255, we have to compile an
3524 ph10 286 extended class, with its own opcode, unless there was a negated special
3525     such as \S in the class, because in that case all characters > 255 are in
3526     the class, so any that were explicitly given as well can be ignored. If
3527 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3528     characters < 256, we can omit the bitmap in the actual compiled code. */
3529 nigel 77
3530     #ifdef SUPPORT_UTF8
3531 ph10 264 if (class_utf8 && !should_flip_negation)
3532 nigel 77 {
3533     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3534     *code++ = OP_XCLASS;
3535     code += LINK_SIZE;
3536     *code = negate_class? XCL_NOT : 0;
3537    
3538 nigel 93 /* If the map is required, move up the extra data to make room for it;
3539     otherwise just move the code pointer to the end of the extra data. */
3540 nigel 77
3541     if (class_charcount > 0)
3542     {
3543     *code++ |= XCL_MAP;
3544 nigel 93 memmove(code + 32, code, class_utf8data - code);
3545 nigel 77 memcpy(code, classbits, 32);
3546 nigel 93 code = class_utf8data + 32;
3547 nigel 77 }
3548 nigel 93 else code = class_utf8data;
3549 nigel 77
3550     /* Now fill in the complete length of the item */
3551    
3552     PUT(previous, 1, code - previous);
3553     break; /* End of class handling */
3554     }
3555     #endif
3556    
3557 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3558     OP_NCLASS, depending on whether the whole class was negated and whether
3559     there were negative specials such as \S in the class. Then copy the 32-byte
3560 ph10 264 map into the code vector, negating it if necessary. */
3561 ph10 286
3562 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3563 nigel 77 if (negate_class)
3564     {
3565 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3566     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3567 nigel 77 }
3568     else
3569     {
3570     memcpy(code, classbits, 32);
3571     }
3572     code += 32;
3573     break;
3574    
3575 nigel 93
3576     /* ===================================================================*/
3577 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3578     has been tested above. */
3579    
3580 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3581 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3582     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3583     if (*errorcodeptr != 0) goto FAILED;
3584     goto REPEAT;
3585    
3586 ph10 391 case CHAR_ASTERISK:
3587 nigel 77 repeat_min = 0;
3588     repeat_max = -1;
3589     goto REPEAT;
3590    
3591 ph10 391 case CHAR_PLUS:
3592 nigel 77 repeat_min = 1;
3593     repeat_max = -1;
3594     goto REPEAT;
3595    
3596 ph10 391 case CHAR_QUESTION_MARK:
3597 nigel 77 repeat_min = 0;
3598     repeat_max = 1;
3599    
3600     REPEAT:
3601     if (previous == NULL)
3602     {
3603     *errorcodeptr = ERR9;
3604     goto FAILED;
3605     }
3606    
3607     if (repeat_min == 0)
3608     {
3609     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3610     reqbyte = zeroreqbyte; /* Ditto */
3611     }
3612    
3613     /* Remember whether this is a variable length repeat */
3614    
3615     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3616    
3617     op_type = 0; /* Default single-char op codes */
3618     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3619    
3620     /* Save start of previous item, in case we have to move it up to make space
3621     for an inserted OP_ONCE for the additional '+' extension. */
3622    
3623     tempcode = previous;
3624    
3625     /* If the next character is '+', we have a possessive quantifier. This
3626     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3627     If the next character is '?' this is a minimizing repeat, by default,
3628     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3629     repeat type to the non-default. */
3630    
3631 ph10 391 if (ptr[1] == CHAR_PLUS)
3632 nigel 77 {
3633     repeat_type = 0; /* Force greedy */
3634     possessive_quantifier = TRUE;
3635     ptr++;
3636     }
3637 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3638 nigel 77 {
3639     repeat_type = greedy_non_default;
3640     ptr++;
3641     }
3642     else repeat_type = greedy_default;
3643    
3644     /* If previous was a character match, abolish the item and generate a
3645     repeat item instead. If a char item has a minimum of more than one, ensure
3646     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3647     the first thing in a branch because the x will have gone into firstbyte
3648     instead. */
3649    
3650     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3651     {
3652     /* Deal with UTF-8 characters that take up more than one byte. It's
3653     easier to write this out separately than try to macrify it. Use c to
3654     hold the length of the character in bytes, plus 0x80 to flag that it's a
3655     length rather than a small character. */
3656    
3657     #ifdef SUPPORT_UTF8
3658     if (utf8 && (code[-1] & 0x80) != 0)
3659     {
3660     uschar *lastchar = code - 1;
3661     while((*lastchar & 0xc0) == 0x80) lastchar--;
3662     c = code - lastchar; /* Length of UTF-8 character */
3663     memcpy(utf8_char, lastchar, c); /* Save the char */
3664     c |= 0x80; /* Flag c as a length */
3665     }
3666     else
3667     #endif
3668    
3669     /* Handle the case of a single byte - either with no UTF8 support, or
3670     with UTF-8 disabled, or for a UTF-8 character < 128. */
3671    
3672     {
3673     c = code[-1];
3674     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3675     }
3676    
3677 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3678     the line is something that cannot possibly match this character. If so,
3679     automatically possessifying this item gains some performance in the case
3680     where the match fails. */
3681    
3682     if (!possessive_quantifier &&
3683     repeat_max < 0 &&
3684     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3685     options, cd))
3686     {
3687     repeat_type = 0; /* Force greedy */
3688     possessive_quantifier = TRUE;
3689     }
3690    
3691 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3692     }
3693    
3694     /* If previous was a single negated character ([^a] or similar), we use
3695     one of the special opcodes, replacing it. The code is shared with single-
3696     character repeats by setting op_type to add a suitable offset into
3697 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3698     currently used only for single-byte chars. */
3699 nigel 77
3700     else if (*previous == OP_NOT)
3701     {
3702     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3703     c = previous[1];
3704 nigel 93 if (!possessive_quantifier &&
3705     repeat_max < 0 &&
3706     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3707     {
3708     repeat_type = 0; /* Force greedy */
3709     possessive_quantifier = TRUE;
3710     }
3711 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3712     }
3713    
3714     /* If previous was a character type match (\d or similar), abolish it and
3715     create a suitable repeat item. The code is shared with single-character
3716     repeats by setting op_type to add a suitable offset into repeat_type. Note
3717     that the Unicode property types will be present only when SUPPORT_UCP is
3718     defined, but we don't wrap the little bits of code here because it just
3719     makes it horribly messy. */
3720    
3721     else if (*previous < OP_EODN)
3722     {
3723     uschar *oldcode;
3724 nigel 87 int prop_type, prop_value;
3725 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3726     c = *previous;
3727    
3728 nigel 93 if (!possessive_quantifier &&
3729     repeat_max < 0 &&
3730     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3731     {
3732     repeat_type = 0; /* Force greedy */
3733     possessive_quantifier = TRUE;
3734     }
3735    
3736 nigel 77 OUTPUT_SINGLE_REPEAT:
3737 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3738     {
3739     prop_type = previous[1];
3740     prop_value = previous[2];
3741     }
3742     else prop_type = prop_value = -1;
3743 nigel 77
3744     oldcode = code;
3745     code = previous; /* Usually overwrite previous item */
3746    
3747     /* If the maximum is zero then the minimum must also be zero; Perl allows
3748     this case, so we do too - by simply omitting the item altogether. */
3749    
3750     if (repeat_max == 0) goto END_REPEAT;
3751    
3752     /* All real repeats make it impossible to handle partial matching (maybe
3753     one day we will be able to remove this restriction). */
3754    
3755 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3756 nigel 77
3757     /* Combine the op_type with the repeat_type */
3758    
3759     repeat_type += op_type;
3760    
3761     /* A minimum of zero is handled either as the special case * or ?, or as
3762     an UPTO, with the maximum given. */
3763    
3764     if (repeat_min == 0)
3765     {
3766     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3767     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3768     else
3769     {
3770     *code++ = OP_UPTO + repeat_type;
3771     PUT2INC(code, 0, repeat_max);
3772     }
3773     }
3774    
3775     /* A repeat minimum of 1 is optimized into some special cases. If the
3776 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3777 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3778     one less than the maximum. */
3779    
3780     else if (repeat_min == 1)
3781     {
3782     if (repeat_max == -1)
3783     *code++ = OP_PLUS + repeat_type;
3784     else
3785     {
3786     code = oldcode; /* leave previous item in place */
3787     if (repeat_max == 1) goto END_REPEAT;
3788     *code++ = OP_UPTO + repeat_type;
3789     PUT2INC(code, 0, repeat_max - 1);
3790     }
3791     }
3792    
3793     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3794     handled as an EXACT followed by an UPTO. */
3795    
3796     else
3797     {
3798     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3799     PUT2INC(code, 0, repeat_min);
3800    
3801     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3802     we have to insert the character for the previous code. For a repeated
3803 nigel 87 Unicode property match, there are two extra bytes that define the
3804 nigel 77 required property. In UTF-8 mode, long characters have their length in
3805     c, with the 0x80 bit as a flag. */
3806    
3807     if (repeat_max < 0)
3808     {
3809     #ifdef SUPPORT_UTF8
3810     if (utf8 && c >= 128)
3811     {
3812     memcpy(code, utf8_char, c & 7);
3813     code += c & 7;
3814     }
3815     else
3816     #endif
3817     {
3818     *code++ = c;
3819 nigel 87 if (prop_type >= 0)
3820     {
3821     *code++ = prop_type;
3822     *code++ = prop_value;
3823     }
3824 nigel 77 }
3825     *code++ = OP_STAR + repeat_type;
3826     }
3827    
3828     /* Else insert an UPTO if the max is greater than the min, again
3829 nigel 93 preceded by the character, for the previously inserted code. If the
3830     UPTO is just for 1 instance, we can use QUERY instead. */
3831 nigel 77
3832     else if (repeat_max != repeat_min)
3833     {
3834     #ifdef SUPPORT_UTF8
3835     if (utf8 && c >= 128)
3836     {
3837     memcpy(code, utf8_char, c & 7);
3838     code += c & 7;
3839     }
3840     else
3841     #endif
3842     *code++ = c;
3843 nigel 87 if (prop_type >= 0)
3844     {
3845     *code++ = prop_type;
3846     *code++ = prop_value;
3847     }
3848 nigel 77 repeat_max -= repeat_min;
3849 nigel 93
3850     if (repeat_max == 1)
3851     {
3852     *code++ = OP_QUERY + repeat_type;
3853     }
3854     else
3855     {
3856     *code++ = OP_UPTO + repeat_type;
3857     PUT2INC(code, 0, repeat_max);
3858     }
3859 nigel 77 }
3860     }
3861    
3862     /* The character or character type itself comes last in all cases. */
3863    
3864     #ifdef SUPPORT_UTF8
3865     if (utf8 && c >= 128)
3866     {
3867     memcpy(code, utf8_char, c & 7);
3868     code += c & 7;
3869     }
3870     else
3871     #endif
3872     *code++ = c;
3873    
3874 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3875     define the required property. */
3876 nigel 77
3877     #ifdef SUPPORT_UCP
3878 nigel 87 if (prop_type >= 0)
3879     {
3880     *code++ = prop_type;
3881     *code++ = prop_value;
3882     }
3883 nigel 77 #endif
3884     }
3885    
3886     /* If previous was a character class or a back reference, we put the repeat
3887     stuff after it, but just skip the item if the repeat was {0,0}. */
3888    
3889     else if (*previous == OP_CLASS ||
3890     *previous == OP_NCLASS ||
3891     #ifdef SUPPORT_UTF8
3892     *previous == OP_XCLASS ||
3893     #endif
3894     *previous == OP_REF)
3895     {
3896     if (repeat_max == 0)
3897     {
3898     code = previous;
3899     goto END_REPEAT;
3900     }
3901    
3902     /* All real repeats make it impossible to handle partial matching (maybe
3903     one day we will be able to remove this restriction). */
3904    
3905 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3906 nigel 77
3907     if (repeat_min == 0 && repeat_max == -1)
3908     *code++ = OP_CRSTAR + repeat_type;
3909     else if (repeat_min == 1 && repeat_max == -1)
3910     *code++ = OP_CRPLUS + repeat_type;
3911     else if (repeat_min == 0 && repeat_max == 1)
3912     *code++ = OP_CRQUERY + repeat_type;
3913     else
3914     {
3915     *code++ = OP_CRRANGE + repeat_type;
3916     PUT2INC(code, 0, repeat_min);
3917     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3918     PUT2INC(code, 0, repeat_max);
3919     }
3920     }
3921    
3922     /* If previous was a bracket group, we may have to replicate it in certain
3923     cases. */
3924    
3925 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3926     *previous == OP_ONCE || *previous == OP_COND)
3927 nigel 77 {
3928     register int i;
3929     int ketoffset = 0;
3930     int len = code - previous;
3931     uschar *bralink = NULL;
3932    
3933 nigel 93 /* Repeating a DEFINE group is pointless */
3934    
3935     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3936     {
3937     *errorcodeptr = ERR55;
3938     goto FAILED;
3939     }
3940    
3941 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3942     by scanning through from the start, and compute the offset back to it
3943     from the current code pointer. There may be an OP_OPT setting following
3944     the final KET, so we can't find the end just by going back from the code
3945     pointer. */
3946    
3947     if (repeat_max == -1)
3948     {
3949     register uschar *ket = previous;
3950     do ket += GET(ket, 1); while (*ket != OP_KET);
3951     ketoffset = code - ket;
3952     }
3953    
3954     /* The case of a zero minimum is special because of the need to stick
3955     OP_BRAZERO in front of it, and because the group appears once in the
3956     data, whereas in other cases it appears the minimum number of times. For
3957     this reason, it is simplest to treat this case separately, as otherwise
3958     the code gets far too messy. There are several special subcases when the
3959     minimum is zero. */
3960    
3961     if (repeat_min == 0)
3962     {
3963 ph10 335 /* If the maximum is also zero, we used to just omit the group from the
3964     output altogether, like this:
3965 nigel 77
3966 ph10 335 ** if (repeat_max == 0)
3967     ** {
3968     ** code = previous;
3969     ** goto END_REPEAT;
3970     ** }
3971 nigel 77
3972 ph10 345 However, that fails when a group is referenced as a subroutine from
3973     elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3974     so that it is skipped on execution. As we don't have a list of which
3975     groups are referenced, we cannot do this selectively.
3976    
3977 ph10 335 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3978     and do no more at this point. However, we do need to adjust any
3979     OP_RECURSE calls inside the group that refer to the group itself or any
3980     internal or forward referenced group, because the offset is from the
3981     start of the whole regex. Temporarily terminate the pattern while doing
3982     this. */
3983 nigel 77
3984 ph10 335 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3985 nigel 77 {
3986     *code = OP_END;
3987 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3988 nigel 77 memmove(previous+1, previous, len);
3989     code++;
3990 ph10 335 if (repeat_max == 0)
3991     {
3992     *previous++ = OP_SKIPZERO;
3993     goto END_REPEAT;
3994 ph10 345 }
3995 nigel 77 *previous++ = OP_BRAZERO + repeat_type;
3996     }
3997    
3998     /* If the maximum is greater than 1 and limited, we have to replicate
3999     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4000     The first one has to be handled carefully because it's the original
4001     copy, which has to be moved up. The remainder can be handled by code
4002     that is common with the non-zero minimum case below. We have to
4003     adjust the value of repeat_max, since one less copy is required. Once
4004     again, we may have to adjust any OP_RECURSE calls inside the group. */
4005    
4006     else
4007     {
4008     int offset;
4009     *code = OP_END;
4010 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4011 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
4012     code += 2 + LINK_SIZE;
4013     *previous++ = OP_BRAZERO + repeat_type;
4014     *previous++ = OP_BRA;
4015    
4016     /* We chain together the bracket offset fields that have to be
4017     filled in later when the ends of the brackets are reached. */
4018    
4019     offset = (bralink == NULL)? 0 : previous - bralink;
4020     bralink = previous;
4021     PUTINC(previous, 0, offset);
4022     }
4023    
4024     repeat_max--;
4025     }
4026    
4027     /* If the minimum is greater than zero, replicate the group as many
4028     times as necessary, and adjust the maximum to the number of subsequent
4029     copies that we need. If we set a first char from the group, and didn't
4030 nigel 93 set a required char, copy the latter from the former. If there are any
4031     forward reference subroutine calls in the group, there will be entries on
4032     the workspace list; replicate these with an appropriate increment. */
4033 nigel 77
4034     else
4035     {
4036     if (repeat_min > 1)
4037     {
4038 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
4039 ph10 202 just adjust the length as if we had. Do some paranoid checks for
4040     potential integer overflow. */
4041 nigel 93
4042     if (lengthptr != NULL)
4043 ph10 202 {
4044     int delta = (repeat_min - 1)*length_prevgroup;
4045     if ((double)(repeat_min - 1)*(double)length_prevgroup >
4046     (double)INT_MAX ||
4047     OFLOW_MAX - *lengthptr < delta)
4048     {
4049     *errorcodeptr = ERR20;
4050     goto FAILED;
4051     }
4052     *lengthptr += delta;
4053     }
4054 nigel 93
4055     /* This is compiling for real */
4056    
4057     else
4058 nigel 77