/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 503 - (hide annotations) (download)
Sun Mar 7 17:35:52 2010 UTC (3 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 222952 byte(s)
Fix incorrect compile time error for certain types of recursive patterns.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. It sets bit number b in
the byte array a: byte b/8, bit b%8 counting from the least significant bit.
Both arguments and the whole expansion are parenthesized so that expression
arguments (for example "base + i") expand with the intended precedence. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 nigel 93
96 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97     are simple data values; negative values are for special things like \d and so
98     on. Zero means further processing is needed (for things like \x), or the escape
99     is invalid. */
100    
101 ph10 391 #ifndef EBCDIC
102    
103     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104 ph10 392 in UTF-8 mode. */
105 ph10 391
106 ph10 392 static const short int escapes[] = {
107 ph10 391 0, 0,
108     0, 0,
109 ph10 392 0, 0,
110     0, 0,
111     0, 0,
112 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
113 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
114 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
115 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
116     -ESC_B, -ESC_C,
117     -ESC_D, -ESC_E,
118     0, -ESC_G,
119     -ESC_H, 0,
120     0, -ESC_K,
121 ph10 391 0, 0,
122 ph10 392 0, 0,
123 ph10 391 -ESC_P, -ESC_Q,
124     -ESC_R, -ESC_S,
125 ph10 392 0, 0,
126     -ESC_V, -ESC_W,
127     -ESC_X, 0,
128     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
129 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
130 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
131 ph10 391 CHAR_GRAVE_ACCENT, 7,
132 ph10 392 -ESC_b, 0,
133     -ESC_d, ESC_e,
134 ph10 391 ESC_f, 0,
135     -ESC_h, 0,
136 ph10 392 0, -ESC_k,
137 ph10 391 0, 0,
138     ESC_n, 0,
139 ph10 392 -ESC_p, 0,
140     ESC_r, -ESC_s,
141 ph10 391 ESC_tee, 0,
142 ph10 392 -ESC_v, -ESC_w,
143     0, 0,
144 ph10 391 -ESC_z
145 nigel 77 };
146    
147 ph10 392 #else
148 ph10 391
149     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150    
151 nigel 77 static const short int escapes[] = {
152     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
153     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
154     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
155     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
156     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
157     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
158     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
159     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
160 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
161 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
162 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
163 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
164 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
165     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
166     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
167     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
168 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
169 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
170 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
171 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
172 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
173     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
174     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
175     };
176     #endif
177    
178    
179 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180     searched linearly. Put all the names into a single string, in order to reduce
181 ph10 392 the number of relocations when a shared library is dynamically linked. The
182     string is built from string macros so that it works in UTF-8 mode on EBCDIC
183 ph10 391 platforms. */
184 ph10 210
185     typedef struct verbitem {
186     int len;
187     int op;
188 ph10 211 } verbitem;
189 ph10 210
190 ph10 240 static const char verbnames[] =
191 ph10 391 STRING_ACCEPT0
192     STRING_COMMIT0
193     STRING_F0
194     STRING_FAIL0
195     STRING_PRUNE0
196     STRING_SKIP0
197     STRING_THEN;
198 ph10 240
199 ph10 327 static const verbitem verbs[] = {
200 ph10 240 { 6, OP_ACCEPT },
201     { 6, OP_COMMIT },
202     { 1, OP_FAIL },
203     { 4, OP_FAIL },
204     { 5, OP_PRUNE },
205     { 4, OP_SKIP },
206     { 4, OP_THEN }
207 ph10 210 };
208    
209 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210 ph10 210
211    
212 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
213     now all in a single string, to reduce the number of relocations when a shared
214 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
215     length entry. The first three must be alpha, lower, upper, as this is assumed
216     for handling case independence. */
217 nigel 77
218 ph10 240 static const char posix_names[] =
219 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222     STRING_word0 STRING_xdigit;
223 nigel 77
224     static const uschar posix_name_lengths[] = {
225     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
226    
227 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
228     base map, with an optional addition or removal of another map. Then, for some
229     classes, there is some additional tweaking: for [:blank:] the vertical space
230     characters are removed, and for [:alpha:] and [:alnum:] the underscore
231     character is removed. The triples in the table consist of the base map offset,
232     second map offset or -1 if no second map, and a non-negative value for map
233     addition or a negative value for map subtraction (if there are two maps). The
234     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
235     remove vertical space characters, 2 => remove underscore. */
236 nigel 77
237     static const int posix_class_maps[] = {
238 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
239     cbit_lower, -1, 0, /* lower */
240     cbit_upper, -1, 0, /* upper */
241     cbit_word, -1, 2, /* alnum - word without underscore */
242     cbit_print, cbit_cntrl, 0, /* ascii */
243     cbit_space, -1, 1, /* blank - a GNU extension */
244     cbit_cntrl, -1, 0, /* cntrl */
245     cbit_digit, -1, 0, /* digit */
246     cbit_graph, -1, 0, /* graph */
247     cbit_print, -1, 0, /* print */
248     cbit_punct, -1, 0, /* punct */
249     cbit_space, -1, 0, /* space */
250     cbit_word, -1, 0, /* word - a Perl extension */
251     cbit_xdigit,-1, 0 /* xdigit */
252 nigel 77 };
253    
254    
255 nigel 93 #define STRING(a) # a
256     #define XSTRING(s) STRING(s)
257    
258 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
259 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
260     they are documented. Always add a new error instead. Messages marked DEAD below
261 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
262     the number of relocations needed when a shared library is loaded dynamically,
263     it is now one long string. We cannot use a table of offsets, because the
264     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265     simply count through to the one we want - this isn't a performance issue
266 ph10 499 because these strings are used only when there is a compilation error.
267 nigel 77
268 ph10 499 Each substring ends with \0 to insert a null character. This includes the final
269     substring, so that the whole string ends with \0\0, which can be detected when
270     counting through. */
271    
272 ph10 240 static const char error_texts[] =
273     "no error\0"
274     "\\ at end of pattern\0"
275     "\\c at end of pattern\0"
276     "unrecognized character follows \\\0"
277     "numbers out of order in {} quantifier\0"
278 nigel 77 /* 5 */
279 ph10 240 "number too big in {} quantifier\0"
280     "missing terminating ] for character class\0"
281     "invalid escape sequence in character class\0"
282     "range out of order in character class\0"
283     "nothing to repeat\0"
284 nigel 77 /* 10 */
285 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
286     "internal error: unexpected repeat\0"
287 ph10 269 "unrecognized character after (? or (?-\0"
288 ph10 240 "POSIX named classes are supported only within a class\0"
289     "missing )\0"
290 nigel 77 /* 15 */
291 ph10 240 "reference to non-existent subpattern\0"
292     "erroffset passed as NULL\0"
293     "unknown option bit(s) set\0"
294     "missing ) after comment\0"
295     "parentheses nested too deeply\0" /** DEAD **/
296 nigel 77 /* 20 */
297 ph10 240 "regular expression is too large\0"
298     "failed to get memory\0"
299     "unmatched parentheses\0"
300     "internal error: code overflow\0"
301     "unrecognized character after (?<\0"
302 nigel 77 /* 25 */
303 ph10 240 "lookbehind assertion is not fixed length\0"
304     "malformed number or name after (?(\0"
305     "conditional group contains more than two branches\0"
306     "assertion expected after (?(\0"
307     "(?R or (?[+-]digits must be followed by )\0"
308 nigel 77 /* 30 */
309 ph10 240 "unknown POSIX class name\0"
310     "POSIX collating elements are not supported\0"
311     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
312     "spare error\0" /** DEAD **/
313     "character value in \\x{...} sequence is too large\0"
314 nigel 77 /* 35 */
315 ph10 240 "invalid condition (?(0)\0"
316     "\\C not allowed in lookbehind assertion\0"
317     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
318     "number after (?C is > 255\0"
319     "closing ) for (?C expected\0"
320 nigel 77 /* 40 */
321 ph10 240 "recursive call could loop indefinitely\0"
322     "unrecognized character after (?P\0"
323     "syntax error in subpattern name (missing terminator)\0"
324     "two named subpatterns have the same name\0"
325     "invalid UTF-8 string\0"
326 nigel 77 /* 45 */
327 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
328     "malformed \\P or \\p sequence\0"
329     "unknown property name after \\P or \\p\0"
330     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
331     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
332 nigel 91 /* 50 */
333 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
334     "octal value is greater than \\377 (not in UTF-8 mode)\0"
335     "internal error: overran compiling workspace\0"
336     "internal error: previously-checked referenced subpattern not found\0"
337     "DEFINE group contains more than one branch\0"
338 nigel 93 /* 55 */
339 ph10 240 "repeating a DEFINE group is not allowed\0"
340     "inconsistent NEWLINE options\0"
341 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
342     "a numbered reference must not be zero\0"
343 ph10 240 "(*VERB) with an argument is not supported\0"
344 ph10 211 /* 60 */
345 ph10 240 "(*VERB) not recognized\0"
346 ph10 268 "number is too big\0"
347 ph10 272 "subpattern name expected\0"
348 ph10 336 "digit expected after (?+\0"
349 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
350     /* 65 */
351 ph10 499 "different names for subpatterns of the same number are not allowed\0";
352 nigel 77
353     /* Table to identify digits and hex digits. This is used when compiling
354     patterns. Note that the tables in chartables are dependent on the locale, and
355     may mark arbitrary characters as digits - but the PCRE compiling code expects
356     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
357     a private table here. It costs 256 bytes, but it is a lot faster than doing
358     character value tests (at least in some simple cases I timed), and in some
359     applications one wants PCRE to compile efficiently as well as match
360     efficiently.
361    
362     For convenience, we use the same bit definitions as in chartables:
363    
364     0x04 decimal digit
365     0x08 hexadecimal digit
366    
367     Then we can use ctype_digit and ctype_xdigit in the code. */
368    
369 ph10 392 #ifndef EBCDIC
370 ph10 391
371 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
372 ph10 391 UTF-8 mode. */
373    
374 nigel 77 static const unsigned char digitab[] =
375     {
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
382     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
383     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
384     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
388     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
406     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
408    
409 ph10 392 #else
410 ph10 391
411     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
412    
413 nigel 77 static const unsigned char digitab[] =
414     {
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
423     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
426 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
427 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
431     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
439     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
444     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
445     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
446     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
447    
448     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
449     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
450     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
451     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
453     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
457     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
458     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
460 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
461 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
462     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
465     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
466     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
467     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
468     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
469     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
471     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
472     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
473     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
474     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
475     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
476     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
477     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
478     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
479     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
480     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
481     #endif
482    
483    
484     /* Forward declaration to allow mutual recursion */
485    
486     static BOOL
487 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
488 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
489 nigel 77
490    
491    
492     /*************************************************
493 ph10 240 * Find an error text *
494     *************************************************/
495    
496 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
497     some of the text is of unknown length, we can't use a table of offsets.
498     Instead, just count through the strings. This is not a performance issue
499 ph10 240 because it happens only when there has been a compilation error.
500    
501     Argument: the error number
502     Returns: pointer to the error string
503     */
504    
505     static const char *
506     find_error_text(int n)
507     {
508     const char *s = error_texts;
509 ph10 499 for (; n > 0; n--)
510     {
511     while (*s++ != 0) {};
512     if (*s == 0) return "Error text not found (please report)";
513     }
514 ph10 240 return s;
515     }
516    
517    
518     /*************************************************
519 nigel 77 * Handle escapes *
520     *************************************************/
521    
522     /* This function is called when a \ has been encountered. It either returns a
523     positive value for a simple escape such as \n, or a negative value which
524 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
525     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
526     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
527     ptr is pointing at the \. On exit, it is on the final character of the escape
528     sequence.
529 nigel 77
530     Arguments:
531     ptrptr points to the pattern position pointer
532     errorcodeptr points to the errorcode variable
533     bracount number of previous extracting brackets
534     options the options bits
535     isclass TRUE if inside a character class
536    
537     Returns: zero or positive => a data character
538     negative => a special escape sequence
539 ph10 213 on error, errorcodeptr is set
540 nigel 77 */
541    
542     static int
543     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
544     int options, BOOL isclass)
545     {
546 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
547     const uschar *ptr = *ptrptr + 1;
548 nigel 77 int c, i;
549    
550 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
551     ptr--; /* Set pointer back to the last byte */
552    
553 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
554    
555     if (c == 0) *errorcodeptr = ERR1;
556    
557 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
558     in a table. A non-zero result is something that can be returned immediately.
559 nigel 77 Otherwise further processing may be required. */
560    
561 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
562     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
563     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
564 nigel 77
565 ph10 97 #else /* EBCDIC coding */
566 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
567 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
568     #endif
569    
570     /* Escapes that need further processing, or are illegal. */
571    
572     else
573     {
574     const uschar *oldptr;
575 nigel 93 BOOL braced, negated;
576    
577 nigel 77 switch (c)
578     {
579     /* A number of Perl escapes are not handled by PCRE. We give an explicit
580     error. */
581    
582 ph10 391 case CHAR_l:
583     case CHAR_L:
584     case CHAR_N:
585     case CHAR_u:
586     case CHAR_U:
587 nigel 77 *errorcodeptr = ERR37;
588     break;
589    
590 ph10 333 /* \g must be followed by one of a number of specific things:
591 ph10 345
592 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
593     backreference. If negative, it is a relative backreference. This is a Perl
594     5.10 feature.
595 ph10 345
596 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
597     is part of Perl's movement towards a unified syntax for back references. As
598     this is synonymous with \k{name}, we fudge it up by pretending it really
599     was \k.
600 ph10 345
601     (3) For Oniguruma compatibility we also support \g followed by a name or a
602     number either in angle brackets or in single quotes. However, these are
603     (possibly recursive) subroutine calls, _not_ backreferences. Just return
604 ph10 333 the -ESC_g code (cf \k). */
605 nigel 93
606 ph10 391 case CHAR_g:
607     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
608 ph10 333 {
609     c = -ESC_g;
610 ph10 345 break;
611     }
612 ph10 333
613     /* Handle the Perl-compatible cases */
614 ph10 345
615 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
616 nigel 93 {
617 ph10 171 const uschar *p;
618 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
619     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
620     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
621 ph10 171 {
622     c = -ESC_k;
623     break;
624 ph10 172 }
625 nigel 93 braced = TRUE;
626     ptr++;
627     }
628     else braced = FALSE;
629    
630 ph10 391 if (ptr[1] == CHAR_MINUS)
631 nigel 93 {
632     negated = TRUE;
633     ptr++;
634     }
635     else negated = FALSE;
636    
637     c = 0;
638     while ((digitab[ptr[1]] & ctype_digit) != 0)
639 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
640 ph10 220
641 ph10 333 if (c < 0) /* Integer overflow */
642 ph10 213 {
643     *errorcodeptr = ERR61;
644     break;
645 ph10 220 }
646 ph10 345
647 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
648 nigel 93 {
649     *errorcodeptr = ERR57;
650 ph10 213 break;
651 nigel 93 }
652 ph10 345
653 ph10 333 if (c == 0)
654     {
655     *errorcodeptr = ERR58;
656     break;
657 ph10 345 }
658 nigel 93
659     if (negated)
660     {
661     if (c > bracount)
662     {
663     *errorcodeptr = ERR15;
664 ph10 213 break;
665 nigel 93 }
666     c = bracount - (c - 1);
667     }
668    
669     c = -(ESC_REF + c);
670     break;
671    
672 nigel 77 /* The handling of escape sequences consisting of a string of digits
673     starting with one that is not zero is not straightforward. By experiment,
674     the way Perl works seems to be as follows:
675    
676     Outside a character class, the digits are read as a decimal number. If the
677     number is less than 10, or if there are that many previous extracting
678     left brackets, then it is a back reference. Otherwise, up to three octal
679     digits are read to form an escaped byte. Thus \123 is likely to be octal
680     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
681     value is greater than 377, the least significant 8 bits are taken. Inside a
682     character class, \ followed by a digit is always an octal number. */
683    
684 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
685     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
686 nigel 77
687     if (!isclass)
688     {
689     oldptr = ptr;
690 ph10 391 c -= CHAR_0;
691 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
692 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
693 ph10 333 if (c < 0) /* Integer overflow */
694 ph10 213 {
695     *errorcodeptr = ERR61;
696 ph10 220 break;
697     }
698 nigel 77 if (c < 10 || c <= bracount)
699     {
700     c = -(ESC_REF + c);
701     break;
702     }
703     ptr = oldptr; /* Put the pointer back and fall through */
704     }
705    
706     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
707     generates a binary zero byte and treats the digit as a following literal.
708     Thus we have to pull back the pointer by one. */
709    
710 ph10 391 if ((c = *ptr) >= CHAR_8)
711 nigel 77 {
712     ptr--;
713     c = 0;
714     break;
715     }
716    
717     /* \0 always starts an octal number, but we may drop through to here with a
718 nigel 91 larger first octal digit. The original code used just to take the least
719     significant 8 bits of octal numbers (I think this is what early Perls used
720     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
721     than 3 octal digits. */
722 nigel 77
723 ph10 391 case CHAR_0:
724     c -= CHAR_0;
725     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
726     c = c * 8 + *(++ptr) - CHAR_0;
727 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
728 nigel 77 break;
729    
730 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
731     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
732     treated as a data character. */
733 nigel 77
734 ph10 391 case CHAR_x:
735     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
736 nigel 77 {
737     const uschar *pt = ptr + 2;
738 nigel 87 int count = 0;
739    
740 nigel 77 c = 0;
741     while ((digitab[*pt] & ctype_xdigit) != 0)
742     {
743 nigel 87 register int cc = *pt++;
744 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
745 nigel 77 count++;
746 nigel 87
747 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
748     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
749     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
750 ph10 97 #else /* EBCDIC coding */
751 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
752     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
753 nigel 77 #endif
754     }
755 nigel 87
756 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
757 nigel 77 {
758 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
759 nigel 77 ptr = pt;
760     break;
761     }
762 nigel 87
763 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
764     recognize this construct; fall through to the normal \x handling. */
765     }
766    
767 nigel 87 /* Read just a single-byte hex-defined char */
768 nigel 77
769     c = 0;
770     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
771     {
772 ph10 391 int cc; /* Some compilers don't like */
773     cc = *(++ptr); /* ++ in initializers */
774     #ifndef EBCDIC /* ASCII/UTF-8 coding */
775     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
776     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
777 ph10 97 #else /* EBCDIC coding */
778 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
779     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
780 nigel 77 #endif
781     }
782     break;
783    
784 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
785     This coding is ASCII-specific, but then the whole concept of \cx is
786     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
787 nigel 77
788 ph10 391 case CHAR_c:
789 nigel 77 c = *(++ptr);
790     if (c == 0)
791     {
792     *errorcodeptr = ERR2;
793 ph10 213 break;
794 nigel 77 }
795    
796 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
797     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
798 nigel 77 c ^= 0x40;
799 ph10 97 #else /* EBCDIC coding */
800 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
801 nigel 77 c ^= 0xC0;
802     #endif
803     break;
804    
805     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
806 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
807     otherwise, for Perl compatibility, it is a literal. This code looks a bit
808     odd, but there used to be some cases other than the default, and there may
809     be again in future, so I haven't "optimized" it. */
810 nigel 77
811     default:
812     if ((options & PCRE_EXTRA) != 0) switch(c)
813     {
814     default:
815     *errorcodeptr = ERR3;
816     break;
817     }
818     break;
819     }
820     }
821    
822     *ptrptr = ptr;
823     return c;
824     }
825    
826    
827    
828     #ifdef SUPPORT_UCP
829     /*************************************************
830     * Handle \P and \p *
831     *************************************************/
832    
833     /* This function is called after \P or \p has been encountered, provided that
834     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
835     pointing at the P or p. On exit, it is pointing at the final character of the
836     escape sequence.
837    
838     Argument:
839     ptrptr points to the pattern position pointer
840     negptr points to a boolean that is set TRUE for negation else FALSE
841 nigel 87 dptr points to an int that is set to the detailed property value
842 nigel 77 errorcodeptr points to the error code variable
843    
844 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
845 nigel 77 */
846    
847     static int
848 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
849 nigel 77 {
850     int c, i, bot, top;
851     const uschar *ptr = *ptrptr;
852 nigel 87 char name[32];
853 nigel 77
854     c = *(++ptr);
855     if (c == 0) goto ERROR_RETURN;
856    
857     *negptr = FALSE;
858    
859 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
860     negation. */
861 nigel 77
862 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
863 nigel 77 {
864 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
865 nigel 77 {
866     *negptr = TRUE;
867     ptr++;
868     }
869 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
870 nigel 77 {
871     c = *(++ptr);
872     if (c == 0) goto ERROR_RETURN;
873 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
874 nigel 77 name[i] = c;
875     }
876 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
877 nigel 77 name[i] = 0;
878     }
879    
880     /* Otherwise there is just one following character */
881    
882     else
883     {
884     name[0] = c;
885     name[1] = 0;
886     }
887    
888     *ptrptr = ptr;
889    
890     /* Search for a recognized property name using binary chop */
891    
892     bot = 0;
893     top = _pcre_utt_size;
894    
895     while (bot < top)
896     {
897 nigel 87 i = (bot + top) >> 1;
898 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
899 nigel 87 if (c == 0)
900     {
901     *dptr = _pcre_utt[i].value;
902     return _pcre_utt[i].type;
903     }
904 nigel 77 if (c > 0) bot = i + 1; else top = i;
905     }
906    
907     *errorcodeptr = ERR47;
908     *ptrptr = ptr;
909     return -1;
910    
911     ERROR_RETURN:
912     *errorcodeptr = ERR46;
913     *ptrptr = ptr;
914     return -1;
915     }
916     #endif
917    
918    
919    
920    
921     /*************************************************
922     * Check for counted repeat *
923     *************************************************/
924    
925     /* This function is called when a '{' is encountered in a place where it might
926     start a quantifier. It looks ahead to see if it really is a quantifier or not.
927     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
928     where the ddds are digits.
929    
930     Arguments:
931     p pointer to the first char after '{'
932    
933     Returns: TRUE or FALSE
934     */
935    
936     static BOOL
937     is_counted_repeat(const uschar *p)
938     {
939     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940     while ((digitab[*p] & ctype_digit) != 0) p++;
941 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
942 nigel 77
943 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
944     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
945 nigel 77
946     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
947     while ((digitab[*p] & ctype_digit) != 0) p++;
948    
949 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
950 nigel 77 }
951    
952    
953    
954     /*************************************************
955     * Read repeat counts *
956     *************************************************/
957    
958     /* Read an item of the form {n,m} and return the values. This is called only
959     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
960     so the syntax is guaranteed to be correct, but we need to check the values.
961    
962     Arguments:
963     p pointer to first char after '{'
964     minp pointer to int for min
965     maxp pointer to int for max
966     returned as -1 if no max
967     errorcodeptr points to error code variable
968    
969     Returns: pointer to '}' on success;
970     current ptr on error, with errorcodeptr set non-zero
971     */
972    
973     static const uschar *
974     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
975     {
976     int min = 0;
977     int max = -1;
978    
979 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
980     an integer overflow. */
981    
982 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
983 nigel 81 if (min < 0 || min > 65535)
984     {
985     *errorcodeptr = ERR5;
986     return p;
987     }
988 nigel 77
989 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
990     Also, max must not be less than min. */
991    
992 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
993 nigel 77 {
994 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
995 nigel 77 {
996     max = 0;
997 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
998 nigel 81 if (max < 0 || max > 65535)
999     {
1000     *errorcodeptr = ERR5;
1001     return p;
1002     }
1003 nigel 77 if (max < min)
1004     {
1005     *errorcodeptr = ERR4;
1006     return p;
1007     }
1008     }
1009     }
1010    
1011 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1012     '}'. */
1013 nigel 77
1014 nigel 81 *minp = min;
1015     *maxp = max;
1016 nigel 77 return p;
1017     }
1018    
1019    
1020    
1021     /*************************************************
1022 ph10 408 * Subroutine for finding forward reference *
1023 nigel 91 *************************************************/
1024    
1025 ph10 408 /* This recursive function is called only from find_parens() below. The
1026     top-level call starts at the beginning of the pattern. All other calls must
1027     start at a parenthesis. It scans along a pattern's text looking for capturing
1028 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1029     name it is given, it returns its number. Alternatively, if the name is NULL, it
1030 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1031     encountered, the name will be terminated by '>' because that is checked in the
1032 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1033 ph10 408 capturing group numbers - the (?| feature.
1034 nigel 91
1035     Arguments:
1036 ph10 408 ptrptr address of the current character pointer (updated)
1037 ph10 345 cd compile background data
1038 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1039     lorn name length, or subpattern number if name is NULL
1040     xmode TRUE if we are in /x mode
1041 ph10 411 count pointer to the current capturing subpattern number (updated)
1042 nigel 91
1043     Returns: the number of the named subpattern, or -1 if not found
1044     */
1045    
1046     static int
1047 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1048     BOOL xmode, int *count)
1049 nigel 91 {
1050 ph10 408 uschar *ptr = *ptrptr;
1051     int start_count = *count;
1052     int hwm_count = start_count;
1053     BOOL dup_parens = FALSE;
1054 nigel 93
1055 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1056 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1057    
1058     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1059     {
1060     if (ptr[1] == CHAR_QUESTION_MARK &&
1061 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1062 ph10 408 {
1063     ptr += 3;
1064 ph10 411 dup_parens = TRUE;
1065     }
1066 ph10 408
1067     /* Handle a normal, unnamed capturing parenthesis */
1068 ph10 411
1069 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1070     {
1071     *count += 1;
1072     if (name == NULL && *count == lorn) return *count;
1073 ph10 411 ptr++;
1074 ph10 408 }
1075    
1076     /* Handle a condition. If it is an assertion, just carry on so that it
1077     is processed as normal. If not, skip to the closing parenthesis of the
1078 ph10 411 condition (there can't be any nested parens. */
1079    
1080 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1081     {
1082 ph10 411 ptr += 2;
1083 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1084     {
1085     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1086 ph10 411 if (*ptr != 0) ptr++;
1087 ph10 408 }
1088 ph10 411 }
1089    
1090 ph10 408 /* We have either (? or (* and not a condition */
1091    
1092     else
1093 ph10 411 {
1094 ph10 408 ptr += 2;
1095     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1096    
1097     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1098 ph10 411
1099 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1100     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1101     {
1102     int term;
1103     const uschar *thisname;
1104     *count += 1;
1105     if (name == NULL && *count == lorn) return *count;
1106     term = *ptr++;
1107     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1108     thisname = ptr;
1109     while (*ptr != term) ptr++;
1110     if (name != NULL && lorn == ptr - thisname &&
1111     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1112     return *count;
1113 ph10 461 term++;
1114 ph10 411 }
1115 ph10 408 }
1116 ph10 411 }
1117 ph10 408
1118 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1119 ph10 408 bars. */
1120    
1121 nigel 91 for (; *ptr != 0; ptr++)
1122     {
1123 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1124    
1125 ph10 391 if (*ptr == CHAR_BACKSLASH)
1126 nigel 93 {
1127 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1128 ph10 391 if (*ptr == CHAR_Q) for (;;)
1129 nigel 93 {
1130 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1131 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1132 ph10 391 if (*(++ptr) == CHAR_E) break;
1133 nigel 93 }
1134     continue;
1135     }
1136    
1137 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1138     are handled for real. If the first character is '^', skip it. Also, if the
1139     first few characters (either before or after ^) are \Q\E or \E we skip them
1140 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1141 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1142 nigel 93
1143 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1144 nigel 93 {
1145 ph10 340 BOOL negate_class = FALSE;
1146     for (;;)
1147     {
1148 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1149 ph10 340 {
1150 ph10 438 if (ptr[2] == CHAR_E)
1151     ptr+= 2;
1152     else if (strncmp((const char *)ptr+2,
1153 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1154 ph10 438 ptr += 4;
1155 ph10 392 else
1156 ph10 391 break;
1157 ph10 340 }
1158 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1159 ph10 461 {
1160 ph10 340 negate_class = TRUE;
1161 ph10 438 ptr++;
1162 ph10 461 }
1163 ph10 340 else break;
1164     }
1165    
1166     /* If the next character is ']', it is a data character that must be
1167 ph10 341 skipped, except in JavaScript compatibility mode. */
1168 ph10 345
1169 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1170 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1171 ph10 345 ptr++;
1172    
1173 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1174 nigel 93 {
1175 ph10 220 if (*ptr == 0) return -1;
1176 ph10 391 if (*ptr == CHAR_BACKSLASH)
1177 nigel 93 {
1178 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1179 ph10 391 if (*ptr == CHAR_Q) for (;;)
1180 nigel 93 {
1181 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1182 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1183 ph10 391 if (*(++ptr) == CHAR_E) break;
1184 nigel 93 }
1185     continue;
1186     }
1187     }
1188     continue;
1189     }
1190    
1191     /* Skip comments in /x mode */
1192    
1193 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1194 nigel 93 {
1195 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1196 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1197 nigel 93 continue;
1198     }
1199    
1200 ph10 408 /* Check for the special metacharacters */
1201 ph10 411
1202 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1203 nigel 93 {
1204 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1205     if (rc > 0) return rc;
1206     if (*ptr == 0) goto FAIL_EXIT;
1207 nigel 93 }
1208 ph10 411
1209 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1210     {
1211     if (dup_parens && *count < hwm_count) *count = hwm_count;
1212     *ptrptr = ptr;
1213     return -1;
1214     }
1215 ph10 411
1216     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1217 ph10 408 {
1218     if (*count > hwm_count) hwm_count = *count;
1219     *count = start_count;
1220 ph10 411 }
1221 ph10 408 }
1222 nigel 93
1223 ph10 408 FAIL_EXIT:
1224     *ptrptr = ptr;
1225     return -1;
1226     }
1227 nigel 93
1228    
1229    
1230    
1231 ph10 408 /*************************************************
1232     * Find forward referenced subpattern *
1233     *************************************************/
1234 nigel 93
1235 ph10 408 /* This function scans along a pattern's text looking for capturing
1236     subpatterns, and counting them. If it finds a named pattern that matches the
1237     name it is given, it returns its number. Alternatively, if the name is NULL, it
1238     returns when it reaches a given numbered subpattern. This is used for forward
1239     references to subpatterns. We used to be able to start this scan from the
1240     current compiling point, using the current count value from cd->bracount, and
1241     do it all in a single loop, but the addition of the possibility of duplicate
1242     subpattern numbers means that we have to scan from the very start, in order to
1243     take account of such duplicates, and to use a recursive function to keep track
1244     of the different types of group.
1245    
1246     Arguments:
1247     cd compile background data
1248     name name to seek, or NULL if seeking a numbered subpattern
1249     lorn name length, or subpattern number if name is NULL
1250     xmode TRUE if we are in /x mode
1251    
1252     Returns: the number of the found subpattern, or -1 if not found
1253     */
1254    
1255     static int
1256     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1257     {
1258     uschar *ptr = (uschar *)cd->start_pattern;
1259     int count = 0;
1260     int rc;
1261    
1262     /* If the pattern does not start with an opening parenthesis, the first call
1263     to find_parens_sub() will scan right to the end (if necessary). However, if it
1264     does start with a parenthesis, find_parens_sub() will return when it hits the
1265     matching closing parens. That is why we have to have a loop. */
1266    
1267 ph10 411 for (;;)
1268     {
1269 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1270 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1271     }
1272    
1273 ph10 408 return rc;
1274 nigel 91 }
1275    
1276    
1277    
1278 ph10 408
1279 nigel 91 /*************************************************
1280 nigel 77 * Find first significant op code *
1281     *************************************************/
1282    
1283     /* This is called by several functions that scan a compiled expression looking
1284     for a fixed first character, or an anchoring op code etc. It skips over things
1285     that do not influence this. For some calls, a change of option is important.
1286     For some calls, it makes sense to skip negative forward and all backward
1287     assertions, and also the \b assertion; for others it does not.
1288    
1289     Arguments:
1290     code pointer to the start of the group
1291     options pointer to external options
1292     optbit the option bit whose changing is significant, or
1293     zero if none are
1294     skipassert TRUE if certain assertions are to be skipped
1295    
1296     Returns: pointer to the first significant opcode
1297     */
1298    
1299     static const uschar*
1300     first_significant_code(const uschar *code, int *options, int optbit,
1301     BOOL skipassert)
1302     {
1303     for (;;)
1304     {
1305     switch ((int)*code)
1306     {
1307     case OP_OPT:
1308     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309     *options = (int)code[1];
1310     code += 2;
1311     break;
1312    
1313     case OP_ASSERT_NOT:
1314     case OP_ASSERTBACK:
1315     case OP_ASSERTBACK_NOT:
1316     if (!skipassert) return code;
1317     do code += GET(code, 1); while (*code == OP_ALT);
1318     code += _pcre_OP_lengths[*code];
1319     break;
1320    
1321     case OP_WORD_BOUNDARY:
1322     case OP_NOT_WORD_BOUNDARY:
1323     if (!skipassert) return code;
1324     /* Fall through */
1325    
1326     case OP_CALLOUT:
1327     case OP_CREF:
1328 ph10 459 case OP_NCREF:
1329 nigel 93 case OP_RREF:
1330 ph10 459 case OP_NRREF:
1331 nigel 93 case OP_DEF:
1332 nigel 77 code += _pcre_OP_lengths[*code];
1333     break;
1334    
1335     default:
1336     return code;
1337     }
1338     }
1339     /* Control never reaches here */
1340     }
1341    
1342    
1343    
1344    
1345     /*************************************************
1346 ph10 454 * Find the fixed length of a branch *
1347 nigel 77 *************************************************/
1348    
1349 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1350 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1351 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1352 ph10 454 temporarily terminated with OP_END when this function is called.
1353 nigel 77
1354 ph10 461 This function is called when a backward assertion is encountered, so that if it
1355     fails, the error message can point to the correct place in the pattern.
1356 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1357 ph10 461 because they can be forward references. We solve this by remembering this case
1358 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1359    
1360 nigel 77 Arguments:
1361     code points to the start of the pattern (the bracket)
1362     options the compiling options
1363 ph10 461 atend TRUE if called when the pattern is complete
1364     cd the "compile data" structure
1365 nigel 77
1366 ph10 461 Returns: the fixed length,
1367 ph10 454 or -1 if there is no fixed length,
1368 nigel 77 or -2 if \C was encountered
1369 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1370 nigel 77 */
1371    
1372     static int
1373 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1374 nigel 77 {
1375     int length = -1;
1376    
1377     register int branchlength = 0;
1378     register uschar *cc = code + 1 + LINK_SIZE;
1379    
1380     /* Scan along the opcodes for this branch. If we get to the end of the
1381     branch, check the length against that of the other branches. */
1382    
1383     for (;;)
1384     {
1385     int d;
1386 ph10 454 uschar *ce, *cs;
1387 nigel 77 register int op = *cc;
1388     switch (op)
1389     {
1390 nigel 93 case OP_CBRA:
1391 nigel 77 case OP_BRA:
1392     case OP_ONCE:
1393     case OP_COND:
1394 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1395 nigel 77 if (d < 0) return d;
1396     branchlength += d;
1397     do cc += GET(cc, 1); while (*cc == OP_ALT);
1398     cc += 1 + LINK_SIZE;
1399     break;
1400    
1401     /* Reached end of a branch; if it's a ket it is the end of a nested
1402     call. If it's ALT it is an alternation in a nested call. If it is
1403     END it's the end of the outer call. All can be handled by the same code. */
1404    
1405     case OP_ALT:
1406     case OP_KET:
1407     case OP_KETRMAX:
1408     case OP_KETRMIN:
1409     case OP_END:
1410     if (length < 0) length = branchlength;
1411     else if (length != branchlength) return -1;
1412     if (*cc != OP_ALT) return length;
1413     cc += 1 + LINK_SIZE;
1414     branchlength = 0;
1415     break;
1416 ph10 461
1417 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1418     be OK. If the subroutine is a forward reference, we can't deal with
1419     it until the end of the pattern, so return -3. */
1420 ph10 461
1421 ph10 454 case OP_RECURSE:
1422     if (!atend) return -3;
1423     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1424     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1425     if (cc > cs && cc < ce) return -1; /* Recursion */
1426     d = find_fixedlength(cs + 2, options, atend, cd);
1427 ph10 461 if (d < 0) return d;
1428 ph10 454 branchlength += d;
1429     cc += 1 + LINK_SIZE;
1430 ph10 461 break;
1431 nigel 77
1432     /* Skip over assertive subpatterns */
1433    
1434     case OP_ASSERT:
1435     case OP_ASSERT_NOT:
1436     case OP_ASSERTBACK:
1437     case OP_ASSERTBACK_NOT:
1438     do cc += GET(cc, 1); while (*cc == OP_ALT);
1439     /* Fall through */
1440    
1441     /* Skip over things that don't match chars */
1442    
1443     case OP_REVERSE:
1444     case OP_CREF:
1445 ph10 459 case OP_NCREF:
1446 nigel 93 case OP_RREF:
1447 ph10 459 case OP_NRREF:
1448 nigel 93 case OP_DEF:
1449 nigel 77 case OP_OPT:
1450     case OP_CALLOUT:
1451     case OP_SOD:
1452     case OP_SOM:
1453 ph10 500 case OP_SET_SOM:
1454 nigel 77 case OP_EOD:
1455     case OP_EODN:
1456     case OP_CIRC:
1457     case OP_DOLL:
1458     case OP_NOT_WORD_BOUNDARY:
1459     case OP_WORD_BOUNDARY:
1460     cc += _pcre_OP_lengths[*cc];
1461     break;
1462    
1463     /* Handle literal characters */
1464    
1465     case OP_CHAR:
1466     case OP_CHARNC:
1467 nigel 91 case OP_NOT:
1468 nigel 77 branchlength++;
1469     cc += 2;
1470     #ifdef SUPPORT_UTF8
1471 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1472 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1473 nigel 77 #endif
1474     break;
1475    
1476     /* Handle exact repetitions. The count is already in characters, but we
1477     need to skip over a multibyte character in UTF8 mode. */
1478    
1479     case OP_EXACT:
1480     branchlength += GET2(cc,1);
1481     cc += 4;
1482     #ifdef SUPPORT_UTF8
1483 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1484 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1485 nigel 77 #endif
1486     break;
1487    
1488     case OP_TYPEEXACT:
1489     branchlength += GET2(cc,1);
1490 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1491 nigel 77 cc += 4;
1492     break;
1493    
1494     /* Handle single-char matchers */
1495    
1496     case OP_PROP:
1497     case OP_NOTPROP:
1498 nigel 87 cc += 2;
1499 nigel 77 /* Fall through */
1500    
1501     case OP_NOT_DIGIT:
1502     case OP_DIGIT:
1503     case OP_NOT_WHITESPACE:
1504     case OP_WHITESPACE:
1505     case OP_NOT_WORDCHAR:
1506     case OP_WORDCHAR:
1507     case OP_ANY:
1508 ph10 342 case OP_ALLANY:
1509 nigel 77 branchlength++;
1510     cc++;
1511     break;
1512    
1513     /* The single-byte matcher isn't allowed */
1514    
1515     case OP_ANYBYTE:
1516     return -2;
1517    
1518     /* Check a class for variable quantification */
1519    
1520     #ifdef SUPPORT_UTF8
1521     case OP_XCLASS:
1522     cc += GET(cc, 1) - 33;
1523     /* Fall through */
1524     #endif
1525    
1526     case OP_CLASS:
1527     case OP_NCLASS:
1528     cc += 33;
1529    
1530     switch (*cc)
1531     {
1532     case OP_CRSTAR:
1533     case OP_CRMINSTAR:
1534     case OP_CRQUERY:
1535     case OP_CRMINQUERY:
1536     return -1;
1537    
1538     case OP_CRRANGE:
1539     case OP_CRMINRANGE:
1540     if (GET2(cc,1) != GET2(cc,3)) return -1;
1541     branchlength += GET2(cc,1);
1542     cc += 5;
1543     break;
1544    
1545     default:
1546     branchlength++;
1547     }
1548     break;
1549    
1550     /* Anything else is variable length */
1551    
1552     default:
1553     return -1;
1554     }
1555     }
1556     /* Control never gets here */
1557     }
1558    
1559    
1560    
1561    
1562     /*************************************************
1563 ph10 454 * Scan compiled regex for specific bracket *
1564 nigel 77 *************************************************/
1565    
1566     /* This little function scans through a compiled pattern until it finds a
1567 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1568 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1569     so that it can be called from pcre_study() when finding the minimum matching
1570 ph10 455 length.
1571 nigel 77
1572     Arguments:
1573     code points to start of expression
1574     utf8 TRUE in UTF-8 mode
1575 ph10 454 number the required bracket number or negative to find a lookbehind
1576 nigel 77
1577     Returns: pointer to the opcode for the bracket, or NULL if not found
1578     */
1579    
1580 ph10 455 const uschar *
1581     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1582 nigel 77 {
1583     for (;;)
1584     {
1585     register int c = *code;
1586     if (c == OP_END) return NULL;
1587 nigel 91
1588     /* XCLASS is used for classes that cannot be represented just by a bit
1589     map. This includes negated single high-valued characters. The length in
1590     the table is zero; the actual length is stored in the compiled code. */
1591    
1592     if (c == OP_XCLASS) code += GET(code, 1);
1593 ph10 461
1594 ph10 454 /* Handle recursion */
1595 ph10 461
1596 ph10 454 else if (c == OP_REVERSE)
1597     {
1598 ph10 461 if (number < 0) return (uschar *)code;
1599 ph10 454 code += _pcre_OP_lengths[c];
1600     }
1601 nigel 91
1602 nigel 93 /* Handle capturing bracket */
1603 nigel 91
1604 nigel 93 else if (c == OP_CBRA)
1605 nigel 77 {
1606 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1607 nigel 77 if (n == number) return (uschar *)code;
1608 nigel 93 code += _pcre_OP_lengths[c];
1609 nigel 77 }
1610 nigel 91
1611 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1612     repeated character types, we have to test for \p and \P, which have an extra
1613 ph10 218 two bytes of parameters. */
1614 nigel 91
1615 nigel 77 else
1616     {
1617 ph10 218 switch(c)
1618     {
1619     case OP_TYPESTAR:
1620     case OP_TYPEMINSTAR:
1621     case OP_TYPEPLUS:
1622     case OP_TYPEMINPLUS:
1623     case OP_TYPEQUERY:
1624     case OP_TYPEMINQUERY:
1625     case OP_TYPEPOSSTAR:
1626     case OP_TYPEPOSPLUS:
1627     case OP_TYPEPOSQUERY:
1628     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1629 ph10 220 break;
1630 ph10 221
1631     case OP_TYPEUPTO:
1632     case OP_TYPEMINUPTO:
1633     case OP_TYPEEXACT:
1634     case OP_TYPEPOSUPTO:
1635     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1636     break;
1637 ph10 220 }
1638    
1639 ph10 218 /* Add in the fixed length from the table */
1640 ph10 220
1641 nigel 77 code += _pcre_OP_lengths[c];
1642 ph10 220
1643 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1644     a multi-byte character. The length in the table is a minimum, so we have to
1645     arrange to skip the extra bytes. */
1646 ph10 220
1647 ph10 107 #ifdef SUPPORT_UTF8
1648 nigel 77 if (utf8) switch(c)
1649     {
1650     case OP_CHAR:
1651     case OP_CHARNC:
1652     case OP_EXACT:
1653     case OP_UPTO:
1654     case OP_MINUPTO:
1655 nigel 93 case OP_POSUPTO:
1656 nigel 77 case OP_STAR:
1657     case OP_MINSTAR:
1658 nigel 93 case OP_POSSTAR:
1659 nigel 77 case OP_PLUS:
1660     case OP_MINPLUS:
1661 nigel 93 case OP_POSPLUS:
1662 nigel 77 case OP_QUERY:
1663     case OP_MINQUERY:
1664 nigel 93 case OP_POSQUERY:
1665     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1666 nigel 77 break;
1667     }
1668 ph10 369 #else
1669     (void)(utf8); /* Keep compiler happy by referencing function argument */
1670 ph10 111 #endif
1671 nigel 77 }
1672     }
1673     }
1674    
1675    
1676    
1677     /*************************************************
1678     * Scan compiled regex for recursion reference *
1679     *************************************************/
1680    
1681     /* This little function scans through a compiled pattern until it finds an
1682     instance of OP_RECURSE.
1683    
1684     Arguments:
1685     code points to start of expression
1686     utf8 TRUE in UTF-8 mode
1687    
1688     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1689     */
1690    
1691     static const uschar *
1692     find_recurse(const uschar *code, BOOL utf8)
1693     {
1694     for (;;)
1695     {
1696     register int c = *code;
1697     if (c == OP_END) return NULL;
1698 nigel 91 if (c == OP_RECURSE) return code;
1699 ph10 220
1700 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1701     map. This includes negated single high-valued characters. The length in
1702     the table is zero; the actual length is stored in the compiled code. */
1703    
1704     if (c == OP_XCLASS) code += GET(code, 1);
1705    
1706 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1707     repeated character types, we have to test for \p and \P, which have an extra
1708 ph10 218 two bytes of parameters. */
1709 nigel 91
1710 nigel 77 else
1711     {
1712 ph10 218 switch(c)
1713     {
1714     case OP_TYPESTAR:
1715     case OP_TYPEMINSTAR:
1716     case OP_TYPEPLUS:
1717     case OP_TYPEMINPLUS:
1718     case OP_TYPEQUERY:
1719     case OP_TYPEMINQUERY:
1720     case OP_TYPEPOSSTAR:
1721     case OP_TYPEPOSPLUS:
1722     case OP_TYPEPOSQUERY:
1723     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1724 ph10 220 break;
1725 ph10 221
1726     case OP_TYPEPOSUPTO:
1727     case OP_TYPEUPTO:
1728     case OP_TYPEMINUPTO:
1729     case OP_TYPEEXACT:
1730     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1731     break;
1732 ph10 220 }
1733    
1734 ph10 218 /* Add in the fixed length from the table */
1735    
1736 nigel 77 code += _pcre_OP_lengths[c];
1737 ph10 220
1738 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1739     by a multi-byte character. The length in the table is a minimum, so we have
1740     to arrange to skip the extra bytes. */
1741 ph10 220
1742 ph10 107 #ifdef SUPPORT_UTF8
1743 nigel 77 if (utf8) switch(c)
1744     {
1745     case OP_CHAR:
1746     case OP_CHARNC:
1747     case OP_EXACT:
1748     case OP_UPTO:
1749     case OP_MINUPTO:
1750 nigel 93 case OP_POSUPTO:
1751 nigel 77 case OP_STAR:
1752     case OP_MINSTAR:
1753 nigel 93 case OP_POSSTAR:
1754 nigel 77 case OP_PLUS:
1755     case OP_MINPLUS:
1756 nigel 93 case OP_POSPLUS:
1757 nigel 77 case OP_QUERY:
1758     case OP_MINQUERY:
1759 nigel 93 case OP_POSQUERY:
1760     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1761 nigel 77 break;
1762     }
1763 ph10 369 #else
1764     (void)(utf8); /* Keep compiler happy by referencing function argument */
1765 ph10 111 #endif
1766 nigel 77 }
1767     }
1768     }
1769    
1770    
1771    
1772     /*************************************************
1773     * Scan compiled branch for non-emptiness *
1774     *************************************************/
1775    
1776     /* This function scans through a branch of a compiled pattern to see whether it
1777 nigel 93 can match the empty string or not. It is called from could_be_empty()
1778     below and from compile_branch() when checking for an unlimited repeat of a
1779     group that can match nothing. Note that first_significant_code() skips over
1780 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1781     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1782     bracket whose current branch will already have been scanned.
1783 nigel 77
1784     Arguments:
1785     code points to start of search
1786     endcode points to where to stop
1787     utf8 TRUE if in UTF8 mode
1788 ph10 503 cd contains pointers to tables etc.
1789 nigel 77
1790     Returns: TRUE if what is matched could be empty
1791     */
1792    
1793     static BOOL
1794 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1795     compile_data *cd)
1796 nigel 77 {
1797     register int c;
1798 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1799 nigel 77 code < endcode;
1800     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1801     {
1802     const uschar *ccode;
1803    
1804     c = *code;
1805 ph10 503
1806 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1807 ph10 282 first_significant_code() with a TRUE final argument. */
1808 ph10 286
1809 ph10 282 if (c == OP_ASSERT)
1810 ph10 286 {
1811 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1812     c = *code;
1813     continue;
1814 ph10 286 }
1815 ph10 172
1816 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1817 nigel 77
1818 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1819 ph10 170 {
1820 ph10 172 code += _pcre_OP_lengths[c];
1821 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1822     c = *code;
1823     continue;
1824     }
1825 ph10 503
1826     /* For a recursion/subroutine call, if its end has been reached, which
1827     implies a subroutine call, we can scan it. */
1828    
1829     if (c == OP_RECURSE)
1830     {
1831     const uschar *scode = cd->start_code + GET(code, 1);
1832     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1833     do
1834     {
1835     if (!could_be_empty_branch(scode, endcode, utf8, cd)) return FALSE;
1836     scode += GET(scode, 1);
1837     }
1838     while (*scode == OP_ALT);
1839     continue;
1840     }
1841 ph10 170
1842     /* For other groups, scan the branches. */
1843 ph10 172
1844 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1845 nigel 77 {
1846     BOOL empty_branch;
1847     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1848 ph10 406
1849     /* If a conditional group has only one branch, there is a second, implied,
1850 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1851     Otherwise, scan the individual branches of the group. */
1852 ph10 406
1853 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1854 nigel 77 code += GET(code, 1);
1855 ph10 395 else
1856 ph10 406 {
1857 ph10 395 empty_branch = FALSE;
1858     do
1859     {
1860 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1861 ph10 395 empty_branch = TRUE;
1862     code += GET(code, 1);
1863     }
1864     while (*code == OP_ALT);
1865     if (!empty_branch) return FALSE; /* All branches are non-empty */
1866 nigel 77 }
1867 ph10 406
1868 ph10 172 c = *code;
1869 nigel 93 continue;
1870 nigel 77 }
1871    
1872 nigel 93 /* Handle the other opcodes */
1873    
1874     switch (c)
1875 nigel 77 {
1876 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1877     cannot be represented just by a bit map. This includes negated single
1878     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1879 ph10 220 actual length is stored in the compiled code, so we must update "code"
1880 ph10 216 here. */
1881 nigel 77
1882     #ifdef SUPPORT_UTF8
1883     case OP_XCLASS:
1884 ph10 216 ccode = code += GET(code, 1);
1885 nigel 77 goto CHECK_CLASS_REPEAT;
1886     #endif
1887    
1888     case OP_CLASS:
1889     case OP_NCLASS:
1890     ccode = code + 33;
1891    
1892     #ifdef SUPPORT_UTF8
1893     CHECK_CLASS_REPEAT:
1894     #endif
1895    
1896     switch (*ccode)
1897     {
1898     case OP_CRSTAR: /* These could be empty; continue */
1899     case OP_CRMINSTAR:
1900     case OP_CRQUERY:
1901     case OP_CRMINQUERY:
1902     break;
1903    
1904     default: /* Non-repeat => class must match */
1905     case OP_CRPLUS: /* These repeats aren't empty */
1906     case OP_CRMINPLUS:
1907     return FALSE;
1908    
1909     case OP_CRRANGE:
1910     case OP_CRMINRANGE:
1911     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1912     break;
1913     }
1914     break;
1915    
1916     /* Opcodes that must match a character */
1917    
1918     case OP_PROP:
1919     case OP_NOTPROP:
1920     case OP_EXTUNI:
1921     case OP_NOT_DIGIT:
1922     case OP_DIGIT:
1923     case OP_NOT_WHITESPACE:
1924     case OP_WHITESPACE:
1925     case OP_NOT_WORDCHAR:
1926     case OP_WORDCHAR:
1927     case OP_ANY:
1928 ph10 345 case OP_ALLANY:
1929 nigel 77 case OP_ANYBYTE:
1930     case OP_CHAR:
1931     case OP_CHARNC:
1932     case OP_NOT:
1933     case OP_PLUS:
1934     case OP_MINPLUS:
1935 nigel 93 case OP_POSPLUS:
1936 nigel 77 case OP_EXACT:
1937     case OP_NOTPLUS:
1938     case OP_NOTMINPLUS:
1939 nigel 93 case OP_NOTPOSPLUS:
1940 nigel 77 case OP_NOTEXACT:
1941     case OP_TYPEPLUS:
1942     case OP_TYPEMINPLUS:
1943 nigel 93 case OP_TYPEPOSPLUS:
1944 nigel 77 case OP_TYPEEXACT:
1945     return FALSE;
1946 ph10 227
1947     /* These are going to continue, as they may be empty, but we have to
1948     fudge the length for the \p and \P cases. */
1949    
1950 ph10 224 case OP_TYPESTAR:
1951     case OP_TYPEMINSTAR:
1952     case OP_TYPEPOSSTAR:
1953     case OP_TYPEQUERY:
1954     case OP_TYPEMINQUERY:
1955     case OP_TYPEPOSQUERY:
1956     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1957 ph10 227 break;
1958    
1959 ph10 224 /* Same for these */
1960 ph10 227
1961 ph10 224 case OP_TYPEUPTO:
1962     case OP_TYPEMINUPTO:
1963     case OP_TYPEPOSUPTO:
1964     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1965     break;
1966 nigel 77
1967     /* End of branch */
1968    
1969     case OP_KET:
1970     case OP_KETRMAX:
1971     case OP_KETRMIN:
1972     case OP_ALT:
1973     return TRUE;
1974    
1975 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1976     MINUPTO, and POSUPTO may be followed by a multibyte character */
1977 nigel 77
1978     #ifdef SUPPORT_UTF8
1979     case OP_STAR:
1980     case OP_MINSTAR:
1981 nigel 93 case OP_POSSTAR:
1982 nigel 77 case OP_QUERY:
1983     case OP_MINQUERY:
1984 nigel 93 case OP_POSQUERY:
1985 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1986     break;
1987 ph10 461
1988 nigel 77 case OP_UPTO:
1989     case OP_MINUPTO:
1990 nigel 93 case OP_POSUPTO:
1991 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1992 nigel 77 break;
1993     #endif
1994 ph10 503
1995     /* None of the remaining opcodes are required to match a character. */
1996    
1997     default:
1998     break;
1999 nigel 77 }
2000     }
2001    
2002     return TRUE;
2003     }
2004    
2005    
2006    
2007     /*************************************************
2008     * Scan compiled regex for non-emptiness *
2009     *************************************************/
2010    
2011     /* This function is called to check for left recursive calls. We want to check
2012     the current branch of the current pattern to see if it could match the empty
2013     string. If it could, we must look outwards for branches at other levels,
2014     stopping when we pass beyond the bracket which is the subject of the recursion.
2015    
2016     Arguments:
2017     code points to start of the recursion
2018     endcode points to where to stop (current RECURSE item)
2019     bcptr points to the chain of current (unclosed) branch starts
2020     utf8 TRUE if in UTF-8 mode
2021 ph10 503 cd pointers to tables etc
2022 nigel 77
2023     Returns: TRUE if what is matched could be empty
2024     */
2025    
2026     static BOOL
2027     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2028 ph10 503 BOOL utf8, compile_data *cd)
2029 nigel 77 {
2030 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2031 nigel 77 {
2032 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2033 ph10 475 return FALSE;
2034 nigel 77 bcptr = bcptr->outer;
2035     }
2036     return TRUE;
2037     }
2038    
2039    
2040    
2041     /*************************************************
2042     * Check for POSIX class syntax *
2043     *************************************************/
2044    
2045     /* This function is called when the sequence "[:" or "[." or "[=" is
2046 ph10 295 encountered in a character class. It checks whether this is followed by a
2047 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2048 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2049 nigel 77
2050 ph10 298 Originally, this function only recognized a sequence of letters between the
2051     terminators, but it seems that Perl recognizes any sequence of characters,
2052     though of course unknown POSIX names are subsequently rejected. Perl gives an
2053     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2054     didn't consider this to be a POSIX class. Likewise for [:1234:].
2055 ph10 295
2056 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2057     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2058     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2059     below handles the special case of \], but does not try to do any other escape
2060     processing. This makes it different from Perl for cases such as [:l\ower:]
2061 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2062 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2063 ph10 295 I think.
2064    
2065     Arguments:
2066 nigel 77 ptr pointer to the initial [
2067     endptr where to return the end pointer
2068    
2069     Returns: TRUE or FALSE
2070     */
2071    
2072     static BOOL
2073 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2074 nigel 77 {
2075     int terminator; /* Don't combine these lines; the Solaris cc */
2076     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2077 ph10 295 for (++ptr; *ptr != 0; ptr++)
2078 nigel 77 {
2079 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2080 ph10 298 {
2081 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2082     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2083 ph10 295 {
2084     *endptr = ptr;
2085     return TRUE;
2086 ph10 298 }
2087     }
2088     }
2089 nigel 77 return FALSE;
2090     }
2091    
2092    
2093    
2094    
2095     /*************************************************
2096     * Check POSIX class name *
2097     *************************************************/
2098    
2099     /* This function is called to check the name given in a POSIX-style class entry
2100     such as [:alnum:].
2101    
2102     Arguments:
2103     ptr points to the first letter
2104     len the length of the name
2105    
2106     Returns: a value representing the name, or -1 if unknown
2107     */
2108    
2109     static int
2110     check_posix_name(const uschar *ptr, int len)
2111     {
2112 ph10 240 const char *pn = posix_names;
2113 nigel 77 register int yield = 0;
2114     while (posix_name_lengths[yield] != 0)
2115     {
2116     if (len == posix_name_lengths[yield] &&
2117 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2118 ph10 243 pn += posix_name_lengths[yield] + 1;
2119 nigel 77 yield++;
2120     }
2121     return -1;
2122     }
2123    
2124    
2125     /*************************************************
2126     * Adjust OP_RECURSE items in repeated group *
2127     *************************************************/
2128    
2129     /* OP_RECURSE items contain an offset from the start of the regex to the group
2130     that is referenced. This means that groups can be replicated for fixed
2131     repetition simply by copying (because the recursion is allowed to refer to
2132     earlier groups that are outside the current group). However, when a group is
2133 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2134     inserted before it, after it has been compiled. This means that any OP_RECURSE
2135     items within it that refer to the group itself or any contained groups have to
2136     have their offsets adjusted. That is one of the jobs of this function. Before it
2137     is called, the partially compiled regex must be temporarily terminated with
2138     OP_END.
2139 nigel 77
2140 nigel 93 This function has been extended with the possibility of forward references for
2141     recursions and subroutine calls. It must also check the list of such references
2142     for the group we are dealing with. If it finds that one of the recursions in
2143     the current group is on this list, it adjusts the offset in the list, not the
2144     value in the reference (which is a group number).
2145    
2146 nigel 77 Arguments:
2147     group points to the start of the group
2148     adjust the amount by which the group is to be moved
2149     utf8 TRUE in UTF-8 mode
2150     cd contains pointers to tables etc.
2151 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2152 nigel 77
2153     Returns: nothing
2154     */
2155    
2156     static void
2157 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2158     uschar *save_hwm)
2159 nigel 77 {
2160     uschar *ptr = group;
2161 ph10 224
2162 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2163     {
2164 nigel 93 int offset;
2165     uschar *hc;
2166    
2167     /* See if this recursion is on the forward reference list. If so, adjust the
2168     reference. */
2169 ph10 345
2170 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2171     {
2172     offset = GET(hc, 0);
2173     if (cd->start_code + offset == ptr + 1)
2174     {
2175     PUT(hc, 0, offset + adjust);
2176     break;
2177     }
2178     }
2179    
2180     /* Otherwise, adjust the recursion offset if it's after the start of this
2181     group. */
2182    
2183     if (hc >= cd->hwm)
2184     {
2185     offset = GET(ptr, 1);
2186     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2187     }
2188    
2189 nigel 77 ptr += 1 + LINK_SIZE;
2190     }
2191     }
2192    
2193    
2194    
2195     /*************************************************
2196     * Insert an automatic callout point *
2197     *************************************************/
2198    
2199     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2200     callout points before each pattern item.
2201    
2202     Arguments:
2203     code current code pointer
2204     ptr current pattern pointer
2205     cd pointers to tables etc
2206    
2207     Returns: new code pointer
2208     */
2209    
2210     static uschar *
2211     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2212     {
2213     *code++ = OP_CALLOUT;
2214     *code++ = 255;
2215     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2216     PUT(code, LINK_SIZE, 0); /* Default length */
2217     return code + 2*LINK_SIZE;
2218     }
2219    
2220    
2221    
2222     /*************************************************
2223     * Complete a callout item *
2224     *************************************************/
2225    
2226     /* A callout item contains the length of the next item in the pattern, which
2227     we can't fill in till after we have reached the relevant point. This is used
2228     for both automatic and manual callouts.
2229    
2230     Arguments:
2231     previous_callout points to previous callout item
2232     ptr current pattern pointer
2233     cd pointers to tables etc
2234    
2235     Returns: nothing
2236     */
2237    
2238     static void
2239     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2240     {
2241     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2242     PUT(previous_callout, 2 + LINK_SIZE, length);
2243     }
2244    
2245    
2246    
2247     #ifdef SUPPORT_UCP
2248     /*************************************************
2249     * Get othercase range *
2250     *************************************************/
2251    
2252     /* This function is passed the start and end of a class range, in UTF-8 mode
2253     with UCP support. It searches up the characters, looking for internal ranges of
2254     characters in the "other" case. Each call returns the next one, updating the
2255     start address.
2256    
2257     Arguments:
2258     cptr points to starting character value; updated
2259     d end value
2260     ocptr where to put start of othercase range
2261     odptr where to put end of othercase range
2262    
2263     Yield: TRUE when range returned; FALSE when no more
2264     */
2265    
2266     static BOOL
2267 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2268     unsigned int *odptr)
2269 nigel 77 {
2270 nigel 93 unsigned int c, othercase, next;
2271 nigel 77
2272     for (c = *cptr; c <= d; c++)
2273 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2274 nigel 77
2275     if (c > d) return FALSE;
2276    
2277     *ocptr = othercase;
2278     next = othercase + 1;
2279    
2280     for (++c; c <= d; c++)
2281     {
2282 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2283 nigel 77 next++;
2284     }
2285    
2286     *odptr = next - 1;
2287     *cptr = c;
2288    
2289     return TRUE;
2290     }
2291     #endif /* SUPPORT_UCP */
2292    
2293    
2294 nigel 93
2295 nigel 77 /*************************************************
2296 nigel 93 * Check if auto-possessifying is possible *
2297     *************************************************/
2298    
2299     /* This function is called for unlimited repeats of certain items, to see
2300     whether the next thing could possibly match the repeated item. If not, it makes
2301     sense to automatically possessify the repeated item.
2302    
2303     Arguments:
2304     op_code the repeated op code
2305     item data for this item, depends on the opcode
2306     utf8 TRUE in UTF-8 mode
2307     utf8_char used for utf8 character bytes, NULL if not relevant
2308     ptr next character in pattern
2309     options options bits
2310     cd contains pointers to tables etc.
2311    
2312     Returns: TRUE if possessifying is wanted
2313     */
2314    
2315     static BOOL
2316     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2317     const uschar *ptr, int options, compile_data *cd)
2318     {
2319     int next;
2320    
2321     /* Skip whitespace and comments in extended mode */
2322    
2323     if ((options & PCRE_EXTENDED) != 0)
2324     {
2325     for (;;)
2326     {
2327     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2328 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2329 nigel 93 {
2330     while (*(++ptr) != 0)
2331     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2332     }
2333     else break;
2334     }
2335     }
2336    
2337     /* If the next item is one that we can handle, get its value. A non-negative
2338     value is a character, a negative value is an escape value. */
2339    
2340 ph10 391 if (*ptr == CHAR_BACKSLASH)
2341 nigel 93 {
2342     int temperrorcode = 0;
2343     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2344     if (temperrorcode != 0) return FALSE;
2345     ptr++; /* Point after the escape sequence */
2346     }
2347    
2348     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2349     {
2350     #ifdef SUPPORT_UTF8
2351     if (utf8) { GETCHARINC(next, ptr); } else
2352     #endif
2353     next = *ptr++;
2354     }
2355    
2356     else return FALSE;
2357    
2358     /* Skip whitespace and comments in extended mode */
2359    
2360     if ((options & PCRE_EXTENDED) != 0)
2361     {
2362     for (;;)
2363     {
2364     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2365 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2366 nigel 93 {
2367     while (*(++ptr) != 0)
2368     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2369     }
2370     else break;
2371     }
2372     }
2373    
2374     /* If the next thing is itself optional, we have to give up. */
2375    
2376 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2377 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2378     return FALSE;
2379 nigel 93
2380     /* Now compare the next item with the previous opcode. If the previous is a
2381     positive single character match, "item" either contains the character or, if
2382     "item" is greater than 127 in utf8 mode, the character's bytes are in
2383     utf8_char. */
2384    
2385    
2386     /* Handle cases when the next item is a character. */
2387    
2388     if (next >= 0) switch(op_code)
2389     {
2390     case OP_CHAR:
2391     #ifdef SUPPORT_UTF8
2392     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2393 ph10 369 #else
2394     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2395 nigel 93 #endif
2396     return item != next;
2397    
2398     /* For CHARNC (caseless character) we must check the other case. If we have
2399     Unicode property support, we can use it to test the other case of
2400     high-valued characters. */
2401    
2402     case OP_CHARNC:
2403     #ifdef SUPPORT_UTF8
2404     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2405     #endif
2406     if (item == next) return FALSE;
2407     #ifdef SUPPORT_UTF8
2408     if (utf8)
2409     {
2410     unsigned int othercase;
2411     if (next < 128) othercase = cd->fcc[next]; else
2412     #ifdef SUPPORT_UCP
2413 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2414 nigel 93 #else
2415     othercase = NOTACHAR;
2416     #endif
2417     return (unsigned int)item != othercase;
2418     }
2419     else
2420     #endif /* SUPPORT_UTF8 */
2421     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2422    
2423     /* For OP_NOT, "item" must be a single-byte character. */
2424    
2425     case OP_NOT:
2426     if (item == next) return TRUE;
2427     if ((options & PCRE_CASELESS) == 0) return FALSE;
2428     #ifdef SUPPORT_UTF8
2429     if (utf8)
2430     {
2431     unsigned int othercase;
2432     if (next < 128) othercase = cd->fcc[next]; else
2433     #ifdef SUPPORT_UCP
2434 ph10 349 othercase = UCD_OTHERCASE(next);
2435 nigel 93 #else
2436     othercase = NOTACHAR;
2437     #endif
2438     return (unsigned int)item == othercase;
2439     }
2440     else
2441     #endif /* SUPPORT_UTF8 */
2442     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2443    
2444     case OP_DIGIT:
2445     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2446    
2447     case OP_NOT_DIGIT:
2448     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2449    
2450     case OP_WHITESPACE:
2451     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2452    
2453     case OP_NOT_WHITESPACE:
2454     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2455    
2456     case OP_WORDCHAR:
2457     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2458    
2459     case OP_NOT_WORDCHAR:
2460     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2461    
2462 ph10 180 case OP_HSPACE:
2463     case OP_NOT_HSPACE:
2464     switch(next)
2465     {
2466     case 0x09:
2467     case 0x20:
2468     case 0xa0:
2469     case 0x1680:
2470     case 0x180e:
2471     case 0x2000:
2472     case 0x2001:
2473     case 0x2002:
2474     case 0x2003:
2475     case 0x2004:
2476     case 0x2005:
2477     case 0x2006:
2478     case 0x2007:
2479     case 0x2008:
2480     case 0x2009:
2481     case 0x200A:
2482     case 0x202f:
2483     case 0x205f:
2484     case 0x3000:
2485     return op_code != OP_HSPACE;
2486     default:
2487     return op_code == OP_HSPACE;
2488     }
2489    
2490     case OP_VSPACE:
2491     case OP_NOT_VSPACE:
2492     switch(next)
2493     {
2494     case 0x0a:
2495     case 0x0b:
2496     case 0x0c:
2497     case 0x0d:
2498     case 0x85:
2499     case 0x2028:
2500     case 0x2029:
2501     return op_code != OP_VSPACE;
2502     default:
2503     return op_code == OP_VSPACE;
2504     }
2505    
2506 nigel 93 default:
2507     return FALSE;
2508     }
2509    
2510    
2511     /* Handle the case when the next item is \d, \s, etc. */
2512    
2513     switch(op_code)
2514     {
2515     case OP_CHAR:
2516     case OP_CHARNC:
2517     #ifdef SUPPORT_UTF8
2518     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2519     #endif
2520     switch(-next)
2521     {
2522     case ESC_d:
2523     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2524    
2525     case ESC_D:
2526     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2527    
2528     case ESC_s:
2529     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2530    
2531     case ESC_S:
2532     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2533    
2534     case ESC_w:
2535     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2536    
2537     case ESC_W:
2538     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2539 ph10 182
2540 ph10 180 case ESC_h:
2541     case ESC_H:
2542     switch(item)
2543     {
2544     case 0x09:
2545     case 0x20:
2546     case 0xa0:
2547     case 0x1680:
2548     case 0x180e:
2549     case 0x2000:
2550     case 0x2001:
2551     case 0x2002:
2552     case 0x2003:
2553     case 0x2004:
2554     case 0x2005:
2555     case 0x2006:
2556     case 0x2007:
2557     case 0x2008:
2558     case 0x2009:
2559     case 0x200A:
2560     case 0x202f:
2561     case 0x205f:
2562     case 0x3000:
2563     return -next != ESC_h;
2564     default:
2565     return -next == ESC_h;
2566 ph10 182 }
2567    
2568 ph10 180 case ESC_v:
2569     case ESC_V:
2570     switch(item)
2571     {
2572     case 0x0a:
2573     case 0x0b:
2574     case 0x0c:
2575     case 0x0d:
2576     case 0x85:
2577     case 0x2028:
2578     case 0x2029:
2579     return -next != ESC_v;
2580     default:
2581     return -next == ESC_v;
2582 ph10 182 }
2583 nigel 93
2584     default:
2585     return FALSE;
2586     }
2587    
2588     case OP_DIGIT:
2589 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2590     next == -ESC_h || next == -ESC_v;
2591 nigel 93
2592     case OP_NOT_DIGIT:
2593     return next == -ESC_d;
2594    
2595     case OP_WHITESPACE:
2596     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2597    
2598     case OP_NOT_WHITESPACE:
2599 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2600 nigel 93
2601 ph10 180 case OP_HSPACE:
2602     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2603    
2604     case OP_NOT_HSPACE:
2605     return next == -ESC_h;
2606 ph10 182
2607 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2608 ph10 182 case OP_VSPACE:
2609 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2610    
2611     case OP_NOT_VSPACE:
2612 ph10 182 return next == -ESC_v;
2613 ph10 180
2614 nigel 93 case OP_WORDCHAR:
2615 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2616 nigel 93
2617     case OP_NOT_WORDCHAR:
2618     return next == -ESC_w || next == -ESC_d;
2619 ph10 182
2620 nigel 93 default:
2621     return FALSE;
2622     }
2623    
2624     /* Control does not reach here */
2625     }
2626    
2627    
2628    
2629     /*************************************************
2630 nigel 77 * Compile one branch *
2631     *************************************************/
2632    
2633 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2634 nigel 77 changed during the branch, the pointer is used to change the external options
2635 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2636     to find out the amount of memory needed, as well as during the real compile
2637     phase. The value of lengthptr distinguishes the two phases.
2638 nigel 77
2639     Arguments:
2640     optionsptr pointer to the option bits
2641     codeptr points to the pointer to the current code point
2642     ptrptr points to the current pattern pointer
2643     errorcodeptr points to error code variable
2644     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2645     reqbyteptr set to the last literal character required, else < 0
2646     bcptr points to current branch chain
2647     cd contains pointers to tables etc.
2648 nigel 93 lengthptr NULL during the real compile phase
2649     points to length accumulator during pre-compile phase
2650 nigel 77
2651     Returns: TRUE on success
2652     FALSE, with *errorcodeptr set non-zero on error
2653     */
2654    
2655     static BOOL
2656 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2657     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2658     compile_data *cd, int *lengthptr)
2659 nigel 77 {
2660     int repeat_type, op_type;
2661     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2662     int bravalue = 0;
2663     int greedy_default, greedy_non_default;
2664     int firstbyte, reqbyte;
2665     int zeroreqbyte, zerofirstbyte;
2666     int req_caseopt, reqvary, tempreqvary;
2667     int options = *optionsptr;
2668     int after_manual_callout = 0;
2669 nigel 93 int length_prevgroup = 0;
2670 nigel 77 register int c;
2671     register uschar *code = *codeptr;
2672 nigel 93 uschar *last_code = code;
2673     uschar *orig_code = code;
2674 nigel 77 uschar *tempcode;
2675     BOOL inescq = FALSE;
2676     BOOL groupsetfirstbyte = FALSE;
2677     const uschar *ptr = *ptrptr;
2678     const uschar *tempptr;
2679     uschar *previous = NULL;
2680     uschar *previous_callout = NULL;
2681 nigel 93 uschar *save_hwm = NULL;
2682 nigel 77 uschar classbits[32];
2683    
2684     #ifdef SUPPORT_UTF8
2685     BOOL class_utf8;
2686     BOOL utf8 = (options & PCRE_UTF8) != 0;
2687     uschar *class_utf8data;
2688 ph10 300 uschar *class_utf8data_base;
2689 nigel 77 uschar utf8_char[6];
2690     #else
2691     BOOL utf8 = FALSE;
2692 nigel 93 uschar *utf8_char = NULL;
2693 nigel 77 #endif
2694    
2695 ph10 475 #ifdef PCRE_DEBUG
2696 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2697     #endif
2698    
2699 nigel 77 /* Set up the default and non-default settings for greediness */
2700    
2701     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2702     greedy_non_default = greedy_default ^ 1;
2703    
2704     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2705     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2706     matches a non-fixed char first char; reqbyte just remains unset if we never
2707     find one.
2708    
2709     When we hit a repeat whose minimum is zero, we may have to adjust these values
2710     to take the zero repeat into account. This is implemented by setting them to
2711     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2712     item types that can be repeated set these backoff variables appropriately. */
2713    
2714     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2715    
2716     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2717     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2718     value > 255. It is added into the firstbyte or reqbyte variables to record the
2719     case status of the value. This is used only for ASCII characters. */
2720    
2721     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2722    
2723     /* Switch on next character until the end of the branch */
2724    
2725     for (;; ptr++)
2726     {
2727     BOOL negate_class;
2728 ph10 286 BOOL should_flip_negation;
2729 nigel 77 BOOL possessive_quantifier;
2730     BOOL is_quantifier;
2731 nigel 93 BOOL is_recurse;
2732 ph10 180 BOOL reset_bracount;
2733 nigel 77 int class_charcount;
2734     int class_lastchar;
2735     int newoptions;
2736     int recno;
2737 ph10 172 int refsign;
2738 nigel 77 int skipbytes;
2739     int subreqbyte;
2740     int subfirstbyte;
2741 nigel 93 int terminator;
2742 nigel 77 int mclength;
2743     uschar mcbuffer[8];
2744    
2745 nigel 93 /* Get next byte in the pattern */
2746 nigel 77
2747     c = *ptr;
2748 ph10 345
2749 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2750     previous cycle of this loop. */
2751    
2752     if (lengthptr != NULL)
2753     {
2754 ph10 475 #ifdef PCRE_DEBUG
2755 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2756     #endif
2757     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2758     {
2759     *errorcodeptr = ERR52;
2760     goto FAILED;
2761     }
2762    
2763     /* There is at least one situation where code goes backwards: this is the
2764     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2765     the class is simply eliminated. However, it is created first, so we have to
2766     allow memory for it. Therefore, don't ever reduce the length at this point.
2767     */
2768    
2769     if (code < last_code) code = last_code;
2770 ph10 202
2771     /* Paranoid check for integer overflow */
2772    
2773     if (OFLOW_MAX - *lengthptr < code - last_code)
2774     {
2775     *errorcodeptr = ERR20;
2776     goto FAILED;
2777     }
2778    
2779 nigel 93 *lengthptr += code - last_code;
2780     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2781    
2782     /* If "previous" is set and it is not at the start of the work space, move
2783     it back to there, in order to avoid filling up the work space. Otherwise,
2784     if "previous" is NULL, reset the current code pointer to the start. */
2785    
2786     if (previous != NULL)
2787     {
2788     if (previous > orig_code)
2789     {
2790     memmove(orig_code, previous, code - previous);
2791     code -= previous - orig_code;
2792     previous = orig_code;
2793     }
2794     }
2795     else code = orig_code;
2796    
2797     /* Remember where this code item starts so we can pick up the length
2798     next time round. */
2799    
2800     last_code = code;
2801     }
2802    
2803     /* In the real compile phase, just check the workspace used by the forward
2804     reference list. */
2805    
2806     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2807     {
2808     *errorcodeptr = ERR52;
2809     goto FAILED;
2810     }
2811    
2812 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2813    
2814     if (inescq && c != 0)
2815     {
2816 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2817 nigel 77 {
2818     inescq = FALSE;
2819     ptr++;
2820     continue;
2821     }
2822     else
2823     {
2824     if (previous_callout != NULL)
2825     {
2826 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2827     complete_callout(previous_callout, ptr, cd);
2828 nigel 77 previous_callout = NULL;
2829     }
2830     if ((options & PCRE_AUTO_CALLOUT) != 0)
2831     {
2832     previous_callout = code;
2833     code = auto_callout(code, ptr, cd);
2834     }
2835     goto NORMAL_CHAR;
2836     }
2837     }
2838    
2839     /* Fill in length of a previous callout, except when the next thing is
2840     a quantifier. */
2841    
2842 ph10 392 is_quantifier =
2843 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2844     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2845 nigel 77
2846     if (!is_quantifier && previous_callout != NULL &&
2847     after_manual_callout-- <= 0)
2848     {
2849 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2850     complete_callout(previous_callout, ptr, cd);
2851 nigel 77 previous_callout = NULL;
2852     }
2853    
2854     /* In extended mode, skip white space and comments */
2855    
2856     if ((options & PCRE_EXTENDED) != 0)
2857     {
2858     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2859 ph10 391 if (c == CHAR_NUMBER_SIGN)
2860 nigel 77 {
2861 nigel 93 while (*(++ptr) != 0)
2862 nigel 91 {
2863 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2864 nigel 91 }
2865 nigel 93 if (*ptr != 0) continue;
2866    
2867 nigel 91 /* Else fall through to handle end of string */
2868     c = 0;
2869 nigel 77 }
2870     }
2871    
2872     /* No auto callout for quantifiers. */
2873    
2874     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2875     {
2876     previous_callout = code;
2877     code = auto_callout(code, ptr, cd);
2878     }
2879    
2880     switch(c)
2881     {
2882 nigel 93 /* ===================================================================*/
2883     case 0: /* The branch terminates at string end */
2884 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2885     case CHAR_RIGHT_PARENTHESIS:
2886 nigel 77 *firstbyteptr = firstbyte;
2887     *reqbyteptr = reqbyte;
2888     *codeptr = code;
2889     *ptrptr = ptr;
2890 nigel 93 if (lengthptr != NULL)
2891     {
2892 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2893     {
2894     *errorcodeptr = ERR20;
2895     goto FAILED;
2896     }
2897 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2898     DPRINTF((">> end branch\n"));
2899     }
2900 nigel 77 return TRUE;
2901    
2902 nigel 93
2903     /* ===================================================================*/
2904 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2905     the setting of any following char as a first character. */
2906    
2907 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2908 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2909     {
2910     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2911     }
2912     previous = NULL;
2913     *code++ = OP_CIRC;
2914     break;
2915    
2916 ph10 391 case CHAR_DOLLAR_SIGN:
2917 nigel 77 previous = NULL;
2918     *code++ = OP_DOLL;
2919     break;
2920    
2921     /* There can never be a first char if '.' is first, whatever happens about
2922     repeats. The value of reqbyte doesn't change either. */
2923    
2924 ph10 391 case CHAR_DOT:
2925 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2926     zerofirstbyte = firstbyte;
2927     zeroreqbyte = reqbyte;
2928     previous = code;
2929 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2930 nigel 77 break;
2931    
2932 nigel 93
2933     /* ===================================================================*/
2934 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2935     32-byte bitmap of the permitted characters, except in the special case
2936     where there is only one such character. For negated classes, we build the
2937     map as usual, then invert it at the end. However, we use a different opcode
2938     so that data characters > 255 can be handled correctly.
2939 nigel 77
2940     If the class contains characters outside the 0-255 range, a different
2941     opcode is compiled. It may optionally have a bit map for characters < 256,
2942     but those above are explicitly listed afterwards. A flag byte tells
2943     whether the bitmap is present, and whether this is a negated class or not.
2944 ph10 345
2945 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2946     default (Perl) mode, it is treated as a data character. */
2947 ph10 345
2948 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2949 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2950     {
2951     *errorcodeptr = ERR64;
2952 ph10 345 goto FAILED;
2953 ph10 336 }
2954 ph10 345 goto NORMAL_CHAR;
2955 nigel 77
2956 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2957 nigel 77 previous = code;
2958    
2959     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2960     they are encountered at the top level, so we'll do that too. */
2961    
2962 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2963 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2964 ph10 295 check_posix_syntax(ptr, &tempptr))
2965 nigel 77 {
2966 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2967 nigel 77 goto FAILED;
2968     }
2969    
2970 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2971 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2972 ph10 205 skip them too. This makes for compatibility with Perl. */
2973 ph10 208
2974 ph10 205 negate_class = FALSE;
2975     for (;;)
2976 nigel 77 {
2977     c = *(++ptr);
2978 ph10 391 if (c == CHAR_BACKSLASH)
2979 ph10 205 {
2980 ph10 392 if (ptr[1] == CHAR_E)
2981 ph10 391 ptr++;
2982 ph10 392 else if (strncmp((const char *)ptr+1,
2983     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2984 ph10 391 ptr += 3;
2985 ph10 392 else
2986 ph10 391 break;
2987 ph10 205 }
2988 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2989 ph10 205 negate_class = TRUE;
2990     else break;
2991 ph10 208 }
2992 ph10 345
2993     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2994     an initial ']' is taken as a data character -- the code below handles
2995 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2996     [^] must match any character, so generate OP_ALLANY. */
2997 ph10 345
2998 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2999 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3000 ph10 341 {
3001     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3002     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3003     zerofirstbyte = firstbyte;
3004     break;
3005 ph10 345 }
3006 nigel 77
3007 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3008     negation flag at the end, so that support for characters > 255 works
3009 ph10 264 correctly (they are all included in the class). */
3010    
3011     should_flip_negation = FALSE;
3012    
3013 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3014 nigel 93 of just a single character (as long as it's < 256). However, for higher
3015     valued UTF-8 characters, we don't yet do any optimization. */
3016 nigel 77
3017     class_charcount = 0;
3018     class_lastchar = -1;
3019    
3020 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3021     temporary bit of memory, in case the class contains only 1 character (less
3022     than 256), because in that case the compiled code doesn't use the bit map.
3023     */
3024    
3025     memset(classbits, 0, 32 * sizeof(uschar));
3026    
3027 nigel 77 #ifdef SUPPORT_UTF8
3028     class_utf8 = FALSE; /* No chars >= 256 */
3029 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3030 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3031 nigel 77 #endif
3032    
3033     /* Process characters until ] is reached. By writing this as a "do" it
3034 nigel 93 means that an initial ] is taken as a data character. At the start of the
3035     loop, c contains the first byte of the character. */
3036 nigel 77
3037 nigel 93 if (c != 0) do
3038 nigel 77 {
3039 nigel 93 const uschar *oldptr;
3040    
3041 nigel 77 #ifdef SUPPORT_UTF8
3042     if (utf8 && c > 127)
3043     { /* Braces are required because the */
3044     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3045     }
3046 ph10 309
3047 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3048 ph10 309 data and reset the pointer. This is so that very large classes that
3049 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3050 ph10 309 (which is on the stack). */
3051    
3052 ph10 300 if (lengthptr != NULL)
3053     {
3054     *lengthptr += class_utf8data - class_utf8data_base;
3055 ph10 309 class_utf8data = class_utf8data_base;
3056     }
3057    
3058 nigel 77 #endif
3059    
3060     /* Inside \Q...\E everything is literal except \E */
3061    
3062     if (inescq)
3063     {
3064 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3065 nigel 77 {
3066 nigel 93 inescq = FALSE; /* Reset literal state */
3067     ptr++; /* Skip the 'E' */
3068     continue; /* Carry on with next */
3069 nigel 77 }
3070 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3071 nigel 77 }
3072    
3073     /* Handle POSIX class names. Perl allows a negation extension of the
3074     form [:^name:]. A square bracket that doesn't match the syntax is
3075     treated as a literal. We also recognize the POSIX constructions
3076     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3077     5.6 and 5.8 do. */
3078    
3079 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3080 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3081 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3082 nigel 77 {
3083     BOOL local_negate = FALSE;
3084 nigel 87 int posix_class, taboffset, tabopt;
3085 nigel 77 register const uschar *cbits = cd->cbits;
3086 nigel 87 uschar pbits[32];
3087 nigel 77
3088 ph10 391 if (ptr[1] != CHAR_COLON)
3089 nigel 77 {
3090     *errorcodeptr = ERR31;
3091     goto FAILED;
3092     }
3093    
3094     ptr += 2;
3095 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3096 nigel 77 {
3097     local_negate = TRUE;
3098 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3099 nigel 77 ptr++;
3100     }
3101    
3102     posix_class = check_posix_name(ptr, tempptr - ptr);
3103     if (posix_class < 0)
3104     {
3105     *errorcodeptr = ERR30;
3106     goto FAILED;
3107     }
3108    
3109     /* If matching is caseless, upper and lower are converted to
3110     alpha. This relies on the fact that the class table starts with
3111     alpha, lower, upper as the first 3 entries. */
3112    
3113     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3114     posix_class = 0;
3115    
3116 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3117     because we may be adding and subtracting from it, and we don't want to
3118     subtract bits that may be in the main map already. At the end we or the
3119     result into the bit map that is being built. */
3120 nigel 77
3121     posix_class *= 3;
3122 nigel 87
3123     /* Copy in the first table (always present) */
3124    
3125     memcpy(pbits, cbits + posix_class_maps[posix_class],
3126     32 * sizeof(uschar));
3127    
3128     /* If there is a second table, add or remove it as required. */
3129    
3130     taboffset = posix_class_maps[posix_class + 1];
3131     tabopt = posix_class_maps[posix_class + 2];
3132    
3133     if (taboffset >= 0)
3134 nigel 77 {
3135 nigel 87 if (tabopt >= 0)
3136     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3137 nigel 77 else
3138 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3139 nigel 77 }
3140    
3141 nigel 87 /* Now see if we need to remove any special characters. An option
3142     value of 1 removes vertical space and 2 removes underscore. */
3143    
3144     if (tabopt < 0) tabopt = -tabopt;
3145     if (tabopt == 1) pbits[1] &= ~0x3c;
3146     else if (tabopt == 2) pbits[11] &= 0x7f;
3147    
3148     /* Add the POSIX table or its complement into the main table that is
3149     being built and we are done. */
3150    
3151     if (local_negate)
3152     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3153     else
3154     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3155    
3156 nigel 77 ptr = tempptr + 1;
3157     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3158     continue; /* End of POSIX syntax handling */
3159     }
3160    
3161     /* Backslash may introduce a single character, or it may introduce one
3162 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3163     case. Inside a class (and only there) it is treated as backspace.
3164     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3165 ph10 205 to 'or' into the one we are building. We assume they have more than one
3166 nigel 77 character in them, so set class_charcount bigger than one. */
3167    
3168 ph10 391 if (c == CHAR_BACKSLASH)
3169 nigel 77 {
3170 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3171     if (*errorcodeptr != 0) goto FAILED;
3172 nigel 77
3173 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3174     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3175     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3176 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3177     {
3178 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3179 nigel 77 {
3180     ptr += 2; /* avoid empty string */
3181     }
3182     else inescq = TRUE;
3183     continue;
3184     }
3185 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3186 nigel 77
3187     if (c < 0)
3188     {
3189     register const uschar *cbits = cd->cbits;
3190     class_charcount += 2; /* Greater than 1 is what matters */
3191 nigel 93
3192     /* Save time by not doing this in the pre-compile phase. */
3193    
3194     if (lengthptr == NULL) switch (-c)
3195 nigel 77 {
3196     case ESC_d:
3197     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3198     continue;
3199    
3200     case ESC_D:
3201 ph10 286 should_flip_negation = TRUE;
3202 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3203     continue;
3204    
3205     case ESC_w:
3206     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3207     continue;
3208    
3209     case ESC_W:
3210 ph10 286 should_flip_negation = TRUE;
3211 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3212     continue;
3213    
3214     case ESC_s:
3215     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3216     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3217     continue;
3218    
3219     case ESC_S:
3220 ph10 286 should_flip_negation = TRUE;
3221 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3222     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3223     continue;
3224    
3225 nigel 93 default: /* Not recognized; fall through */
3226     break; /* Need "default" setting to stop compiler warning. */
3227     }
3228    
3229     /* In the pre-compile phase, just do the recognition. */
3230    
3231     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3232     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3233 ph10 180
3234 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3235     they use extra memory. */
3236 ph10 180
3237 ph10 178 if (-c == ESC_h)
3238     {
3239     SETBIT(classbits, 0x09); /* VT */
3240     SETBIT(classbits, 0x20); /* SPACE */
3241 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3242 ph10 178 #ifdef SUPPORT_UTF8
3243     if (utf8)
3244 ph10 180 {
3245 ph10 178 class_utf8 = TRUE;
3246     *class_utf8data++ = XCL_SINGLE;
3247 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3248 ph10 178 *class_utf8data++ = XCL_SINGLE;
3249 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3250     *class_utf8data++ = XCL_RANGE;
3251     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3252     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3253 ph10 178 *class_utf8data++ = XCL_SINGLE;
3254 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3255 ph10 178 *class_utf8data++ = XCL_SINGLE;
3256 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3257 ph10 178 *class_utf8data++ = XCL_SINGLE;
3258 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3259     }
3260     #endif
3261     continue;
3262     }
3263 nigel 93
3264 ph10 178 if (-c == ESC_H)
3265     {
3266     for (c = 0; c < 32; c++)
3267     {
3268     int x = 0xff;
3269     switch (c)
3270 ph10 180 {
3271 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3272     case 0x20/8: x ^= 1 << (0x20%8); break;
3273     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3274     default: break;
3275     }
3276     classbits[c] |= x;
3277 ph10 180 }
3278    
3279 ph10 178 #ifdef SUPPORT_UTF8
3280     if (utf8)
3281 ph10 180 {
3282 ph10 178 class_utf8 = TRUE;
3283 ph10 180 *class_utf8data++ = XCL_RANGE;
3284     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3285     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3286     *class_utf8data++ = XCL_RANGE;
3287     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3288     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3289     *class_utf8data++ = XCL_RANGE;
3290     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3291     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3292     *class_utf8data++ = XCL_RANGE;
3293     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3294     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3295     *class_utf8data++ = XCL_RANGE;
3296     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3297     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3298     *class_utf8data++ = XCL_RANGE;
3299     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3300     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3301     *class_utf8data++ = XCL_RANGE;
3302     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3303     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3304     }
3305     #endif
3306     continue;
3307     }
3308 ph10 178
3309     if (-c == ESC_v)
3310     {
3311     SETBIT(classbits, 0x0a); /* LF */
3312     SETBIT(classbits, 0x0b); /* VT */
3313 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3314     SETBIT(classbits, 0x0d); /* CR */
3315     SETBIT(classbits, 0x85); /* NEL */
3316 ph10 178 #ifdef SUPPORT_UTF8
3317     if (utf8)
3318 ph10 180 {
3319 ph10 178 class_utf8 = TRUE;
3320 ph10 180 *class_utf8data++ = XCL_RANGE;
3321     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3322     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3323     }
3324     #endif
3325     continue;
3326     }
3327 ph10 178
3328     if (-c == ESC_V)
3329     {
3330     for (c = 0; c < 32; c++)
3331     {
3332     int x = 0xff;
3333     switch (c)
3334 ph10 180 {
3335 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3336     x ^= 1 << (0x0b%8);
3337     x ^= 1 << (0x0c%8);
3338 ph10 180 x ^= 1 << (0x0d%8);
3339 ph10 178 break;
3340     case 0x85/8: x ^= 1 << (0x85%8); break;
3341     default: break;
3342     }
3343     classbits[c] |= x;
3344 ph10 180 }
3345    
3346 ph10 178 #ifdef SUPPORT_UTF8
3347     if (utf8)
3348 ph10 180 {
3349 ph10 178 class_utf8 = TRUE;
3350 ph10 180 *class_utf8data++ = XCL_RANGE;
3351     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3352     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3353     *class_utf8data++ = XCL_RANGE;
3354     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3355     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3356     }
3357     #endif
3358     continue;
3359     }
3360 ph10 178
3361 nigel 93 /* We need to deal with \P and \p in both phases. */
3362    
3363 nigel 77 #ifdef SUPPORT_UCP
3364 nigel 93 if (-c == ESC_p || -c == ESC_P)
3365     {
3366     BOOL negated;
3367     int pdata;
3368     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3369     if (ptype < 0) goto FAILED;
3370     class_utf8 = TRUE;
3371     *class_utf8data++ = ((-c == ESC_p) != negated)?
3372     XCL_PROP : XCL_NOTPROP;
3373     *class_utf8data++ = ptype;
3374     *class_utf8data++ = pdata;
3375     class_charcount -= 2; /* Not a < 256 character */
3376 nigel 77 continue;
3377 nigel 93 }
3378 nigel 77 #endif
3379 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3380     strict mode. By default, for compatibility with Perl, they are
3381     treated as literals. */
3382 nigel 77
3383 nigel 93 if ((options & PCRE_EXTRA) != 0)
3384     {
3385     *errorcodeptr = ERR7;
3386     goto FAILED;
3387     }
3388 nigel 77
3389 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3390     c = *ptr; /* Get the final character and fall through */
3391 nigel 77 }
3392    
3393     /* Fall through if we have a single character (c >= 0). This may be
3394 nigel 93 greater than 256 in UTF-8 mode. */
3395 nigel 77
3396     } /* End of backslash handling */
3397    
3398     /* A single character may be followed by '-' to form a range. However,
3399     Perl does not permit ']' to be the end of the range. A '-' character
3400 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3401     entirely. The code for handling \Q and \E is messy. */
3402 nigel 77
3403 nigel 93 CHECK_RANGE:
3404 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3405 nigel 77 {
3406 nigel 93 inescq = FALSE;
3407     ptr += 2;
3408     }
3409    
3410     oldptr = ptr;
3411 ph10 231
3412 ph10 230 /* Remember \r or \n */
3413 ph10 231
3414 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3415 ph10 231
3416 ph10 230 /* Check for range */
3417 nigel 93
3418 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3419 nigel 93 {
3420 nigel 77 int d;
3421     ptr += 2;
3422 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3423 nigel 77
3424 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3425     mode. */
3426    
3427 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3428 nigel 93 {
3429     ptr += 2;
3430 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3431 ph10 391 { ptr += 2; continue; }
3432 nigel 93 inescq = TRUE;
3433     break;
3434     }
3435    
3436 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3437 nigel 93 {
3438     ptr = oldptr;
3439     goto LONE_SINGLE_CHARACTER;
3440     }
3441    
3442 nigel 77 #ifdef SUPPORT_UTF8
3443     if (utf8)
3444     { /* Braces are required because the */
3445     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3446     }
3447     else
3448     #endif
3449     d = *ptr; /* Not UTF-8 mode */
3450    
3451     /* The second part of a range can be a single-character escape, but
3452     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3453     in such circumstances. */
3454    
3455 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3456 nigel 77 {
3457 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3458     if (*errorcodeptr != 0) goto FAILED;
3459 nigel 77
3460 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3461 nigel 93 special means the '-' was literal */
3462 nigel 77
3463     if (d < 0)
3464     {
3465 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3466     else if (d == -ESC_X) d = CHAR_X;
3467     else if (d == -ESC_R) d = CHAR_R; else
3468 nigel 77 {
3469 nigel 93 ptr = oldptr;
3470 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3471     }
3472     }
3473     }
3474    
3475 nigel 93 /* Check that the two values are in the correct order. Optimize
3476     one-character ranges */
3477 nigel 77
3478 nigel 93 if (d < c)
3479     {
3480     *errorcodeptr = ERR8;
3481     goto FAILED;
3482     }
3483    
3484 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3485    
3486 ph10 230 /* Remember \r or \n */
3487 ph10 231
3488 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3489 ph10 231
3490 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3491     matching, we have to use an XCLASS with extra data items. Caseless
3492     matching for characters > 127 is available only if UCP support is
3493     available. */
3494    
3495     #ifdef SUPPORT_UTF8
3496     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3497     {
3498     class_utf8 = TRUE;
3499    
3500     /* With UCP support, we can find the other case equivalents of
3501     the relevant characters. There may be several ranges. Optimize how
3502     they fit with the basic range. */
3503    
3504     #ifdef SUPPORT_UCP
3505     if ((options & PCRE_CASELESS) != 0)
3506     {
3507 nigel 93 unsigned int occ, ocd;
3508     unsigned int cc = c;
3509     unsigned int origd = d;
3510 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3511     {
3512 ph10 180 if (occ >= (unsigned int)c &&
3513     ocd <= (unsigned int)d)
3514 ph10 176 continue; /* Skip embedded ranges */
3515 nigel 77
3516 ph10 180 if (occ < (unsigned int)c &&
3517 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3518 nigel 77 { /* if there is overlap, */
3519     c = occ; /* noting that if occ < c */
3520     continue; /* we can't have ocd > d */
3521     } /* because a subrange is */
3522 ph10 180 if (ocd > (unsigned int)d &&
3523 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3524 nigel 77 { /* the basic range. */
3525     d = ocd;
3526     continue;
3527     }
3528    
3529     if (occ == ocd)
3530     {
3531     *class_utf8data++ = XCL_SINGLE;
3532     }
3533     else
3534     {
3535     *class_utf8data++ = XCL_RANGE;
3536     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3537     }
3538     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3539     }
3540     }
3541     #endif /* SUPPORT_UCP */
3542    
3543     /* Now record the original range, possibly modified for UCP caseless
3544     overlapping ranges. */
3545    
3546     *class_utf8data++ = XCL_RANGE;
3547     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3548     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3549    
3550     /* With UCP support, we are done. Without UCP support, there is no
3551     caseless matching for UTF-8 characters > 127; we can use the bit map
3552     for the smaller ones. */
3553    
3554     #ifdef SUPPORT_UCP
3555     continue; /* With next character in the class */
3556     #else
3557     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3558    
3559     /* Adjust upper limit and fall through to set up the map */
3560    
3561     d = 127;
3562    
3563     #endif /* SUPPORT_UCP */
3564     }
3565     #endif /* SUPPORT_UTF8 */
3566    
3567     /* We use the bit map for all cases when not in UTF-8 mode; else
3568     ranges that lie entirely within 0-127 when there is UCP support; else
3569     for partial ranges without UCP support. */
3570    
3571 nigel 93 class_charcount += d - c + 1;
3572     class_lastchar = d;
3573    
3574     /* We can save a bit of time by skipping this in the pre-compile. */
3575    
3576     if (lengthptr == NULL) for (; c <= d; c++)
3577 nigel 77 {
3578     classbits[c/8] |= (1 << (c&7));
3579     if ((options & PCRE_CASELESS) != 0)
3580     {
3581     int uc = cd->fcc[c]; /* flip case */
3582     classbits[uc/8] |= (1 << (uc&7));
3583     }
3584     }
3585    
3586     continue; /* Go get the next char in the class */
3587     }
3588    
3589     /* Handle a lone single character - we can get here for a normal
3590     non-escape char, or after \ that introduces a single character or for an
3591     apparent range that isn't. */
3592    
3593     LONE_SINGLE_CHARACTER:
3594 ph10 231
3595 nigel 77 /* Handle a character that cannot go in the bit map */
3596    
3597     #ifdef SUPPORT_UTF8
3598     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3599     {
3600     class_utf8 = TRUE;
3601     *class_utf8data++ = XCL_SINGLE;
3602     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3603    
3604     #ifdef SUPPORT_UCP
3605     if ((options & PCRE_CASELESS) != 0)
3606     {
3607 nigel 93 unsigned int othercase;
3608 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3609 nigel 77 {
3610     *class_utf8data++ = XCL_SINGLE;
3611     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3612     }
3613     }
3614     #endif /* SUPPORT_UCP */
3615    
3616     }
3617     else
3618     #endif /* SUPPORT_UTF8 */
3619    
3620     /* Handle a single-byte character */
3621     {
3622     classbits[c/8] |= (1 << (c&7));
3623     if ((options & PCRE_CASELESS) != 0)
3624     {
3625     c = cd->fcc[c]; /* flip case */
3626     classbits[c/8] |= (1 << (c&7));
3627     }
3628     class_charcount++;
3629     class_lastchar = c;
3630     }
3631     }
3632    
3633 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3634 nigel 77
3635 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3636 nigel 77
3637 nigel 93 if (c == 0) /* Missing terminating ']' */
3638     {
3639     *errorcodeptr = ERR6;
3640     goto FAILED;
3641     }
3642 ph10 231
3643    
3644 ph10 230 /* This code has been disabled because it would mean that \s counts as
3645     an explicit \r or \n reference, and that's not really what is wanted. Now
3646     we set the flag only if there is a literal "\r" or "\n" in the class. */
3647 ph10 227
3648 ph10 230 #if 0
3649 ph10 226 /* Remember whether \r or \n are in this class */
3650 ph10 227
3651 ph10 226 if (negate_class)
3652     {
3653 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3654 ph10 226 }
3655     else
3656     {
3657 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3658 ph10 227 }
3659 ph10 230 #endif
3660 ph10 227
3661 ph10 231
3662 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3663 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3664     use of \p or \P, in other words, no use of any XCLASS features, we can
3665     optimize.
3666    
3667 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3668     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3669     operate on single-bytes only. This is an historical hangover. Maybe one day
3670     we can tidy these opcodes to handle multi-byte characters.
3671 nigel 77
3672     The optimization throws away the bit map. We turn the item into a
3673     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3674     that OP_NOT does not support multibyte characters. In the positive case, it
3675     can cause firstbyte to be set. Otherwise, there can be no first char if
3676     this item is first, whatever repeat count may follow. In the case of
3677     reqbyte, save the previous value for reinstating. */
3678    
3679     #ifdef SUPPORT_UTF8
3680 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3681 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3682 nigel 77 #else
3683     if (class_charcount == 1)
3684     #endif
3685     {
3686     zeroreqbyte = reqbyte;
3687    
3688     /* The OP_NOT opcode works on one-byte characters only. */
3689    
3690     if (negate_class)
3691     {
3692     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3693     zerofirstbyte = firstbyte;
3694     *code++ = OP_NOT;
3695     *code++ = class_lastchar;
3696     break;
3697     }
3698    
3699     /* For a single, positive character, get the value into mcbuffer, and
3700     then we can handle this with the normal one-character code. */
3701    
3702     #ifdef SUPPORT_UTF8
3703     if (utf8 && class_lastchar > 127)
3704     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3705     else
3706     #endif
3707     {
3708     mcbuffer[0] = class_lastchar;
3709     mclength = 1;
3710     }
3711     goto ONE_CHAR;
3712     } /* End of 1-char optimization */
3713    
3714     /* The general case - not the one-char optimization. If this is the first
3715     thing in the branch, there can be no first char setting, whatever the
3716     repeat count. Any reqbyte setting must remain unchanged after any kind of
3717     repeat. */
3718    
3719     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3720     zerofirstbyte = firstbyte;
3721     zeroreqbyte = reqbyte;
3722    
3723     /* If there are characters with values > 255, we have to compile an
3724 ph10 286 extended class, with its own opcode, unless there was a negated special
3725     such as \S in the class, because in that case all characters > 255 are in
3726     the class, so any that were explicitly given as well can be ignored. If
3727 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3728     characters < 256, we can omit the bitmap in the actual compiled code. */
3729 nigel 77
3730     #ifdef SUPPORT_UTF8
3731 ph10 264 if (class_utf8 && !should_flip_negation)
3732 nigel 77 {
3733     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3734     *code++ = OP_XCLASS;
3735     code += LINK_SIZE;
3736     *code = negate_class? XCL_NOT : 0;
3737    
3738 nigel 93 /* If the map is required, move up the extra data to make room for it;
3739     otherwise just move the code pointer to the end of the extra data. */
3740 nigel 77
3741     if (class_charcount > 0)
3742     {
3743     *code++ |= XCL_MAP;
3744 nigel 93 memmove(code + 32, code, class_utf8data - code);
3745 nigel 77 memcpy(code, classbits, 32);
3746 nigel 93 code = class_utf8data + 32;
3747 nigel 77 }
3748 nigel 93 else code = class_utf8data;
3749 nigel 77
3750     /* Now fill in the complete length of the item */
3751    
3752     PUT(previous, 1, code - previous);
3753     break; /* End of class handling */
3754     }
3755     #endif
3756    
3757 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3758     OP_NCLASS, depending on whether the whole class was negated and whether
3759     there were negative specials such as \S in the class. Then copy the 32-byte
3760 ph10 264 map into the code vector, negating it if necessary. */
3761 ph10 286
3762 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3763 nigel 77 if (negate_class)
3764     {
3765 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3766     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3767 nigel 77 }
3768     else
3769     {
3770     memcpy(code, classbits, 32);
3771     }
3772     code += 32;
3773     break;
3774    
3775 nigel 93
3776     /* ===================================================================*/
3777 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3778     has been tested above. */
3779    
3780 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3781 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3782     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3783     if (*errorcodeptr != 0) goto FAILED;
3784     goto REPEAT;
3785    
3786 ph10 391 case CHAR_ASTERISK:
3787 nigel 77 repeat_min = 0;
3788     repeat_max = -1;
3789     goto REPEAT;
3790    
3791 ph10 391 case CHAR_PLUS:
3792 nigel 77 repeat_min = 1;
3793     repeat_max = -1;
3794     goto REPEAT;
3795    
3796 ph10 391 case CHAR_QUESTION_MARK:
3797 nigel 77 repeat_min = 0;
3798     repeat_max = 1;
3799    
3800     REPEAT:
3801     if (previous == NULL)
3802     {
3803     *errorcodeptr = ERR9;
3804     goto FAILED;
3805     }
3806    
3807     if (repeat_min == 0)
3808     {
3809     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3810     reqbyte = zeroreqbyte; /* Ditto */
3811     }
3812    
3813     /* Remember whether this is a variable length repeat */
3814    
3815     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3816    
3817     op_type = 0; /* Default single-char op codes */
3818     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3819    
3820     /* Save start of previous item, in case we have to move it up to make space
3821     for an inserted OP_ONCE for the additional '+' extension. */
3822    
3823     tempcode = previous;
3824    
3825     /* If the next character is '+', we have a possessive quantifier. This
3826     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3827     If the next character is '?' this is a minimizing repeat, by default,
3828     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3829     repeat type to the non-default. */
3830    
3831 ph10 391 if (ptr[1] == CHAR_PLUS)
3832 nigel 77 {
3833     repeat_type = 0; /* Force greedy */
3834     possessive_quantifier = TRUE;
3835     ptr++;
3836     }
3837 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3838 nigel 77 {
3839     repeat_type = greedy_non_default;
3840     ptr++;
3841     }
3842     else repeat_type = greedy_default;
3843    
3844     /* If previous was a character match, abolish the item and generate a
3845     repeat item instead. If a char item has a minimum of more than one, ensure
3846     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3847     the first thing in a branch because the x will have gone into firstbyte
3848     instead. */
3849    
3850     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3851     {
3852     /* Deal with UTF-8 characters that take up more than one byte. It's
3853     easier to write this out separately than try to macrify it. Use c to
3854     hold the length of the character in bytes, plus 0x80 to flag that it's a
3855     length rather than a small character. */
3856    
3857     #ifdef SUPPORT_UTF8
3858     if (utf8 && (code[-1] & 0x80) != 0)
3859     {
3860     uschar *lastchar = code - 1;
3861     while((*lastchar & 0xc0) == 0x80) lastchar--;
3862     c = code - lastchar; /* Length of UTF-8 character */
3863     memcpy(utf8_char, lastchar, c); /* Save the char */
3864     c |= 0x80; /* Flag c as a length */
3865     }
3866     else
3867     #endif
3868    
3869     /* Handle the case of a single byte - either with no UTF8 support, or
3870     with UTF-8 disabled, or for a UTF-8 character < 128. */
3871    
3872     {
3873     c = code[-1];
3874     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3875     }
3876    
3877 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3878     the line is something that cannot possibly match this character. If so,
3879     automatically possessifying this item gains some performance in the case
3880     where the match fails. */
3881    
3882     if (!possessive_quantifier &&
3883     repeat_max < 0 &&
3884     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3885     options, cd))
3886     {
3887     repeat_type = 0; /* Force greedy */
3888     possessive_quantifier = TRUE;
3889     }
3890    
3891 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3892     }
3893    
3894     /* If previous was a single negated character ([^a] or similar), we use
3895     one of the special opcodes, replacing it. The code is shared with single-
3896     character repeats by setting op_type to add a suitable offset into
3897 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3898     currently used only for single-byte chars. */
3899 nigel 77
3900     else if (*previous == OP_NOT)
3901     {
3902     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3903     c = previous[1];
3904 nigel 93 if (!possessive_quantifier &&
3905     repeat_max < 0 &&
3906     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3907     {
3908     repeat_type = 0; /* Force greedy */
3909     possessive_quantifier = TRUE;
3910     }
3911 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3912     }
3913    
3914     /* If previous was a character type match (\d or similar), abolish it and
3915     create a suitable repeat item. The code is shared with single-character
3916     repeats by setting op_type to add a suitable offset into repeat_type. Note
3917     that the Unicode property types will be present only when SUPPORT_UCP is
3918     defined, but we don't wrap the little bits of code here because it just
3919     makes it horribly messy. */
3920    
3921     else if (*previous < OP_EODN)
3922     {
3923     uschar *oldcode;
3924 nigel 87 int prop_type, prop_value;
3925 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3926     c = *previous;
3927    
3928 nigel 93 if (!possessive_quantifier &&
3929     repeat_max < 0 &&
3930     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3931     {
3932     repeat_type = 0; /* Force greedy */
3933     possessive_quantifier = TRUE;
3934     }
3935    
3936 nigel 77 OUTPUT_SINGLE_REPEAT:
3937 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3938     {
3939     prop_type = previous[1];
3940     prop_value = previous[2];
3941     }
3942     else prop_type = prop_value = -1;
3943 nigel 77
3944     oldcode = code;
3945     code = previous; /* Usually overwrite previous item */
3946    
3947     /* If the maximum is zero then the minimum must also be zero; Perl allows
3948