/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 500 - (hide annotations) (download)
Sat Mar 6 19:00:29 2010 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 222249 byte(s)
Fix bugs with \K in atomic groups, subroutines, and assertions.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte-array bitmap a (byte b/8, bit b%8 within that byte). Both arguments and
the whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) or SETBIT(base + off, c) expand with the intended
precedence. Note that b is still evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 nigel 93
96 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97     are simple data values; negative values are for special things like \d and so
98     on. Zero means further processing is needed (for things like \x), or the escape
99     is invalid. */
100    
101 ph10 391 #ifndef EBCDIC
102    
103     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104 ph10 392 in UTF-8 mode. */
105 ph10 391
106 ph10 392 static const short int escapes[] = {
107 ph10 391 0, 0,
108     0, 0,
109 ph10 392 0, 0,
110     0, 0,
111     0, 0,
112 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
113 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
114 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
115 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
116     -ESC_B, -ESC_C,
117     -ESC_D, -ESC_E,
118     0, -ESC_G,
119     -ESC_H, 0,
120     0, -ESC_K,
121 ph10 391 0, 0,
122 ph10 392 0, 0,
123 ph10 391 -ESC_P, -ESC_Q,
124     -ESC_R, -ESC_S,
125 ph10 392 0, 0,
126     -ESC_V, -ESC_W,
127     -ESC_X, 0,
128     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
129 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
130 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
131 ph10 391 CHAR_GRAVE_ACCENT, 7,
132 ph10 392 -ESC_b, 0,
133     -ESC_d, ESC_e,
134 ph10 391 ESC_f, 0,
135     -ESC_h, 0,
136 ph10 392 0, -ESC_k,
137 ph10 391 0, 0,
138     ESC_n, 0,
139 ph10 392 -ESC_p, 0,
140     ESC_r, -ESC_s,
141 ph10 391 ESC_tee, 0,
142 ph10 392 -ESC_v, -ESC_w,
143     0, 0,
144 ph10 391 -ESC_z
145 nigel 77 };
146    
147 ph10 392 #else
148 ph10 391
149     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150    
151 nigel 77 static const short int escapes[] = {
152     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
153     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
154     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
155     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
156     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
157     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
158     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
159     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
160 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
161 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
162 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
163 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
164 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
165     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
166     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
167     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
168 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
169 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
170 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
171 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
172 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
173     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
174     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
175     };
176     #endif
177    
178    
179 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180     searched linearly. Put all the names into a single string, in order to reduce
181 ph10 392 the number of relocations when a shared library is dynamically linked. The
182     string is built from string macros so that it works in UTF-8 mode on EBCDIC
183 ph10 391 platforms. */
184 ph10 210
185     typedef struct verbitem {
186     int len;
187     int op;
188 ph10 211 } verbitem;
189 ph10 210
190 ph10 240 static const char verbnames[] =
191 ph10 391 STRING_ACCEPT0
192     STRING_COMMIT0
193     STRING_F0
194     STRING_FAIL0
195     STRING_PRUNE0
196     STRING_SKIP0
197     STRING_THEN;
198 ph10 240
199 ph10 327 static const verbitem verbs[] = {
200 ph10 240 { 6, OP_ACCEPT },
201     { 6, OP_COMMIT },
202     { 1, OP_FAIL },
203     { 4, OP_FAIL },
204     { 5, OP_PRUNE },
205     { 4, OP_SKIP },
206     { 4, OP_THEN }
207 ph10 210 };
208    
209 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210 ph10 210
211    
212 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
213     now all in a single string, to reduce the number of relocations when a shared
214 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
215     length entry. The first three must be alpha, lower, upper, as this is assumed
216     for handling case independence. */
217 nigel 77
218 ph10 240 static const char posix_names[] =
219 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222     STRING_word0 STRING_xdigit;
223 nigel 77
224     static const uschar posix_name_lengths[] = {
225     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
226    
227 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
228     base map, with an optional addition or removal of another map. Then, for some
229     classes, there is some additional tweaking: for [:blank:] the vertical space
230     characters are removed, and for [:alpha:] and [:alnum:] the underscore
231     character is removed. The triples in the table consist of the base map offset,
232     second map offset or -1 if no second map, and a non-negative value for map
233     addition or a negative value for map subtraction (if there are two maps). The
234     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
235     remove vertical space characters, 2 => remove underscore. */
236 nigel 77
237     static const int posix_class_maps[] = {
238 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
239     cbit_lower, -1, 0, /* lower */
240     cbit_upper, -1, 0, /* upper */
241     cbit_word, -1, 2, /* alnum - word without underscore */
242     cbit_print, cbit_cntrl, 0, /* ascii */
243     cbit_space, -1, 1, /* blank - a GNU extension */
244     cbit_cntrl, -1, 0, /* cntrl */
245     cbit_digit, -1, 0, /* digit */
246     cbit_graph, -1, 0, /* graph */
247     cbit_print, -1, 0, /* print */
248     cbit_punct, -1, 0, /* punct */
249     cbit_space, -1, 0, /* space */
250     cbit_word, -1, 0, /* word - a Perl extension */
251     cbit_xdigit,-1, 0 /* xdigit */
252 nigel 77 };
253    
254    
255 nigel 93 #define STRING(a) # a
256     #define XSTRING(s) STRING(s)
257    
258 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
259 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
260     they are documented. Always add a new error instead. Messages marked DEAD below
261 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
262     the number of relocations needed when a shared library is loaded dynamically,
263     it is now one long string. We cannot use a table of offsets, because the
264     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265     simply count through to the one we want - this isn't a performance issue
266 ph10 499 because these strings are used only when there is a compilation error.
267 nigel 77
268 ph10 499 Each substring ends with \0 to insert a null character. This includes the final
269     substring, so that the whole string ends with \0\0, which can be detected when
270     counting through. */
271    
272 ph10 240 static const char error_texts[] =
273     "no error\0"
274     "\\ at end of pattern\0"
275     "\\c at end of pattern\0"
276     "unrecognized character follows \\\0"
277     "numbers out of order in {} quantifier\0"
278 nigel 77 /* 5 */
279 ph10 240 "number too big in {} quantifier\0"
280     "missing terminating ] for character class\0"
281     "invalid escape sequence in character class\0"
282     "range out of order in character class\0"
283     "nothing to repeat\0"
284 nigel 77 /* 10 */
285 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
286     "internal error: unexpected repeat\0"
287 ph10 269 "unrecognized character after (? or (?-\0"
288 ph10 240 "POSIX named classes are supported only within a class\0"
289     "missing )\0"
290 nigel 77 /* 15 */
291 ph10 240 "reference to non-existent subpattern\0"
292     "erroffset passed as NULL\0"
293     "unknown option bit(s) set\0"
294     "missing ) after comment\0"
295     "parentheses nested too deeply\0" /** DEAD **/
296 nigel 77 /* 20 */
297 ph10 240 "regular expression is too large\0"
298     "failed to get memory\0"
299     "unmatched parentheses\0"
300     "internal error: code overflow\0"
301     "unrecognized character after (?<\0"
302 nigel 77 /* 25 */
303 ph10 240 "lookbehind assertion is not fixed length\0"
304     "malformed number or name after (?(\0"
305     "conditional group contains more than two branches\0"
306     "assertion expected after (?(\0"
307     "(?R or (?[+-]digits must be followed by )\0"
308 nigel 77 /* 30 */
309 ph10 240 "unknown POSIX class name\0"
310     "POSIX collating elements are not supported\0"
311     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
312     "spare error\0" /** DEAD **/
313     "character value in \\x{...} sequence is too large\0"
314 nigel 77 /* 35 */
315 ph10 240 "invalid condition (?(0)\0"
316     "\\C not allowed in lookbehind assertion\0"
317     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
318     "number after (?C is > 255\0"
319     "closing ) for (?C expected\0"
320 nigel 77 /* 40 */
321 ph10 240 "recursive call could loop indefinitely\0"
322     "unrecognized character after (?P\0"
323     "syntax error in subpattern name (missing terminator)\0"
324     "two named subpatterns have the same name\0"
325     "invalid UTF-8 string\0"
326 nigel 77 /* 45 */
327 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
328     "malformed \\P or \\p sequence\0"
329     "unknown property name after \\P or \\p\0"
330     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
331     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
332 nigel 91 /* 50 */
333 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
334     "octal value is greater than \\377 (not in UTF-8 mode)\0"
335     "internal error: overran compiling workspace\0"
336     "internal error: previously-checked referenced subpattern not found\0"
337     "DEFINE group contains more than one branch\0"
338 nigel 93 /* 55 */
339 ph10 240 "repeating a DEFINE group is not allowed\0"
340     "inconsistent NEWLINE options\0"
341 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
342     "a numbered reference must not be zero\0"
343 ph10 240 "(*VERB) with an argument is not supported\0"
344 ph10 211 /* 60 */
345 ph10 240 "(*VERB) not recognized\0"
346 ph10 268 "number is too big\0"
347 ph10 272 "subpattern name expected\0"
348 ph10 336 "digit expected after (?+\0"
349 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
350     /* 65 */
351 ph10 499 "different names for subpatterns of the same number are not allowed\0";
352 nigel 77
353     /* Table to identify digits and hex digits. This is used when compiling
354     patterns. Note that the tables in chartables are dependent on the locale, and
355     may mark arbitrary characters as digits - but the PCRE compiling code expects
356     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
357     a private table here. It costs 256 bytes, but it is a lot faster than doing
358     character value tests (at least in some simple cases I timed), and in some
359     applications one wants PCRE to compile efficiently as well as match
360     efficiently.
361    
362     For convenience, we use the same bit definitions as in chartables:
363    
364     0x04 decimal digit
365     0x08 hexadecimal digit
366    
367     Then we can use ctype_digit and ctype_xdigit in the code. */
368    
369 ph10 392 #ifndef EBCDIC
370 ph10 391
371 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
372 ph10 391 UTF-8 mode. */
373    
374 nigel 77 static const unsigned char digitab[] =
375     {
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
382     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
383     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
384     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
388     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
406     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
408    
409 ph10 392 #else
410 ph10 391
411     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
412    
413 nigel 77 static const unsigned char digitab[] =
414     {
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
423     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
426 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
427 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
431     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
439     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
444     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
445     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
446     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
447    
448     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
449     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
450     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
451     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
453     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
457     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
458     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
460 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
461 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
462     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
465     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
466     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
467     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
468     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
469     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
471     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
472     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
473     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
474     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
475     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
476     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
477     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
478     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
479     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
480     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
481     #endif
482    
483    
484     /* Definition to allow mutual recursion */
485    
486     static BOOL
487 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
488 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
489 nigel 77
490    
491    
492     /*************************************************
493 ph10 240 * Find an error text *
494     *************************************************/
495    
496 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
497     some of the text is of unknown length, we can't use a table of offsets.
498     Instead, just count through the strings. This is not a performance issue
499 ph10 240 because it happens only when there has been a compilation error.
500    
501     Argument: the error number
502     Returns: pointer to the error string
503     */
504    
505     static const char *
506     find_error_text(int n)
507     {
508     const char *s = error_texts;
509 ph10 499 for (; n > 0; n--)
510     {
511     while (*s++ != 0) {};
512     if (*s == 0) return "Error text not found (please report)";
513     }
514 ph10 240 return s;
515     }
516    
517    
518     /*************************************************
519 nigel 77 * Handle escapes *
520     *************************************************/
521    
522     /* This function is called when a \ has been encountered. It either returns a
523     positive value for a simple escape such as \n, or a negative value which
524 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
525     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
526     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
527     ptr is pointing at the \. On exit, it is on the final character of the escape
528     sequence.
529 nigel 77
530     Arguments:
531     ptrptr points to the pattern position pointer
532     errorcodeptr points to the errorcode variable
533     bracount number of previous extracting brackets
534     options the options bits
535     isclass TRUE if inside a character class
536    
537     Returns: zero or positive => a data character
538     negative => a special escape sequence
539 ph10 213 on error, errorcodeptr is set
540 nigel 77 */
541    
542     static int
543     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
544     int options, BOOL isclass)
545     {
546 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
547     const uschar *ptr = *ptrptr + 1;
548 nigel 77 int c, i;
549    
550 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
551     ptr--; /* Set pointer back to the last byte */
552    
553 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
554    
555     if (c == 0) *errorcodeptr = ERR1;
556    
557 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
558     in a table. A non-zero result is something that can be returned immediately.
559 nigel 77 Otherwise further processing may be required. */
560    
561 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
562     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
563     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
564 nigel 77
565 ph10 97 #else /* EBCDIC coding */
566 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
567 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
568     #endif
569    
570     /* Escapes that need further processing, or are illegal. */
571    
572     else
573     {
574     const uschar *oldptr;
575 nigel 93 BOOL braced, negated;
576    
577 nigel 77 switch (c)
578     {
579     /* A number of Perl escapes are not handled by PCRE. We give an explicit
580     error. */
581    
582 ph10 391 case CHAR_l:
583     case CHAR_L:
584     case CHAR_N:
585     case CHAR_u:
586     case CHAR_U:
587 nigel 77 *errorcodeptr = ERR37;
588     break;
589    
590 ph10 333 /* \g must be followed by one of a number of specific things:
591 ph10 345
592 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
593     backreference. If negative, it is a relative backreference. This is a Perl
594     5.10 feature.
595 ph10 345
596 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
597     is part of Perl's movement towards a unified syntax for back references. As
598     this is synonymous with \k{name}, we fudge it up by pretending it really
599     was \k.
600 ph10 345
601     (3) For Oniguruma compatibility we also support \g followed by a name or a
602     number either in angle brackets or in single quotes. However, these are
603     (possibly recursive) subroutine calls, _not_ backreferences. Just return
604 ph10 333 the -ESC_g code (cf \k). */
605 nigel 93
606 ph10 391 case CHAR_g:
607     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
608 ph10 333 {
609     c = -ESC_g;
610 ph10 345 break;
611     }
612 ph10 333
613     /* Handle the Perl-compatible cases */
614 ph10 345
615 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
616 nigel 93 {
617 ph10 171 const uschar *p;
618 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
619     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
620     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
621 ph10 171 {
622     c = -ESC_k;
623     break;
624 ph10 172 }
625 nigel 93 braced = TRUE;
626     ptr++;
627     }
628     else braced = FALSE;
629    
630 ph10 391 if (ptr[1] == CHAR_MINUS)
631 nigel 93 {
632     negated = TRUE;
633     ptr++;
634     }
635     else negated = FALSE;
636    
637     c = 0;
638     while ((digitab[ptr[1]] & ctype_digit) != 0)
639 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
640 ph10 220
641 ph10 333 if (c < 0) /* Integer overflow */
642 ph10 213 {
643     *errorcodeptr = ERR61;
644     break;
645 ph10 220 }
646 ph10 345
647 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
648 nigel 93 {
649     *errorcodeptr = ERR57;
650 ph10 213 break;
651 nigel 93 }
652 ph10 345
653 ph10 333 if (c == 0)
654     {
655     *errorcodeptr = ERR58;
656     break;
657 ph10 345 }
658 nigel 93
659     if (negated)
660     {
661     if (c > bracount)
662     {
663     *errorcodeptr = ERR15;
664 ph10 213 break;
665 nigel 93 }
666     c = bracount - (c - 1);
667     }
668    
669     c = -(ESC_REF + c);
670     break;
671    
672 nigel 77 /* The handling of escape sequences consisting of a string of digits
673     starting with one that is not zero is not straightforward. By experiment,
674     the way Perl works seems to be as follows:
675    
676     Outside a character class, the digits are read as a decimal number. If the
677     number is less than 10, or if there are that many previous extracting
678     left brackets, then it is a back reference. Otherwise, up to three octal
679     digits are read to form an escaped byte. Thus \123 is likely to be octal
680     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
681     value is greater than 377, the least significant 8 bits are taken. Inside a
682     character class, \ followed by a digit is always an octal number. */
683    
684 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
685     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
686 nigel 77
687     if (!isclass)
688     {
689     oldptr = ptr;
690 ph10 391 c -= CHAR_0;
691 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
692 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
693 ph10 333 if (c < 0) /* Integer overflow */
694 ph10 213 {
695     *errorcodeptr = ERR61;
696 ph10 220 break;
697     }
698 nigel 77 if (c < 10 || c <= bracount)
699     {
700     c = -(ESC_REF + c);
701     break;
702     }
703     ptr = oldptr; /* Put the pointer back and fall through */
704     }
705    
706     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
707     generates a binary zero byte and treats the digit as a following literal.
708     Thus we have to pull back the pointer by one. */
709    
710 ph10 391 if ((c = *ptr) >= CHAR_8)
711 nigel 77 {
712     ptr--;
713     c = 0;
714     break;
715     }
716    
717     /* \0 always starts an octal number, but we may drop through to here with a
718 nigel 91 larger first octal digit. The original code used just to take the least
719     significant 8 bits of octal numbers (I think this is what early Perls used
720     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
721     than 3 octal digits. */
722 nigel 77
723 ph10 391 case CHAR_0:
724     c -= CHAR_0;
725     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
726     c = c * 8 + *(++ptr) - CHAR_0;
727 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
728 nigel 77 break;
729    
730 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
731     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
732     treated as a data character. */
733 nigel 77
734 ph10 391 case CHAR_x:
735     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
736 nigel 77 {
737     const uschar *pt = ptr + 2;
738 nigel 87 int count = 0;
739    
740 nigel 77 c = 0;
741     while ((digitab[*pt] & ctype_xdigit) != 0)
742     {
743 nigel 87 register int cc = *pt++;
744 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
745 nigel 77 count++;
746 nigel 87
747 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
748     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
749     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
750 ph10 97 #else /* EBCDIC coding */
751 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
752     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
753 nigel 77 #endif
754     }
755 nigel 87
756 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
757 nigel 77 {
758 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
759 nigel 77 ptr = pt;
760     break;
761     }
762 nigel 87
763 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
764     recognize this construct; fall through to the normal \x handling. */
765     }
766    
767 nigel 87 /* Read just a single-byte hex-defined char */
768 nigel 77
769     c = 0;
770     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
771     {
772 ph10 391 int cc; /* Some compilers don't like */
773     cc = *(++ptr); /* ++ in initializers */
774     #ifndef EBCDIC /* ASCII/UTF-8 coding */
775     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
776     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
777 ph10 97 #else /* EBCDIC coding */
778 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
779     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
780 nigel 77 #endif
781     }
782     break;
783    
784 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
785     This coding is ASCII-specific, but then the whole concept of \cx is
786     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
787 nigel 77
788 ph10 391 case CHAR_c:
789 nigel 77 c = *(++ptr);
790     if (c == 0)
791     {
792     *errorcodeptr = ERR2;
793 ph10 213 break;
794 nigel 77 }
795    
796 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
797     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
798 nigel 77 c ^= 0x40;
799 ph10 97 #else /* EBCDIC coding */
800 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
801 nigel 77 c ^= 0xC0;
802     #endif
803     break;
804    
805     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
806 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
807     otherwise, for Perl compatibility, it is a literal. This code looks a bit
808     odd, but there used to be some cases other than the default, and there may
809     be again in future, so I haven't "optimized" it. */
810 nigel 77
811     default:
812     if ((options & PCRE_EXTRA) != 0) switch(c)
813     {
814     default:
815     *errorcodeptr = ERR3;
816     break;
817     }
818     break;
819     }
820     }
821    
822     *ptrptr = ptr;
823     return c;
824     }
825    
826    
827    
828     #ifdef SUPPORT_UCP
829     /*************************************************
830     * Handle \P and \p *
831     *************************************************/
832    
833     /* This function is called after \P or \p has been encountered, provided that
834     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
835     pointing at the P or p. On exit, it is pointing at the final character of the
836     escape sequence.
837    
838     Argument:
839     ptrptr points to the pattern position pointer
840     negptr points to a boolean that is set TRUE for negation else FALSE
841 nigel 87 dptr points to an int that is set to the detailed property value
842 nigel 77 errorcodeptr points to the error code variable
843    
844 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
845 nigel 77 */
846    
847     static int
848 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
849 nigel 77 {
850     int c, i, bot, top;
851     const uschar *ptr = *ptrptr;
852 nigel 87 char name[32];
853 nigel 77
854     c = *(++ptr);
855     if (c == 0) goto ERROR_RETURN;
856    
857     *negptr = FALSE;
858    
859 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
860     negation. */
861 nigel 77
862 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
863 nigel 77 {
864 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
865 nigel 77 {
866     *negptr = TRUE;
867     ptr++;
868     }
869 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
870 nigel 77 {
871     c = *(++ptr);
872     if (c == 0) goto ERROR_RETURN;
873 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
874 nigel 77 name[i] = c;
875     }
876 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
877 nigel 77 name[i] = 0;
878     }
879    
880     /* Otherwise there is just one following character */
881    
882     else
883     {
884     name[0] = c;
885     name[1] = 0;
886     }
887    
888     *ptrptr = ptr;
889    
890     /* Search for a recognized property name using binary chop */
891    
892     bot = 0;
893     top = _pcre_utt_size;
894    
895     while (bot < top)
896     {
897 nigel 87 i = (bot + top) >> 1;
898 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
899 nigel 87 if (c == 0)
900     {
901     *dptr = _pcre_utt[i].value;
902     return _pcre_utt[i].type;
903     }
904 nigel 77 if (c > 0) bot = i + 1; else top = i;
905     }
906    
907     *errorcodeptr = ERR47;
908     *ptrptr = ptr;
909     return -1;
910    
911     ERROR_RETURN:
912     *errorcodeptr = ERR46;
913     *ptrptr = ptr;
914     return -1;
915     }
916     #endif
917    
918    
919    
920    
921     /*************************************************
922     * Check for counted repeat *
923     *************************************************/
924    
925     /* This function is called when a '{' is encountered in a place where it might
926     start a quantifier. It looks ahead to see if it really is a quantifier or not.
927     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
928     where the ddds are digits.
929    
930     Arguments:
931     p pointer to the first char after '{'
932    
933     Returns: TRUE or FALSE
934     */
935    
936     static BOOL
937     is_counted_repeat(const uschar *p)
938     {
939     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940     while ((digitab[*p] & ctype_digit) != 0) p++;
941 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
942 nigel 77
943 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
944     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
945 nigel 77
946     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
947     while ((digitab[*p] & ctype_digit) != 0) p++;
948    
949 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
950 nigel 77 }
951    
952    
953    
954     /*************************************************
955     * Read repeat counts *
956     *************************************************/
957    
958     /* Read an item of the form {n,m} and return the values. This is called only
959     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
960     so the syntax is guaranteed to be correct, but we need to check the values.
961    
962     Arguments:
963     p pointer to first char after '{'
964     minp pointer to int for min
965     maxp pointer to int for max
966     returned as -1 if no max
967     errorcodeptr points to error code variable
968    
969     Returns: pointer to '}' on success;
970     current ptr on error, with errorcodeptr set non-zero
971     */
972    
973     static const uschar *
974     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
975     {
976     int min = 0;
977     int max = -1;
978    
979 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
980     an integer overflow. */
981    
982 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
983 nigel 81 if (min < 0 || min > 65535)
984     {
985     *errorcodeptr = ERR5;
986     return p;
987     }
988 nigel 77
989 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
990     Also, max must not be less than min. */
991    
992 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
993 nigel 77 {
994 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
995 nigel 77 {
996     max = 0;
997 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
998 nigel 81 if (max < 0 || max > 65535)
999     {
1000     *errorcodeptr = ERR5;
1001     return p;
1002     }
1003 nigel 77 if (max < min)
1004     {
1005     *errorcodeptr = ERR4;
1006     return p;
1007     }
1008     }
1009     }
1010    
1011 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1012     '}'. */
1013 nigel 77
1014 nigel 81 *minp = min;
1015     *maxp = max;
1016 nigel 77 return p;
1017     }
1018    
1019    
1020    
1021     /*************************************************
1022 ph10 408 * Subroutine for finding forward reference *
1023 nigel 91 *************************************************/
1024    
1025 ph10 408 /* This recursive function is called only from find_parens() below. The
1026     top-level call starts at the beginning of the pattern. All other calls must
1027     start at a parenthesis. It scans along a pattern's text looking for capturing
1028 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1029     name it is given, it returns its number. Alternatively, if the name is NULL, it
1030 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1031     encountered, the name will be terminated by '>' because that is checked in the
1032 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1033 ph10 408 capturing group numbers - the (?| feature.
1034 nigel 91
1035     Arguments:
1036 ph10 408 ptrptr address of the current character pointer (updated)
1037 ph10 345 cd compile background data
1038 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1039     lorn name length, or subpattern number if name is NULL
1040     xmode TRUE if we are in /x mode
1041 ph10 411 count pointer to the current capturing subpattern number (updated)
1042 nigel 91
1043     Returns: the number of the named subpattern, or -1 if not found
1044     */
1045    
1046     static int
1047 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1048     BOOL xmode, int *count)
1049 nigel 91 {
1050 ph10 408 uschar *ptr = *ptrptr;
1051     int start_count = *count;
1052     int hwm_count = start_count;
1053     BOOL dup_parens = FALSE;
1054 nigel 93
1055 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1056 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1057    
1058     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1059     {
1060     if (ptr[1] == CHAR_QUESTION_MARK &&
1061 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1062 ph10 408 {
1063     ptr += 3;
1064 ph10 411 dup_parens = TRUE;
1065     }
1066 ph10 408
1067     /* Handle a normal, unnamed capturing parenthesis */
1068 ph10 411
1069 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1070     {
1071     *count += 1;
1072     if (name == NULL && *count == lorn) return *count;
1073 ph10 411 ptr++;
1074 ph10 408 }
1075    
1076     /* Handle a condition. If it is an assertion, just carry on so that it
1077     is processed as normal. If not, skip to the closing parenthesis of the
1078 ph10 411 condition (there can't be any nested parens. */
1079    
1080 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1081     {
1082 ph10 411 ptr += 2;
1083 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1084     {
1085     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1086 ph10 411 if (*ptr != 0) ptr++;
1087 ph10 408 }
1088 ph10 411 }
1089    
1090 ph10 408 /* We have either (? or (* and not a condition */
1091    
1092     else
1093 ph10 411 {
1094 ph10 408 ptr += 2;
1095     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1096    
1097     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1098 ph10 411
1099 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1100     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1101     {
1102     int term;
1103     const uschar *thisname;
1104     *count += 1;
1105     if (name == NULL && *count == lorn) return *count;
1106     term = *ptr++;
1107     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1108     thisname = ptr;
1109     while (*ptr != term) ptr++;
1110     if (name != NULL && lorn == ptr - thisname &&
1111     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1112     return *count;
1113 ph10 461 term++;
1114 ph10 411 }
1115 ph10 408 }
1116 ph10 411 }
1117 ph10 408
1118 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1119 ph10 408 bars. */
1120    
1121 nigel 91 for (; *ptr != 0; ptr++)
1122     {
1123 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1124    
1125 ph10 391 if (*ptr == CHAR_BACKSLASH)
1126 nigel 93 {
1127 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1128 ph10 391 if (*ptr == CHAR_Q) for (;;)
1129 nigel 93 {
1130 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1131 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1132 ph10 391 if (*(++ptr) == CHAR_E) break;
1133 nigel 93 }
1134     continue;
1135     }
1136    
1137 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1138     are handled for real. If the first character is '^', skip it. Also, if the
1139     first few characters (either before or after ^) are \Q\E or \E we skip them
1140 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1141 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1142 nigel 93
1143 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1144 nigel 93 {
1145 ph10 340 BOOL negate_class = FALSE;
1146     for (;;)
1147     {
1148 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1149 ph10 340 {
1150 ph10 438 if (ptr[2] == CHAR_E)
1151     ptr+= 2;
1152     else if (strncmp((const char *)ptr+2,
1153 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1154 ph10 438 ptr += 4;
1155 ph10 392 else
1156 ph10 391 break;
1157 ph10 340 }
1158 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1159 ph10 461 {
1160 ph10 340 negate_class = TRUE;
1161 ph10 438 ptr++;
1162 ph10 461 }
1163 ph10 340 else break;
1164     }
1165    
1166     /* If the next character is ']', it is a data character that must be
1167 ph10 341 skipped, except in JavaScript compatibility mode. */
1168 ph10 345
1169 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1170 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1171 ph10 345 ptr++;
1172    
1173 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1174 nigel 93 {
1175 ph10 220 if (*ptr == 0) return -1;
1176 ph10 391 if (*ptr == CHAR_BACKSLASH)
1177 nigel 93 {
1178 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1179 ph10 391 if (*ptr == CHAR_Q) for (;;)
1180 nigel 93 {
1181 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1182 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1183 ph10 391 if (*(++ptr) == CHAR_E) break;
1184 nigel 93 }
1185     continue;
1186     }
1187     }
1188     continue;
1189     }
1190    
1191     /* Skip comments in /x mode */
1192    
1193 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1194 nigel 93 {
1195 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1196 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1197 nigel 93 continue;
1198     }
1199    
1200 ph10 408 /* Check for the special metacharacters */
1201 ph10 411
1202 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1203 nigel 93 {
1204 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1205     if (rc > 0) return rc;
1206     if (*ptr == 0) goto FAIL_EXIT;
1207 nigel 93 }
1208 ph10 411
1209 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1210     {
1211     if (dup_parens && *count < hwm_count) *count = hwm_count;
1212     *ptrptr = ptr;
1213     return -1;
1214     }
1215 ph10 411
1216     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1217 ph10 408 {
1218     if (*count > hwm_count) hwm_count = *count;
1219     *count = start_count;
1220 ph10 411 }
1221 ph10 408 }
1222 nigel 93
1223 ph10 408 FAIL_EXIT:
1224     *ptrptr = ptr;
1225     return -1;
1226     }
1227 nigel 93
1228    
1229    
1230    
1231 ph10 408 /*************************************************
1232     * Find forward referenced subpattern *
1233     *************************************************/
1234 nigel 93
1235 ph10 408 /* This function scans along a pattern's text looking for capturing
1236     subpatterns, and counting them. If it finds a named pattern that matches the
1237     name it is given, it returns its number. Alternatively, if the name is NULL, it
1238     returns when it reaches a given numbered subpattern. This is used for forward
1239     references to subpatterns. We used to be able to start this scan from the
1240     current compiling point, using the current count value from cd->bracount, and
1241     do it all in a single loop, but the addition of the possibility of duplicate
1242     subpattern numbers means that we have to scan from the very start, in order to
1243     take account of such duplicates, and to use a recursive function to keep track
1244     of the different types of group.
1245    
1246     Arguments:
1247     cd compile background data
1248     name name to seek, or NULL if seeking a numbered subpattern
1249     lorn name length, or subpattern number if name is NULL
1250     xmode TRUE if we are in /x mode
1251    
1252     Returns: the number of the found subpattern, or -1 if not found
1253     */
1254    
1255     static int
1256     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1257     {
1258     uschar *ptr = (uschar *)cd->start_pattern;
1259     int count = 0;
1260     int rc;
1261    
1262     /* If the pattern does not start with an opening parenthesis, the first call
1263     to find_parens_sub() will scan right to the end (if necessary). However, if it
1264     does start with a parenthesis, find_parens_sub() will return when it hits the
1265     matching closing parens. That is why we have to have a loop. */
1266    
1267 ph10 411 for (;;)
1268     {
1269 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1270 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1271     }
1272    
1273 ph10 408 return rc;
1274 nigel 91 }
1275    
1276    
1277    
1278 ph10 408
1279 nigel 91 /*************************************************
1280 nigel 77 * Find first significant op code *
1281     *************************************************/
1282    
1283     /* This is called by several functions that scan a compiled expression looking
1284     for a fixed first character, or an anchoring op code etc. It skips over things
1285     that do not influence this. For some calls, a change of option is important.
1286     For some calls, it makes sense to skip negative forward and all backward
1287     assertions, and also the \b assertion; for others it does not.
1288    
1289     Arguments:
1290     code pointer to the start of the group
1291     options pointer to external options
1292     optbit the option bit whose changing is significant, or
1293     zero if none are
1294     skipassert TRUE if certain assertions are to be skipped
1295    
1296     Returns: pointer to the first significant opcode
1297     */
1298    
1299     static const uschar*
1300     first_significant_code(const uschar *code, int *options, int optbit,
1301     BOOL skipassert)
1302     {
1303     for (;;)
1304     {
1305     switch ((int)*code)
1306     {
1307     case OP_OPT:
1308     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309     *options = (int)code[1];
1310     code += 2;
1311     break;
1312    
1313     case OP_ASSERT_NOT:
1314     case OP_ASSERTBACK:
1315     case OP_ASSERTBACK_NOT:
1316     if (!skipassert) return code;
1317     do code += GET(code, 1); while (*code == OP_ALT);
1318     code += _pcre_OP_lengths[*code];
1319     break;
1320    
1321     case OP_WORD_BOUNDARY:
1322     case OP_NOT_WORD_BOUNDARY:
1323     if (!skipassert) return code;
1324     /* Fall through */
1325    
1326     case OP_CALLOUT:
1327     case OP_CREF:
1328 ph10 459 case OP_NCREF:
1329 nigel 93 case OP_RREF:
1330 ph10 459 case OP_NRREF:
1331 nigel 93 case OP_DEF:
1332 nigel 77 code += _pcre_OP_lengths[*code];
1333     break;
1334    
1335     default:
1336     return code;
1337     }
1338     }
1339     /* Control never reaches here */
1340     }
1341    
1342    
1343    
1344    
1345     /*************************************************
1346 ph10 454 * Find the fixed length of a branch *
1347 nigel 77 *************************************************/
1348    
1349 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1350 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1351 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1352 ph10 454 temporarily terminated with OP_END when this function is called.
1353 nigel 77
1354 ph10 461 This function is called when a backward assertion is encountered, so that if it
1355     fails, the error message can point to the correct place in the pattern.
1356 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1357 ph10 461 because they can be forward references. We solve this by remembering this case
1358 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1359    
1360 nigel 77 Arguments:
1361     code points to the start of the pattern (the bracket)
1362     options the compiling options
1363 ph10 461 atend TRUE if called when the pattern is complete
1364     cd the "compile data" structure
1365 nigel 77
1366 ph10 461 Returns: the fixed length,
1367 ph10 454 or -1 if there is no fixed length,
1368 nigel 77 or -2 if \C was encountered
1369 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1370 nigel 77 */
1371    
1372     static int
1373 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1374 nigel 77 {
1375     int length = -1;
1376    
1377     register int branchlength = 0;
1378     register uschar *cc = code + 1 + LINK_SIZE;
1379    
1380     /* Scan along the opcodes for this branch. If we get to the end of the
1381     branch, check the length against that of the other branches. */
1382    
1383     for (;;)
1384     {
1385     int d;
1386 ph10 454 uschar *ce, *cs;
1387 nigel 77 register int op = *cc;
1388     switch (op)
1389     {
1390 nigel 93 case OP_CBRA:
1391 nigel 77 case OP_BRA:
1392     case OP_ONCE:
1393     case OP_COND:
1394 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1395 nigel 77 if (d < 0) return d;
1396     branchlength += d;
1397     do cc += GET(cc, 1); while (*cc == OP_ALT);
1398     cc += 1 + LINK_SIZE;
1399     break;
1400    
1401     /* Reached end of a branch; if it's a ket it is the end of a nested
1402     call. If it's ALT it is an alternation in a nested call. If it is
1403     END it's the end of the outer call. All can be handled by the same code. */
1404    
1405     case OP_ALT:
1406     case OP_KET:
1407     case OP_KETRMAX:
1408     case OP_KETRMIN:
1409     case OP_END:
1410     if (length < 0) length = branchlength;
1411     else if (length != branchlength) return -1;
1412     if (*cc != OP_ALT) return length;
1413     cc += 1 + LINK_SIZE;
1414     branchlength = 0;
1415     break;
1416 ph10 461
1417 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1418     be OK. If the subroutine is a forward reference, we can't deal with
1419     it until the end of the pattern, so return -3. */
1420 ph10 461
1421 ph10 454 case OP_RECURSE:
1422     if (!atend) return -3;
1423     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1424     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1425     if (cc > cs && cc < ce) return -1; /* Recursion */
1426     d = find_fixedlength(cs + 2, options, atend, cd);
1427 ph10 461 if (d < 0) return d;
1428 ph10 454 branchlength += d;
1429     cc += 1 + LINK_SIZE;
1430 ph10 461 break;
1431 nigel 77
1432     /* Skip over assertive subpatterns */
1433    
1434     case OP_ASSERT:
1435     case OP_ASSERT_NOT:
1436     case OP_ASSERTBACK:
1437     case OP_ASSERTBACK_NOT:
1438     do cc += GET(cc, 1); while (*cc == OP_ALT);
1439     /* Fall through */
1440    
1441     /* Skip over things that don't match chars */
1442    
1443     case OP_REVERSE:
1444     case OP_CREF:
1445 ph10 459 case OP_NCREF:
1446 nigel 93 case OP_RREF:
1447 ph10 459 case OP_NRREF:
1448 nigel 93 case OP_DEF:
1449 nigel 77 case OP_OPT:
1450     case OP_CALLOUT:
1451     case OP_SOD:
1452     case OP_SOM:
1453 ph10 500 case OP_SET_SOM:
1454 nigel 77 case OP_EOD:
1455     case OP_EODN:
1456     case OP_CIRC:
1457     case OP_DOLL:
1458     case OP_NOT_WORD_BOUNDARY:
1459     case OP_WORD_BOUNDARY:
1460     cc += _pcre_OP_lengths[*cc];
1461     break;
1462    
1463     /* Handle literal characters */
1464    
1465     case OP_CHAR:
1466     case OP_CHARNC:
1467 nigel 91 case OP_NOT:
1468 nigel 77 branchlength++;
1469     cc += 2;
1470     #ifdef SUPPORT_UTF8
1471 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1472 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1473 nigel 77 #endif
1474     break;
1475    
1476     /* Handle exact repetitions. The count is already in characters, but we
1477     need to skip over a multibyte character in UTF8 mode. */
1478    
1479     case OP_EXACT:
1480     branchlength += GET2(cc,1);
1481     cc += 4;
1482     #ifdef SUPPORT_UTF8
1483 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1484 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1485 nigel 77 #endif
1486     break;
1487    
1488     case OP_TYPEEXACT:
1489     branchlength += GET2(cc,1);
1490 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1491 nigel 77 cc += 4;
1492     break;
1493    
1494     /* Handle single-char matchers */
1495    
1496     case OP_PROP:
1497     case OP_NOTPROP:
1498 nigel 87 cc += 2;
1499 nigel 77 /* Fall through */
1500    
1501     case OP_NOT_DIGIT:
1502     case OP_DIGIT:
1503     case OP_NOT_WHITESPACE:
1504     case OP_WHITESPACE:
1505     case OP_NOT_WORDCHAR:
1506     case OP_WORDCHAR:
1507     case OP_ANY:
1508 ph10 342 case OP_ALLANY:
1509 nigel 77 branchlength++;
1510     cc++;
1511     break;
1512    
1513     /* The single-byte matcher isn't allowed */
1514    
1515     case OP_ANYBYTE:
1516     return -2;
1517    
1518     /* Check a class for variable quantification */
1519    
1520     #ifdef SUPPORT_UTF8
1521     case OP_XCLASS:
1522     cc += GET(cc, 1) - 33;
1523     /* Fall through */
1524     #endif
1525    
1526     case OP_CLASS:
1527     case OP_NCLASS:
1528     cc += 33;
1529    
1530     switch (*cc)
1531     {
1532     case OP_CRSTAR:
1533     case OP_CRMINSTAR:
1534     case OP_CRQUERY:
1535     case OP_CRMINQUERY:
1536     return -1;
1537    
1538     case OP_CRRANGE:
1539     case OP_CRMINRANGE:
1540     if (GET2(cc,1) != GET2(cc,3)) return -1;
1541     branchlength += GET2(cc,1);
1542     cc += 5;
1543     break;
1544    
1545     default:
1546     branchlength++;
1547     }
1548     break;
1549    
1550     /* Anything else is variable length */
1551    
1552     default:
1553     return -1;
1554     }
1555     }
1556     /* Control never gets here */
1557     }
1558    
1559    
1560    
1561    
1562     /*************************************************
1563 ph10 454 * Scan compiled regex for specific bracket *
1564 nigel 77 *************************************************/
1565    
1566     /* This little function scans through a compiled pattern until it finds a
1567 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1568 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1569     so that it can be called from pcre_study() when finding the minimum matching
1570 ph10 455 length.
1571 nigel 77
1572     Arguments:
1573     code points to start of expression
1574     utf8 TRUE in UTF-8 mode
1575 ph10 454 number the required bracket number or negative to find a lookbehind
1576 nigel 77
1577     Returns: pointer to the opcode for the bracket, or NULL if not found
1578     */
1579    
1580 ph10 455 const uschar *
1581     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1582 nigel 77 {
1583     for (;;)
1584     {
1585     register int c = *code;
1586     if (c == OP_END) return NULL;
1587 nigel 91
1588     /* XCLASS is used for classes that cannot be represented just by a bit
1589     map. This includes negated single high-valued characters. The length in
1590     the table is zero; the actual length is stored in the compiled code. */
1591    
1592     if (c == OP_XCLASS) code += GET(code, 1);
1593 ph10 461
1594 ph10 454 /* Handle recursion */
1595 ph10 461
1596 ph10 454 else if (c == OP_REVERSE)
1597     {
1598 ph10 461 if (number < 0) return (uschar *)code;
1599 ph10 454 code += _pcre_OP_lengths[c];
1600     }
1601 nigel 91
1602 nigel 93 /* Handle capturing bracket */
1603 nigel 91
1604 nigel 93 else if (c == OP_CBRA)
1605 nigel 77 {
1606 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1607 nigel 77 if (n == number) return (uschar *)code;
1608 nigel 93 code += _pcre_OP_lengths[c];
1609 nigel 77 }
1610 nigel 91
1611 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1612     repeated character types, we have to test for \p and \P, which have an extra
1613 ph10 218 two bytes of parameters. */
1614 nigel 91
1615 nigel 77 else
1616     {
1617 ph10 218 switch(c)
1618     {
1619     case OP_TYPESTAR:
1620     case OP_TYPEMINSTAR:
1621     case OP_TYPEPLUS:
1622     case OP_TYPEMINPLUS:
1623     case OP_TYPEQUERY:
1624     case OP_TYPEMINQUERY:
1625     case OP_TYPEPOSSTAR:
1626     case OP_TYPEPOSPLUS:
1627     case OP_TYPEPOSQUERY:
1628     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1629 ph10 220 break;
1630 ph10 221
1631     case OP_TYPEUPTO:
1632     case OP_TYPEMINUPTO:
1633     case OP_TYPEEXACT:
1634     case OP_TYPEPOSUPTO:
1635     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1636     break;
1637 ph10 220 }
1638    
1639 ph10 218 /* Add in the fixed length from the table */
1640 ph10 220
1641 nigel 77 code += _pcre_OP_lengths[c];
1642 ph10 220
1643 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1644     a multi-byte character. The length in the table is a minimum, so we have to
1645     arrange to skip the extra bytes. */
1646 ph10 220
1647 ph10 107 #ifdef SUPPORT_UTF8
1648 nigel 77 if (utf8) switch(c)
1649     {
1650     case OP_CHAR:
1651     case OP_CHARNC:
1652     case OP_EXACT:
1653     case OP_UPTO:
1654     case OP_MINUPTO:
1655 nigel 93 case OP_POSUPTO:
1656 nigel 77 case OP_STAR:
1657     case OP_MINSTAR:
1658 nigel 93 case OP_POSSTAR:
1659 nigel 77 case OP_PLUS:
1660     case OP_MINPLUS:
1661 nigel 93 case OP_POSPLUS:
1662 nigel 77 case OP_QUERY:
1663     case OP_MINQUERY:
1664 nigel 93 case OP_POSQUERY:
1665     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1666 nigel 77 break;
1667     }
1668 ph10 369 #else
1669     (void)(utf8); /* Keep compiler happy by referencing function argument */
1670 ph10 111 #endif
1671 nigel 77 }
1672     }
1673     }
1674    
1675    
1676    
1677     /*************************************************
1678     * Scan compiled regex for recursion reference *
1679     *************************************************/
1680    
1681     /* This little function scans through a compiled pattern until it finds an
1682     instance of OP_RECURSE.
1683    
1684     Arguments:
1685     code points to start of expression
1686     utf8 TRUE in UTF-8 mode
1687    
1688     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1689     */
1690    
1691     static const uschar *
1692     find_recurse(const uschar *code, BOOL utf8)
1693     {
1694     for (;;)
1695     {
1696     register int c = *code;
1697     if (c == OP_END) return NULL;
1698 nigel 91 if (c == OP_RECURSE) return code;
1699 ph10 220
1700 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1701     map. This includes negated single high-valued characters. The length in
1702     the table is zero; the actual length is stored in the compiled code. */
1703    
1704     if (c == OP_XCLASS) code += GET(code, 1);
1705    
1706 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1707     repeated character types, we have to test for \p and \P, which have an extra
1708 ph10 218 two bytes of parameters. */
1709 nigel 91
1710 nigel 77 else
1711     {
1712 ph10 218 switch(c)
1713     {
1714     case OP_TYPESTAR:
1715     case OP_TYPEMINSTAR:
1716     case OP_TYPEPLUS:
1717     case OP_TYPEMINPLUS:
1718     case OP_TYPEQUERY:
1719     case OP_TYPEMINQUERY:
1720     case OP_TYPEPOSSTAR:
1721     case OP_TYPEPOSPLUS:
1722     case OP_TYPEPOSQUERY:
1723     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1724 ph10 220 break;
1725 ph10 221
1726     case OP_TYPEPOSUPTO:
1727     case OP_TYPEUPTO:
1728     case OP_TYPEMINUPTO:
1729     case OP_TYPEEXACT:
1730     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1731     break;
1732 ph10 220 }
1733    
1734 ph10 218 /* Add in the fixed length from the table */
1735    
1736 nigel 77 code += _pcre_OP_lengths[c];
1737 ph10 220
1738 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1739     by a multi-byte character. The length in the table is a minimum, so we have
1740     to arrange to skip the extra bytes. */
1741 ph10 220
1742 ph10 107 #ifdef SUPPORT_UTF8
1743 nigel 77 if (utf8) switch(c)
1744     {
1745     case OP_CHAR:
1746     case OP_CHARNC:
1747     case OP_EXACT:
1748     case OP_UPTO:
1749     case OP_MINUPTO:
1750 nigel 93 case OP_POSUPTO:
1751 nigel 77 case OP_STAR:
1752     case OP_MINSTAR:
1753 nigel 93 case OP_POSSTAR:
1754 nigel 77 case OP_PLUS:
1755     case OP_MINPLUS:
1756 nigel 93 case OP_POSPLUS:
1757 nigel 77 case OP_QUERY:
1758     case OP_MINQUERY:
1759 nigel 93 case OP_POSQUERY:
1760     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1761 nigel 77 break;
1762     }
1763 ph10 369 #else
1764     (void)(utf8); /* Keep compiler happy by referencing function argument */
1765 ph10 111 #endif
1766 nigel 77 }
1767     }
1768     }
1769    
1770    
1771    
1772     /*************************************************
1773     * Scan compiled branch for non-emptiness *
1774     *************************************************/
1775    
1776     /* This function scans through a branch of a compiled pattern to see whether it
1777 nigel 93 can match the empty string or not. It is called from could_be_empty()
1778     below and from compile_branch() when checking for an unlimited repeat of a
1779     group that can match nothing. Note that first_significant_code() skips over
1780 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1781     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1782     bracket whose current branch will already have been scanned.
1783 nigel 77
1784     Arguments:
1785     code points to start of search
1786     endcode points to where to stop
1787     utf8 TRUE if in UTF8 mode
1788    
1789     Returns: TRUE if what is matched could be empty
1790     */
1791    
/* Scan one compiled branch, from just after its opening opcode up to endcode,
and report whether everything in it could match the empty string. Returns
FALSE as soon as an item that must consume a character is seen, and TRUE when
the end of the branch (or endcode) is reached without finding one. */
1792     static BOOL
1793     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1794     {
1795     register int c;
/* The scan starts after the opcode that "code" points at on entry. Note that
the loop's step expression uses c, so every path that reaches the end of the
loop body (including each "continue") must leave c holding the opcode that
"code" currently points to. */
1796 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1797 nigel 77 code < endcode;
1798     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1799     {
1800     const uschar *ccode;
1801    
1802     c = *code;
1803 ph10 286
1804     /* Skip over forward assertions; the other assertions are skipped by
1805 ph10 282 first_significant_code() with a TRUE final argument. */
1806 ph10 286
1807 ph10 282 if (c == OP_ASSERT)
1808 ph10 286 {
/* Follow the branch links past every OP_ALT; c is then reloaded from the
item after the last branch so that the loop step moves past that item. */
1809 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1810     c = *code;
1811     continue;
1812 ph10 286 }
1813 ph10 172
1814 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1815 nigel 77
1816 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1817 ph10 170 {
/* First step over the zero-repeat opcode itself, then over all the branches
of the group that follows it. */
1818 ph10 172 code += _pcre_OP_lengths[c];
1819 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1820     c = *code;
1821     continue;
1822     }
1823    
1824     /* For other groups, scan the branches. */
1825 ph10 172
1826 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1827 nigel 77 {
1828     BOOL empty_branch;
/* A zero link means the bracket's length has not been filled in yet. */
1829     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1830 ph10 406
1831     /* If a conditional group has only one branch, there is a second, implied,
1832 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1833     Otherwise, scan the individual branches of the group. */
1834 ph10 406
1835 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1836 nigel 77 code += GET(code, 1);
1837 ph10 395 else
1838 ph10 406 {
1839 ph10 395 empty_branch = FALSE;
/* Every branch is stepped over even after an empty one has been found,
because "code" must end up beyond the whole group; the recursive call is
skipped once empty_branch is set. */
1840     do
1841     {
1842     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1843     empty_branch = TRUE;
1844     code += GET(code, 1);
1845     }
1846     while (*code == OP_ALT);
1847     if (!empty_branch) return FALSE; /* All branches are non-empty */
1848 nigel 77 }
1849 ph10 406
1850 ph10 172 c = *code;
1851 nigel 93 continue;
1852 nigel 77 }
1853    
1854 nigel 93 /* Handle the other opcodes */
1855    
1856     switch (c)
1857 nigel 77 {
1858 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1859     cannot be represented just by a bit map. This includes negated single
1860     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1861 ph10 220 actual length is stored in the compiled code, so we must update "code"
1862 ph10 216 here. */
1863 nigel 77
1864     #ifdef SUPPORT_UTF8
1865     case OP_XCLASS:
/* Both code and ccode are moved to the item that follows the XCLASS. */
1866 ph10 216 ccode = code += GET(code, 1);
1867 nigel 77 goto CHECK_CLASS_REPEAT;
1868     #endif
1869    
1870     case OP_CLASS:
1871     case OP_NCLASS:
/* NOTE(review): 33 is presumably one opcode byte plus a 32-byte bit map, so
that ccode points at the item after the class - confirm against the
_pcre_OP_lengths[] entry for OP_CLASS. Only ccode moves here; "code" is
advanced by the loop step. */
1872     ccode = code + 33;
1873    
1874     #ifdef SUPPORT_UTF8
1875     CHECK_CLASS_REPEAT:
1876     #endif
1877    
/* Inspect the item after the class to see whether it is a repeat that allows
zero occurrences. */
1878     switch (*ccode)
1879     {
1880     case OP_CRSTAR: /* These could be empty; continue */
1881     case OP_CRMINSTAR:
1882     case OP_CRQUERY:
1883     case OP_CRMINQUERY:
1884     break;
1885    
/* The default label deliberately shares the "return FALSE" with the plus
repeats that follow it. */
1886     default: /* Non-repeat => class must match */
1887     case OP_CRPLUS: /* These repeats aren't empty */
1888     case OP_CRMINPLUS:
1889     return FALSE;
1890    
1891     case OP_CRRANGE:
1892     case OP_CRMINRANGE:
1893     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1894     break;
1895     }
1896     break;
1897    
1898     /* Opcodes that must match a character */
1899    
1900     case OP_PROP:
1901     case OP_NOTPROP:
1902     case OP_EXTUNI:
1903     case OP_NOT_DIGIT:
1904     case OP_DIGIT:
1905     case OP_NOT_WHITESPACE:
1906     case OP_WHITESPACE:
1907     case OP_NOT_WORDCHAR:
1908     case OP_WORDCHAR:
1909     case OP_ANY:
1910 ph10 345 case OP_ALLANY:
1911 nigel 77 case OP_ANYBYTE:
1912     case OP_CHAR:
1913     case OP_CHARNC:
1914     case OP_NOT:
1915     case OP_PLUS:
1916     case OP_MINPLUS:
1917 nigel 93 case OP_POSPLUS:
1918 nigel 77 case OP_EXACT:
1919     case OP_NOTPLUS:
1920     case OP_NOTMINPLUS:
1921 nigel 93 case OP_NOTPOSPLUS:
1922 nigel 77 case OP_NOTEXACT:
1923     case OP_TYPEPLUS:
1924     case OP_TYPEMINPLUS:
1925 nigel 93 case OP_TYPEPOSPLUS:
1926 nigel 77 case OP_TYPEEXACT:
1927     return FALSE;
1928 ph10 227
1929     /* These are going to continue, as they may be empty, but we have to
1930     fudge the length for the \p and \P cases. */
1931    
1932 ph10 224 case OP_TYPESTAR:
1933     case OP_TYPEMINSTAR:
1934     case OP_TYPEPOSSTAR:
1935     case OP_TYPEQUERY:
1936     case OP_TYPEMINQUERY:
1937     case OP_TYPEPOSQUERY:
/* Only the two extra bytes are added here; the table length for the opcode is
added by the loop step. */
1938     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1939 ph10 227 break;
1940    
1941 ph10 224 /* Same for these */
1942 ph10 227
1943 ph10 224 case OP_TYPEUPTO:
1944     case OP_TYPEMINUPTO:
1945     case OP_TYPEPOSUPTO:
1946     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1947     break;
1948 nigel 77
1949     /* End of branch */
1950    
1951     case OP_KET:
1952     case OP_KETRMAX:
1953     case OP_KETRMIN:
1954     case OP_ALT:
1955     return TRUE;
1956    
1957 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1958     MINUPTO, and POSUPTO may be followed by a multibyte character */
1959 nigel 77
1960     #ifdef SUPPORT_UTF8
1961     case OP_STAR:
1962     case OP_MINSTAR:
1963 nigel 93 case OP_POSSTAR:
1964 nigel 77 case OP_QUERY:
1965     case OP_MINQUERY:
1966 nigel 93 case OP_POSQUERY:
/* The character follows the opcode directly, so its first byte is code[1]. */
1967 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1968     break;
1969 ph10 461
1970 nigel 77 case OP_UPTO:
1971     case OP_MINUPTO:
1972 nigel 93 case OP_POSUPTO:
/* Here the first byte of the character is read from code[3]. */
1973 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1974 nigel 77 break;
1975     #endif
1976     }
1977     }
1978    
/* The scan reached endcode without meeting an item that must match. */
1979     return TRUE;
1980     }
1981    
1982    
1983    
1984     /*************************************************
1985     * Scan compiled regex for non-emptiness *
1986     *************************************************/
1987    
1988     /* This function is called to check for left recursive calls. We want to check
1989     the current branch of the current pattern to see if it could match the empty
1990     string. If it could, we must look outwards for branches at other levels,
1991     stopping when we pass beyond the bracket which is the subject of the recursion.
1992    
1993     Arguments:
1994     code points to start of the recursion
1995     endcode points to where to stop (current RECURSE item)
1996     bcptr points to the chain of current (unclosed) branch starts
1997     utf8 TRUE if in UTF-8 mode
1998    
1999     Returns: TRUE if what is matched could be empty
2000     */
2001    
2002     static BOOL
2003     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2004     BOOL utf8)
2005     {
2006 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2007 nigel 77 {
2008 ph10 487 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2009 ph10 475 return FALSE;
2010 nigel 77 bcptr = bcptr->outer;
2011     }
2012     return TRUE;
2013     }
2014    
2015    
2016    
2017     /*************************************************
2018     * Check for POSIX class syntax *
2019     *************************************************/
2020    
2021     /* This function is called when the sequence "[:" or "[." or "[=" is
2022 ph10 295 encountered in a character class. It checks whether this is followed by a
2023 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2024 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2025 nigel 77
2026 ph10 298 Originally, this function only recognized a sequence of letters between the
2027     terminators, but it seems that Perl recognizes any sequence of characters,
2028     though of course unknown POSIX names are subsequently rejected. Perl gives an
2029     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2030     didn't consider this to be a POSIX class. Likewise for [:1234:].
2031 ph10 295
2032 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2033     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2034     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2035     below handles the special case of \], but does not try to do any other escape
2036     processing. This makes it different from Perl for cases such as [:l\ower:]
2037 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2038 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2039 ph10 295 I think.
2040    
2041     Arguments:
2042 nigel 77 ptr pointer to the initial [
2043     endptr where to return the end pointer
2044    
2045     Returns: TRUE or FALSE
2046     */
2047    
2048     static BOOL
2049 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2050 nigel 77 {
2051     int terminator; /* Don't combine these lines; the Solaris cc */
2052     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2053 ph10 295 for (++ptr; *ptr != 0; ptr++)
2054 nigel 77 {
2055 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2056 ph10 298 {
2057 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2058     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2059 ph10 295 {
2060     *endptr = ptr;
2061     return TRUE;
2062 ph10 298 }
2063     }
2064     }
2065 nigel 77 return FALSE;
2066     }
2067    
2068    
2069    
2070    
2071     /*************************************************
2072     * Check POSIX class name *
2073     *************************************************/
2074    
2075     /* This function is called to check the name given in a POSIX-style class entry
2076     such as [:alnum:].
2077    
2078     Arguments:
2079     ptr points to the first letter
2080     len the length of the name
2081    
2082     Returns: a value representing the name, or -1 if unknown
2083     */
2084    
2085     static int
2086     check_posix_name(const uschar *ptr, int len)
2087     {
2088 ph10 240 const char *pn = posix_names;
2089 nigel 77 register int yield = 0;
2090     while (posix_name_lengths[yield] != 0)
2091     {
2092     if (len == posix_name_lengths[yield] &&
2093 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2094 ph10 243 pn += posix_name_lengths[yield] + 1;
2095 nigel 77 yield++;
2096     }
2097     return -1;
2098     }
2099    
2100    
2101     /*************************************************
2102     * Adjust OP_RECURSE items in repeated group *
2103     *************************************************/
2104    
2105     /* OP_RECURSE items contain an offset from the start of the regex to the group
2106     that is referenced. This means that groups can be replicated for fixed
2107     repetition simply by copying (because the recursion is allowed to refer to
2108     earlier groups that are outside the current group). However, when a group is
2109 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2110     inserted before it, after it has been compiled. This means that any OP_RECURSE
2111     items within it that refer to the group itself or any contained groups have to
2112     have their offsets adjusted. That is one of the jobs of this function. Before it
2113     is called, the partially compiled regex must be temporarily terminated with
2114     OP_END.
2115 nigel 77
2116 nigel 93 This function has been extended with the possibility of forward references for
2117     recursions and subroutine calls. It must also check the list of such references
2118     for the group we are dealing with. If it finds that one of the recursions in
2119     the current group is on this list, it adjusts the offset in the list, not the
2120     value in the reference (which is a group number).
2121    
2122 nigel 77 Arguments:
2123     group points to the start of the group
2124     adjust the amount by which the group is to be moved
2125     utf8 TRUE in UTF-8 mode
2126     cd contains pointers to tables etc.
2127 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2128 nigel 77
2129     Returns: nothing
2130     */
2131    
2132     static void
2133 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2134     uschar *save_hwm)
2135 nigel 77 {
2136     uschar *ptr = group;
2137 ph10 224
2138 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2139     {
2140 nigel 93 int offset;
2141     uschar *hc;
2142    
2143     /* See if this recursion is on the forward reference list. If so, adjust the
2144     reference. */
2145 ph10 345
2146 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2147     {
2148     offset = GET(hc, 0);
2149     if (cd->start_code + offset == ptr + 1)
2150     {
2151     PUT(hc, 0, offset + adjust);
2152     break;
2153     }
2154     }
2155    
2156     /* Otherwise, adjust the recursion offset if it's after the start of this
2157     group. */
2158    
2159     if (hc >= cd->hwm)
2160     {
2161     offset = GET(ptr, 1);
2162     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2163     }
2164    
2165 nigel 77 ptr += 1 + LINK_SIZE;
2166     }
2167     }
2168    
2169    
2170    
2171     /*************************************************
2172     * Insert an automatic callout point *
2173     *************************************************/
2174    
2175     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2176     callout points before each pattern item.
2177    
2178     Arguments:
2179     code current code pointer
2180     ptr current pattern pointer
2181     cd pointers to tables etc
2182    
2183     Returns: new code pointer
2184     */
2185    
2186     static uschar *
2187     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2188     {
2189     *code++ = OP_CALLOUT;
2190     *code++ = 255;
2191     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2192     PUT(code, LINK_SIZE, 0); /* Default length */
2193     return code + 2*LINK_SIZE;
2194     }
2195    
2196    
2197    
2198     /*************************************************
2199     * Complete a callout item *
2200     *************************************************/
2201    
2202     /* A callout item contains the length of the next item in the pattern, which
2203     we can't fill in till after we have reached the relevant point. This is used
2204     for both automatic and manual callouts.
2205    
2206     Arguments:
2207     previous_callout points to previous callout item
2208     ptr current pattern pointer
2209     cd pointers to tables etc
2210    
2211     Returns: nothing
2212     */
2213    
2214     static void
2215     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2216     {
2217     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2218     PUT(previous_callout, 2 + LINK_SIZE, length);
2219     }
2220    
2221    
2222    
2223     #ifdef SUPPORT_UCP
2224     /*************************************************
2225     * Get othercase range *
2226     *************************************************/
2227    
2228     /* This function is passed the start and end of a class range, in UTF-8 mode
2229     with UCP support. It searches up the characters, looking for internal ranges of
2230     characters in the "other" case. Each call returns the next one, updating the
2231     start address.
2232    
2233     Arguments:
2234     cptr points to starting character value; updated
2235     d end value
2236     ocptr where to put start of othercase range
2237     odptr where to put end of othercase range
2238    
2239     Yield: TRUE when range returned; FALSE when no more
2240     */
2241    
2242     static BOOL
2243 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2244     unsigned int *odptr)
2245 nigel 77 {
2246 nigel 93 unsigned int c, othercase, next;
2247 nigel 77
2248     for (c = *cptr; c <= d; c++)
2249 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2250 nigel 77
2251     if (c > d) return FALSE;
2252    
2253     *ocptr = othercase;
2254     next = othercase + 1;
2255    
2256     for (++c; c <= d; c++)
2257     {
2258 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2259 nigel 77 next++;
2260     }
2261    
2262     *odptr = next - 1;
2263     *cptr = c;
2264    
2265     return TRUE;
2266     }
2267     #endif /* SUPPORT_UCP */
2268    
2269    
2270 nigel 93
2271 nigel 77 /*************************************************
2272 nigel 93 * Check if auto-possessifying is possible *
2273     *************************************************/
2274    
2275     /* This function is called for unlimited repeats of certain items, to see
2276     whether the next thing could possibly match the repeated item. If not, it makes
2277     sense to automatically possessify the repeated item.
2278    
2279     Arguments:
2280     op_code the repeated op code
2281     item data for this item, depends on the opcode
2282     utf8 TRUE in UTF-8 mode
2283     utf8_char used for utf8 character bytes, NULL if not relevant
2284     ptr next character in pattern
2285     options options bits
2286     cd contains pointers to tables etc.
2287    
2288     Returns: TRUE if possessifying is wanted
2289     */
2290    
/* Look at the pattern item that follows a repeated item and decide whether it
could match something that the repeated item matches. Returns TRUE when it
cannot (so the repeat may be made possessive) and FALSE when it can, or when
the following item is not one of the kinds handled here. */
2291     static BOOL
2292     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2293     const uschar *ptr, int options, compile_data *cd)
2294     {
2295     int next;
2296    
2297     /* Skip whitespace and comments in extended mode */
2298    
2299     if ((options & PCRE_EXTENDED) != 0)
2300     {
2301     for (;;)
2302     {
2303     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2304 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2305 nigel 93 {
/* A comment runs to the next newline sequence or to the end of the pattern. */
2306     while (*(++ptr) != 0)
2307     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2308     }
2309     else break;
2310     }
2311     }
2312    
2313     /* If the next item is one that we can handle, get its value. A non-negative
2314     value is a character, a negative value is an escape value. */
2315    
2316 ph10 391 if (*ptr == CHAR_BACKSLASH)
2317 nigel 93 {
/* A scratch error variable is used; if the escape is faulty this function
just declines to possessify. */
2318     int temperrorcode = 0;
2319     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2320     if (temperrorcode != 0) return FALSE;
2321     ptr++; /* Point after the escape sequence */
2322     }
2323    
/* A character that is not flagged as a metacharacter is taken as a literal. */
2324     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2325     {
2326     #ifdef SUPPORT_UTF8
2327     if (utf8) { GETCHARINC(next, ptr); } else
2328     #endif
2329     next = *ptr++;
2330     }
2331    
2332     else return FALSE;
2333    
2334     /* Skip whitespace and comments in extended mode */
2335    
2336     if ((options & PCRE_EXTENDED) != 0)
2337     {
2338     for (;;)
2339     {
2340     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2341 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2342 nigel 93 {
2343     while (*(++ptr) != 0)
2344     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2345     }
2346     else break;
2347     }
2348     }
2349    
2350     /* If the next thing is itself optional, we have to give up. */
2351    
/* "Optional" here means followed by an asterisk, a question mark, or a
quantifier that begins with a left curly bracket, zero, and a comma. */
2352 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2353 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2354     return FALSE;
2355 nigel 93
2356     /* Now compare the next item with the previous opcode. If the previous is a
2357     positive single character match, "item" either contains the character or, if
2358     "item" is greater than 127 in utf8 mode, the character's bytes are in
2359     utf8_char. */
2360    
2361    
2362     /* Handle cases when the next item is a character. */
2363    
/* Every case of this switch returns, including the default, so the second
switch further down is reached only when next is negative. */
2364     if (next >= 0) switch(op_code)
2365     {
2366     case OP_CHAR:
2367     #ifdef SUPPORT_UTF8
2368     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2369 ph10 369 #else
2370     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2371 nigel 93 #endif
2372     return item != next;
2373    
2374     /* For CHARNC (caseless character) we must check the other case. If we have
2375     Unicode property support, we can use it to test the other case of
2376     high-valued characters. */
2377    
2378     case OP_CHARNC:
2379     #ifdef SUPPORT_UTF8
2380     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2381     #endif
2382     if (item == next) return FALSE;
2383     #ifdef SUPPORT_UTF8
2384     if (utf8)
2385     {
2386     unsigned int othercase;
/* Values below 128 use the fcc table; higher values use the Unicode data when
SUPPORT_UCP is defined, and otherwise NOTACHAR is used as the other case. */
2387     if (next < 128) othercase = cd->fcc[next]; else
2388     #ifdef SUPPORT_UCP
2389 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2390 nigel 93 #else
2391     othercase = NOTACHAR;
2392     #endif
2393     return (unsigned int)item != othercase;
2394     }
2395     else
2396     #endif /* SUPPORT_UTF8 */
2397     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2398    
2399     /* For OP_NOT, "item" must be a single-byte character. */
2400    
/* The sense is inverted relative to OP_CHAR: TRUE is returned when the next
character equals the excluded character (or, when caseless, its other case). */
2401     case OP_NOT:
2402     if (item == next) return TRUE;
2403     if ((options & PCRE_CASELESS) == 0) return FALSE;
2404     #ifdef SUPPORT_UTF8
2405     if (utf8)
2406     {
2407     unsigned int othercase;
2408     if (next < 128) othercase = cd->fcc[next]; else
2409     #ifdef SUPPORT_UCP
2410 ph10 349 othercase = UCD_OTHERCASE(next);
2411 nigel 93 #else
2412     othercase = NOTACHAR;
2413     #endif
2414     return (unsigned int)item == othercase;
2415     }
2416     else
2417     #endif /* SUPPORT_UTF8 */
2418     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2419    
/* For the six type tests below, the ctypes table is consulted only when next
is 127 or less; a larger value is treated as not being in the class. */
2420     case OP_DIGIT:
2421     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2422    
2423     case OP_NOT_DIGIT:
2424     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2425    
2426     case OP_WHITESPACE:
2427     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2428    
2429     case OP_NOT_WHITESPACE:
2430     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2431    
2432     case OP_WORDCHAR:
2433     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2434    
2435     case OP_NOT_WORDCHAR:
2436     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2437    
/* The listed code points are the ones counted as horizontal space here; the
positive and negative opcodes share the list and differ only in the result. */
2438 ph10 180 case OP_HSPACE:
2439     case OP_NOT_HSPACE:
2440     switch(next)
2441     {
2442     case 0x09:
2443     case 0x20:
2444     case 0xa0:
2445     case 0x1680:
2446     case 0x180e:
2447     case 0x2000:
2448     case 0x2001:
2449     case 0x2002:
2450     case 0x2003:
2451     case 0x2004:
2452     case 0x2005:
2453     case 0x2006:
2454     case 0x2007:
2455     case 0x2008:
2456     case 0x2009:
2457     case 0x200A:
2458     case 0x202f:
2459     case 0x205f:
2460     case 0x3000:
2461     return op_code != OP_HSPACE;
2462     default:
2463     return op_code == OP_HSPACE;
2464     }
2465    
/* Likewise for the code points counted as vertical space. */
2466     case OP_VSPACE:
2467     case OP_NOT_VSPACE:
2468     switch(next)
2469     {
2470     case 0x0a:
2471     case 0x0b:
2472     case 0x0c:
2473     case 0x0d:
2474     case 0x85:
2475     case 0x2028:
2476     case 0x2029:
2477     return op_code != OP_VSPACE;
2478     default:
2479     return op_code == OP_VSPACE;
2480     }
2481    
2482 nigel 93 default:
2483     return FALSE;
2484     }
2485    
2486    
2487     /* Handle the case when the next item is \d, \s, etc. */
2488    
/* From here on next is negative; it is compared against the negated ESC_
values. */
2489     switch(op_code)
2490     {
2491     case OP_CHAR:
2492     case OP_CHARNC:
2493     #ifdef SUPPORT_UTF8
2494     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2495     #endif
/* The repeated item is a literal character: test that character against the
class that the following escape stands for. */
2496     switch(-next)
2497     {
2498     case ESC_d:
2499     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2500    
2501     case ESC_D:
2502     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2503    
2504     case ESC_s:
2505     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2506    
2507     case ESC_S:
2508     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2509    
2510     case ESC_w:
2511     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2512    
2513     case ESC_W:
2514     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2515 ph10 182
2516 ph10 180 case ESC_h:
2517     case ESC_H:
2518     switch(item)
2519     {
2520     case 0x09:
2521     case 0x20:
2522     case 0xa0:
2523     case 0x1680:
2524     case 0x180e:
2525     case 0x2000:
2526     case 0x2001:
2527     case 0x2002:
2528     case 0x2003:
2529     case 0x2004:
2530     case 0x2005:
2531     case 0x2006:
2532     case 0x2007:
2533     case 0x2008:
2534     case 0x2009:
2535     case 0x200A:
2536     case 0x202f:
2537     case 0x205f:
2538     case 0x3000:
2539     return -next != ESC_h;
2540     default:
2541     return -next == ESC_h;
2542 ph10 182 }
2543    
2544 ph10 180 case ESC_v:
2545     case ESC_V:
2546     switch(item)
2547     {
2548     case 0x0a:
2549     case 0x0b:
2550     case 0x0c:
2551     case 0x0d:
2552     case 0x85:
2553     case 0x2028:
2554     case 0x2029:
2555     return -next != ESC_v;
2556     default:
2557     return -next == ESC_v;
2558 ph10 182 }
2559 nigel 93
2560     default:
2561     return FALSE;
2562     }
2563    
/* The remaining cases pair a repeated character type with a following escape;
each one lists the escapes for which TRUE is returned. */
2564     case OP_DIGIT:
2565 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2566     next == -ESC_h || next == -ESC_v;
2567 nigel 93
2568     case OP_NOT_DIGIT:
2569     return next == -ESC_d;
2570    
2571     case OP_WHITESPACE:
2572     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2573    
2574     case OP_NOT_WHITESPACE:
2575 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2576 nigel 93
2577 ph10 180 case OP_HSPACE:
2578     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2579    
2580     case OP_NOT_HSPACE:
2581     return next == -ESC_h;
2582 ph10 182
2583 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2584 ph10 182 case OP_VSPACE:
2585 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2586    
2587     case OP_NOT_VSPACE:
2588 ph10 182 return next == -ESC_v;
2589 ph10 180
2590 nigel 93 case OP_WORDCHAR:
2591 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2592 nigel 93
2593     case OP_NOT_WORDCHAR:
2594     return next == -ESC_w || next == -ESC_d;
2595 ph10 182
2596 nigel 93 default:
2597     return FALSE;
2598     }
2599    
2600     /* Control does not reach here */
2601     }
2602    
2603    
2604    
2605     /*************************************************
2606 nigel 77 * Compile one branch *
2607     *************************************************/
2608    
2609 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2610 nigel 77 changed during the branch, the pointer is used to change the external options
2611 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2612     to find out the amount of memory needed, as well as during the real compile
2613     phase. The value of lengthptr distinguishes the two phases.
2614 nigel 77
2615     Arguments:
2616     optionsptr pointer to the option bits
2617     codeptr points to the pointer to the current code point
2618     ptrptr points to the current pattern pointer
2619     errorcodeptr points to error code variable
2620     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2621     reqbyteptr set to the last literal character required, else < 0
2622     bcptr points to current branch chain
2623     cd contains pointers to tables etc.
2624 nigel 93 lengthptr NULL during the real compile phase
2625     points to length accumulator during pre-compile phase
2626 nigel 77
2627     Returns: TRUE on success
2628     FALSE, with *errorcodeptr set non-zero on error
2629     */
2630    
2631     static BOOL
2632 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2633     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2634     compile_data *cd, int *lengthptr)
2635 nigel 77 {
2636     int repeat_type, op_type;
2637     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2638     int bravalue = 0;
2639     int greedy_default, greedy_non_default;
2640     int firstbyte, reqbyte;
2641     int zeroreqbyte, zerofirstbyte;
2642     int req_caseopt, reqvary, tempreqvary;
2643     int options = *optionsptr;
2644     int after_manual_callout = 0;
2645 nigel 93 int length_prevgroup = 0;
2646 nigel 77 register int c;
2647     register uschar *code = *codeptr;
2648 nigel 93 uschar *last_code = code;
2649     uschar *orig_code = code;
2650 nigel 77 uschar *tempcode;
2651     BOOL inescq = FALSE;
2652     BOOL groupsetfirstbyte = FALSE;
2653     const uschar *ptr = *ptrptr;
2654     const uschar *tempptr;
2655     uschar *previous = NULL;
2656     uschar *previous_callout = NULL;
2657 nigel 93 uschar *save_hwm = NULL;
2658 nigel 77 uschar classbits[32];
2659    
2660     #ifdef SUPPORT_UTF8
2661     BOOL class_utf8;
2662     BOOL utf8 = (options & PCRE_UTF8) != 0;
2663     uschar *class_utf8data;
2664 ph10 300 uschar *class_utf8data_base;
2665 nigel 77 uschar utf8_char[6];
2666     #else
2667     BOOL utf8 = FALSE;
2668 nigel 93 uschar *utf8_char = NULL;
2669 nigel 77 #endif
2670    
2671 ph10 475 #ifdef PCRE_DEBUG
2672 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2673     #endif
2674    
2675 nigel 77 /* Set up the default and non-default settings for greediness */
2676    
2677     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2678     greedy_non_default = greedy_default ^ 1;
2679    
2680     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2681     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2682     matches a non-fixed char first char; reqbyte just remains unset if we never
2683     find one.
2684    
2685     When we hit a repeat whose minimum is zero, we may have to adjust these values
2686     to take the zero repeat into account. This is implemented by setting them to
2687     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2688     item types that can be repeated set these backoff variables appropriately. */
2689    
2690     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2691    
2692     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2693     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2694     value > 255. It is added into the firstbyte or reqbyte variables to record the
2695     case status of the value. This is used only for ASCII characters. */
2696    
2697     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2698    
2699     /* Switch on next character until the end of the branch */
2700    
2701     for (;; ptr++)
2702     {
2703     BOOL negate_class;
2704 ph10 286 BOOL should_flip_negation;
2705 nigel 77 BOOL possessive_quantifier;
2706     BOOL is_quantifier;
2707 nigel 93 BOOL is_recurse;
2708 ph10 180 BOOL reset_bracount;
2709 nigel 77 int class_charcount;
2710     int class_lastchar;
2711     int newoptions;
2712     int recno;
2713 ph10 172 int refsign;
2714 nigel 77 int skipbytes;
2715     int subreqbyte;
2716     int subfirstbyte;
2717 nigel 93 int terminator;
2718 nigel 77 int mclength;
2719     uschar mcbuffer[8];
2720    
2721 nigel 93 /* Get next byte in the pattern */
2722 nigel 77
2723     c = *ptr;
2724 ph10 345
2725 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2726     previous cycle of this loop. */
2727    
2728     if (lengthptr != NULL)
2729     {
2730 ph10 475 #ifdef PCRE_DEBUG
2731 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2732     #endif
2733     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2734     {
2735     *errorcodeptr = ERR52;
2736     goto FAILED;
2737     }
2738    
2739     /* There is at least one situation where code goes backwards: this is the
2740     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2741     the class is simply eliminated. However, it is created first, so we have to
2742     allow memory for it. Therefore, don't ever reduce the length at this point.
2743     */
2744    
2745     if (code < last_code) code = last_code;
2746 ph10 202
2747     /* Paranoid check for integer overflow */
2748    
2749     if (OFLOW_MAX - *lengthptr < code - last_code)
2750     {
2751     *errorcodeptr = ERR20;
2752     goto FAILED;
2753     }
2754    
2755 nigel 93 *lengthptr += code - last_code;
2756     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2757    
2758     /* If "previous" is set and it is not at the start of the work space, move
2759     it back to there, in order to avoid filling up the work space. Otherwise,
2760     if "previous" is NULL, reset the current code pointer to the start. */
2761    
2762     if (previous != NULL)
2763     {
2764     if (previous > orig_code)
2765     {
2766     memmove(orig_code, previous, code - previous);
2767     code -= previous - orig_code;
2768     previous = orig_code;
2769     }
2770     }
2771     else code = orig_code;
2772    
2773     /* Remember where this code item starts so we can pick up the length
2774     next time round. */
2775    
2776     last_code = code;
2777     }
2778    
2779     /* In the real compile phase, just check the workspace used by the forward
2780     reference list. */
2781    
2782     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2783     {
2784     *errorcodeptr = ERR52;
2785     goto FAILED;
2786     }
2787    
2788 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2789    
2790     if (inescq && c != 0)
2791     {
2792 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2793 nigel 77 {
2794     inescq = FALSE;
2795     ptr++;
2796     continue;
2797     }
2798     else
2799     {
2800     if (previous_callout != NULL)
2801     {
2802 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2803     complete_callout(previous_callout, ptr, cd);
2804 nigel 77 previous_callout = NULL;
2805     }
2806     if ((options & PCRE_AUTO_CALLOUT) != 0)
2807     {
2808     previous_callout = code;
2809     code = auto_callout(code, ptr, cd);
2810     }
2811     goto NORMAL_CHAR;
2812     }
2813     }
2814    
2815     /* Fill in length of a previous callout, except when the next thing is
2816     a quantifier. */
2817    
2818 ph10 392 is_quantifier =
2819 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2820     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2821 nigel 77
2822     if (!is_quantifier && previous_callout != NULL &&
2823     after_manual_callout-- <= 0)
2824     {
2825 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2826     complete_callout(previous_callout, ptr, cd);
2827 nigel 77 previous_callout = NULL;
2828     }
2829    
2830     /* In extended mode, skip white space and comments */
2831    
2832     if ((options & PCRE_EXTENDED) != 0)
2833     {
2834     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2835 ph10 391 if (c == CHAR_NUMBER_SIGN)
2836 nigel 77 {
2837 nigel 93 while (*(++ptr) != 0)
2838 nigel 91 {
2839 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2840 nigel 91 }
2841 nigel 93 if (*ptr != 0) continue;
2842    
2843 nigel 91 /* Else fall through to handle end of string */
2844     c = 0;
2845 nigel 77 }
2846     }
2847    
2848     /* No auto callout for quantifiers. */
2849    
2850     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2851     {
2852     previous_callout = code;
2853     code = auto_callout(code, ptr, cd);
2854     }
2855    
2856     switch(c)
2857     {
2858 nigel 93 /* ===================================================================*/
2859     case 0: /* The branch terminates at string end */
2860 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2861     case CHAR_RIGHT_PARENTHESIS:
2862 nigel 77 *firstbyteptr = firstbyte;
2863     *reqbyteptr = reqbyte;
2864     *codeptr = code;
2865     *ptrptr = ptr;
2866 nigel 93 if (lengthptr != NULL)
2867     {
2868 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2869     {
2870     *errorcodeptr = ERR20;
2871     goto FAILED;
2872     }
2873 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2874     DPRINTF((">> end branch\n"));
2875     }
2876 nigel 77 return TRUE;
2877    
2878 nigel 93
2879     /* ===================================================================*/
2880 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2881     the setting of any following char as a first character. */
2882    
2883 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2884 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2885     {
2886     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2887     }
2888     previous = NULL;
2889     *code++ = OP_CIRC;
2890     break;
2891    
2892 ph10 391 case CHAR_DOLLAR_SIGN:
2893 nigel 77 previous = NULL;
2894     *code++ = OP_DOLL;
2895     break;
2896    
2897     /* There can never be a first char if '.' is first, whatever happens about
2898     repeats. The value of reqbyte doesn't change either. */
2899    
2900 ph10 391 case CHAR_DOT:
2901 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2902     zerofirstbyte = firstbyte;
2903     zeroreqbyte = reqbyte;
2904     previous = code;
2905 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2906 nigel 77 break;
2907    
2908 nigel 93
2909     /* ===================================================================*/
2910 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2911     32-byte bitmap of the permitted characters, except in the special case
2912     where there is only one such character. For negated classes, we build the
2913     map as usual, then invert it at the end. However, we use a different opcode
2914     so that data characters > 255 can be handled correctly.
2915 nigel 77
2916     If the class contains characters outside the 0-255 range, a different
2917     opcode is compiled. It may optionally have a bit map for characters < 256,
2918     but those above are explicitly listed afterwards. A flag byte tells
2919     whether the bitmap is present, and whether this is a negated class or not.
2920 ph10 345
2921 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2922     default (Perl) mode, it is treated as a data character. */
2923 ph10 345
2924 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2925 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2926     {
2927     *errorcodeptr = ERR64;
2928 ph10 345 goto FAILED;
2929 ph10 336 }
2930 ph10 345 goto NORMAL_CHAR;
2931 nigel 77
2932 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2933 nigel 77 previous = code;
2934    
2935     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2936     they are encountered at the top level, so we'll do that too. */
2937    
2938 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2939 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2940 ph10 295 check_posix_syntax(ptr, &tempptr))
2941 nigel 77 {
2942 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2943 nigel 77 goto FAILED;
2944     }
2945    
2946 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2947 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2948 ph10 205 skip them too. This makes for compatibility with Perl. */
2949 ph10 208
2950 ph10 205 negate_class = FALSE;
2951     for (;;)
2952 nigel 77 {
2953     c = *(++ptr);
2954 ph10 391 if (c == CHAR_BACKSLASH)
2955 ph10 205 {
2956 ph10 392 if (ptr[1] == CHAR_E)
2957 ph10 391 ptr++;
2958 ph10 392 else if (strncmp((const char *)ptr+1,
2959     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2960 ph10 391 ptr += 3;
2961 ph10 392 else
2962 ph10 391 break;
2963 ph10 205 }
2964 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2965 ph10 205 negate_class = TRUE;
2966     else break;
2967 ph10 208 }
2968 ph10 345
2969     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2970     an initial ']' is taken as a data character -- the code below handles
2971 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2972     [^] must match any character, so generate OP_ALLANY. */
2973 ph10 345
2974 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2975 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2976 ph10 341 {
2977     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2978     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2979     zerofirstbyte = firstbyte;
2980     break;
2981 ph10 345 }
2982 nigel 77
2983 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2984     negation flag at the end, so that support for characters > 255 works
2985 ph10 264 correctly (they are all included in the class). */
2986    
2987     should_flip_negation = FALSE;
2988    
2989 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2990 nigel 93 of just a single character (as long as it's < 256). However, for higher
2991     valued UTF-8 characters, we don't yet do any optimization. */
2992 nigel 77
2993     class_charcount = 0;
2994     class_lastchar = -1;
2995    
2996 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2997     temporary bit of memory, in case the class contains only 1 character (less
2998     than 256), because in that case the compiled code doesn't use the bit map.
2999     */
3000    
3001     memset(classbits, 0, 32 * sizeof(uschar));
3002    
3003 nigel 77 #ifdef SUPPORT_UTF8
3004     class_utf8 = FALSE; /* No chars >= 256 */
3005 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3006 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3007 nigel 77 #endif
3008    
3009     /* Process characters until ] is reached. By writing this as a "do" it
3010 nigel 93 means that an initial ] is taken as a data character. At the start of the
3011     loop, c contains the first byte of the character. */
3012 nigel 77
3013 nigel 93 if (c != 0) do
3014 nigel 77 {
3015 nigel 93 const uschar *oldptr;
3016    
3017 nigel 77 #ifdef SUPPORT_UTF8
3018     if (utf8 && c > 127)
3019     { /* Braces are required because the */
3020     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3021     }
3022 ph10 309
3023 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3024 ph10 309 data and reset the pointer. This is so that very large classes that
3025 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3026 ph10 309 (which is on the stack). */
3027    
3028 ph10 300 if (lengthptr != NULL)
3029     {
3030     *lengthptr += class_utf8data - class_utf8data_base;
3031 ph10 309 class_utf8data = class_utf8data_base;
3032     }
3033    
3034 nigel 77 #endif
3035    
3036     /* Inside \Q...\E everything is literal except \E */
3037    
3038     if (inescq)
3039     {
3040 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3041 nigel 77 {
3042 nigel 93 inescq = FALSE; /* Reset literal state */
3043     ptr++; /* Skip the 'E' */
3044     continue; /* Carry on with next */
3045 nigel 77 }
3046 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3047 nigel 77 }
3048    
3049     /* Handle POSIX class names. Perl allows a negation extension of the
3050     form [:^name:]. A square bracket that doesn't match the syntax is
3051     treated as a literal. We also recognize the POSIX constructions
3052     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3053     5.6 and 5.8 do. */
3054    
3055 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3056 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3057 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3058 nigel 77 {
3059     BOOL local_negate = FALSE;
3060 nigel 87 int posix_class, taboffset, tabopt;
3061 nigel 77 register const uschar *cbits = cd->cbits;
3062 nigel 87 uschar pbits[32];
3063 nigel 77
3064 ph10 391 if (ptr[1] != CHAR_COLON)
3065 nigel 77 {
3066     *errorcodeptr = ERR31;
3067     goto FAILED;
3068     }
3069    
3070     ptr += 2;
3071 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3072 nigel 77 {
3073     local_negate = TRUE;
3074 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3075 nigel 77 ptr++;
3076     }
3077    
3078     posix_class = check_posix_name(ptr, tempptr - ptr);
3079     if (posix_class < 0)
3080     {
3081     *errorcodeptr = ERR30;
3082     goto FAILED;
3083     }
3084    
3085     /* If matching is caseless, upper and lower are converted to
3086     alpha. This relies on the fact that the class table starts with
3087     alpha, lower, upper as the first 3 entries. */
3088    
3089     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3090     posix_class = 0;
3091    
3092 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3093     because we may be adding and subtracting from it, and we don't want to
3094     subtract bits that may be in the main map already. At the end we or the
3095     result into the bit map that is being built. */
3096 nigel 77
3097     posix_class *= 3;
3098 nigel 87
3099     /* Copy in the first table (always present) */
3100    
3101     memcpy(pbits, cbits + posix_class_maps[posix_class],
3102     32 * sizeof(uschar));
3103    
3104     /* If there is a second table, add or remove it as required. */
3105    
3106     taboffset = posix_class_maps[posix_class + 1];
3107     tabopt = posix_class_maps[posix_class + 2];
3108    
3109     if (taboffset >= 0)
3110 nigel 77 {
3111 nigel 87 if (tabopt >= 0)
3112     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3113 nigel 77 else
3114 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3115 nigel 77 }
3116    
3117 nigel 87 /* Now see if we need to remove any special characters. An option
3118     value of 1 removes vertical space and 2 removes underscore. */
3119    
3120     if (tabopt < 0) tabopt = -tabopt;
3121     if (tabopt == 1) pbits[1] &= ~0x3c;
3122     else if (tabopt == 2) pbits[11] &= 0x7f;
3123    
3124     /* Add the POSIX table or its complement into the main table that is
3125     being built and we are done. */
3126    
3127     if (local_negate)
3128     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3129     else
3130     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3131    
3132 nigel 77 ptr = tempptr + 1;
3133     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3134     continue; /* End of POSIX syntax handling */
3135     }
3136    
3137     /* Backslash may introduce a single character, or it may introduce one
3138 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3139     case. Inside a class (and only there) it is treated as backspace.
3140     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3141 ph10 205 to 'or' into the one we are building. We assume they have more than one
3142 nigel 77 character in them, so set class_charcount bigger than one. */
3143    
3144 ph10 391 if (c == CHAR_BACKSLASH)
3145 nigel 77 {
3146 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3147     if (*errorcodeptr != 0) goto FAILED;
3148 nigel 77
3149 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3150     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3151     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3152 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3153     {
3154 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3155 nigel 77 {
3156     ptr += 2; /* avoid empty string */
3157     }
3158     else inescq = TRUE;
3159     continue;
3160     }
3161 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3162 nigel 77
3163     if (c < 0)
3164     {
3165     register const uschar *cbits = cd->cbits;
3166     class_charcount += 2; /* Greater than 1 is what matters */
3167 nigel 93
3168     /* Save time by not doing this in the pre-compile phase. */
3169    
3170     if (lengthptr == NULL) switch (-c)
3171 nigel 77 {
3172     case ESC_d:
3173     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3174     continue;
3175    
3176     case ESC_D:
3177 ph10 286 should_flip_negation = TRUE;
3178 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3179     continue;
3180    
3181     case ESC_w:
3182     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3183     continue;
3184    
3185     case ESC_W:
3186 ph10 286 should_flip_negation = TRUE;
3187 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3188     continue;
3189    
3190     case ESC_s:
3191     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3192     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3193     continue;
3194    
3195     case ESC_S:
3196 ph10 286 should_flip_negation = TRUE;
3197 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3198     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3199     continue;
3200    
3201 nigel 93 default: /* Not recognized; fall through */
3202     break; /* Need "default" setting to stop compiler warning. */
3203     }
3204    
3205     /* In the pre-compile phase, just do the recognition. */
3206    
3207     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3208     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3209 ph10 180
3210 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3211     they use extra memory. */
3212 ph10 180
3213 ph10 178 if (-c == ESC_h)
3214     {
3215     SETBIT(classbits, 0x09); /* VT */
3216     SETBIT(classbits, 0x20); /* SPACE */
3217 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3218 ph10 178 #ifdef SUPPORT_UTF8
3219     if (utf8)
3220 ph10 180 {
3221 ph10 178 class_utf8 = TRUE;
3222     *class_utf8data++ = XCL_SINGLE;
3223 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3224 ph10 178 *class_utf8data++ = XCL_SINGLE;
3225 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3226     *class_utf8data++ = XCL_RANGE;
3227     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3228     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3229 ph10 178 *class_utf8data++ = XCL_SINGLE;
3230 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3231 ph10 178 *class_utf8data++ = XCL_SINGLE;
3232 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3233 ph10 178 *class_utf8data++ = XCL_SINGLE;
3234 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3235     }
3236     #endif
3237     continue;
3238     }
3239 nigel 93
3240 ph10 178 if (-c == ESC_H)
3241     {
3242     for (c = 0; c < 32; c++)
3243     {
3244     int x = 0xff;
3245     switch (c)
3246 ph10 180 {
3247 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3248     case 0x20/8: x ^= 1 << (0x20%8); break;
3249     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3250     default: break;
3251     }
3252     classbits[c] |= x;
3253 ph10 180 }
3254    
3255 ph10 178 #ifdef SUPPORT_UTF8
3256     if (utf8)
3257 ph10 180 {
3258 ph10 178 class_utf8 = TRUE;
3259 ph10 180 *class_utf8data++ = XCL_RANGE;
3260     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3261     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3262     *class_utf8data++ = XCL_RANGE;
3263     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3264     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3265     *class_utf8data++ = XCL_RANGE;
3266     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3267     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3268     *class_utf8data++ = XCL_RANGE;
3269     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3270     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3271     *class_utf8data++ = XCL_RANGE;
3272     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3273     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3274     *class_utf8data++ = XCL_RANGE;
3275     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3276     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3277     *class_utf8data++ = XCL_RANGE;
3278     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3279     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3280     }
3281     #endif
3282     continue;
3283     }
3284 ph10 178
3285     if (-c == ESC_v)
3286     {
3287     SETBIT(classbits, 0x0a); /* LF */
3288     SETBIT(classbits, 0x0b); /* VT */
3289 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3290     SETBIT(classbits, 0x0d); /* CR */
3291     SETBIT(classbits, 0x85); /* NEL */
3292 ph10 178 #ifdef SUPPORT_UTF8
3293     if (utf8)
3294 ph10 180 {
3295 ph10 178 class_utf8 = TRUE;
3296 ph10 180 *class_utf8data++ = XCL_RANGE;
3297     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3298     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3299     }
3300     #endif
3301     continue;
3302     }
3303 ph10 178
3304     if (-c == ESC_V)
3305     {
3306     for (c = 0; c < 32; c++)
3307     {
3308     int x = 0xff;
3309     switch (c)
3310 ph10 180 {
3311 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3312     x ^= 1 << (0x0b%8);
3313     x ^= 1 << (0x0c%8);
3314 ph10 180 x ^= 1 << (0x0d%8);
3315 ph10 178 break;
3316     case 0x85/8: x ^= 1 << (0x85%8); break;
3317     default: break;
3318     }
3319     classbits[c] |= x;
3320 ph10 180 }
3321    
3322 ph10 178 #ifdef SUPPORT_UTF8
3323     if (utf8)
3324 ph10 180 {
3325 ph10 178 class_utf8 = TRUE;
3326 ph10 180 *class_utf8data++ = XCL_RANGE;
3327     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3328     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3329     *class_utf8data++ = XCL_RANGE;
3330     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3331     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3332     }
3333     #endif
3334     continue;
3335     }
3336 ph10 178
3337 nigel 93 /* We need to deal with \P and \p in both phases. */
3338    
3339 nigel 77 #ifdef SUPPORT_UCP
3340 nigel 93 if (-c == ESC_p || -c == ESC_P)
3341     {
3342     BOOL negated;
3343     int pdata;
3344     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3345     if (ptype < 0) goto FAILED;
3346     class_utf8 = TRUE;
3347     *class_utf8data++ = ((-c == ESC_p) != negated)?
3348     XCL_PROP : XCL_NOTPROP;
3349     *class_utf8data++ = ptype;
3350     *class_utf8data++ = pdata;
3351     class_charcount -= 2; /* Not a < 256 character */
3352 nigel 77 continue;
3353 nigel 93 }
3354 nigel 77 #endif
3355 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3356     strict mode. By default, for compatibility with Perl, they are
3357     treated as literals. */
3358 nigel 77
3359 nigel 93 if ((options & PCRE_EXTRA) != 0)
3360     {
3361     *errorcodeptr = ERR7;
3362     goto FAILED;
3363     }
3364 nigel 77
3365 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3366     c = *ptr; /* Get the final character and fall through */
3367 nigel 77 }
3368    
3369     /* Fall through if we have a single character (c >= 0). This may be
3370 nigel 93 greater than 256 in UTF-8 mode. */
3371 nigel 77
3372     } /* End of backslash handling */
3373    
3374     /* A single character may be followed by '-' to form a range. However,
3375     Perl does not permit ']' to be the end of the range. A '-' character
3376 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3377     entirely. The code for handling \Q and \E is messy. */
3378 nigel 77
3379 nigel 93 CHECK_RANGE:
3380 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3381 nigel 77 {
3382 nigel 93 inescq = FALSE;
3383     ptr += 2;
3384     }
3385    
3386     oldptr = ptr;
3387 ph10 231
3388 ph10 230 /* Remember \r or \n */
3389 ph10 231
3390 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3391 ph10 231
3392 ph10 230 /* Check for range */
3393 nigel 93
3394 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3395 nigel 93 {
3396 nigel 77 int d;
3397     ptr += 2;
3398 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3399 nigel 77
3400 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3401     mode. */
3402    
3403 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3404 nigel 93 {
3405     ptr += 2;
3406 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3407 ph10 391 { ptr += 2; continue; }
3408 nigel 93 inescq = TRUE;
3409     break;
3410     }
3411    
3412 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3413 nigel 93 {
3414     ptr = oldptr;
3415     goto LONE_SINGLE_CHARACTER;
3416     }
3417    
3418 nigel 77 #ifdef SUPPORT_UTF8
3419     if (utf8)
3420     { /* Braces are required because the */
3421     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3422     }
3423     else
3424     #endif
3425     d = *ptr; /* Not UTF-8 mode */
3426    
3427     /* The second part of a range can be a single-character escape, but
3428     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3429     in such circumstances. */
3430    
3431 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3432 nigel 77 {
3433 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3434     if (*errorcodeptr != 0) goto FAILED;
3435 nigel 77
3436 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3437 nigel 93 special means the '-' was literal */
3438 nigel 77
3439     if (d < 0)
3440     {
3441 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3442     else if (d == -ESC_X) d = CHAR_X;
3443     else if (d == -ESC_R) d = CHAR_R; else
3444 nigel 77 {
3445 nigel 93 ptr = oldptr;
3446 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3447     }
3448     }
3449     }
3450    
3451 nigel 93 /* Check that the two values are in the correct order. Optimize
3452     one-character ranges */
3453 nigel 77
3454 nigel 93 if (d < c)
3455     {
3456     *errorcodeptr = ERR8;
3457     goto FAILED;
3458     }
3459    
3460 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3461    
3462 ph10 230 /* Remember \r or \n */
3463 ph10 231
3464 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3465 ph10 231
3466 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3467     matching, we have to use an XCLASS with extra data items. Caseless
3468     matching for characters > 127 is available only if UCP support is
3469     available. */
3470    
3471     #ifdef SUPPORT_UTF8
3472     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3473     {
3474     class_utf8 = TRUE;
3475    
3476     /* With UCP support, we can find the other case equivalents of
3477     the relevant characters. There may be several ranges. Optimize how
3478     they fit with the basic range. */
3479    
3480     #ifdef SUPPORT_UCP
3481     if ((options & PCRE_CASELESS) != 0)
3482     {
3483 nigel 93 unsigned int occ, ocd;
3484     unsigned int cc = c;
3485     unsigned int origd = d;
3486 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3487     {
3488 ph10 180 if (occ >= (unsigned int)c &&
3489     ocd <= (unsigned int)d)
3490 ph10 176 continue; /* Skip embedded ranges */
3491 nigel 77
3492 ph10 180 if (occ < (unsigned int)c &&
3493 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3494 nigel 77 { /* if there is overlap, */
3495     c = occ; /* noting that if occ < c */
3496     continue; /* we can't have ocd > d */
3497     } /* because a subrange is */
3498 ph10 180 if (ocd > (unsigned int)d &&
3499 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3500 nigel 77 { /* the basic range. */
3501     d = ocd;
3502     continue;
3503     }
3504    
3505     if (occ == ocd)
3506     {
3507     *class_utf8data++ = XCL_SINGLE;
3508     }
3509     else
3510     {
3511     *class_utf8data++ = XCL_RANGE;
3512     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3513     }
3514     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3515     }
3516     }
3517     #endif /* SUPPORT_UCP */
3518    
3519     /* Now record the original range, possibly modified for UCP caseless
3520     overlapping ranges. */
3521    
3522     *class_utf8data++ = XCL_RANGE;
3523     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3524     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3525    
3526     /* With UCP support, we are done. Without UCP support, there is no
3527     caseless matching for UTF-8 characters > 127; we can use the bit map
3528     for the smaller ones. */
3529    
3530     #ifdef SUPPORT_UCP
3531     continue; /* With next character in the class */
3532     #else
3533     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3534    
3535     /* Adjust upper limit and fall through to set up the map */
3536    
3537     d = 127;
3538    
3539     #endif /* SUPPORT_UCP */
3540     }
3541     #endif /* SUPPORT_UTF8 */
3542    
3543     /* We use the bit map for all cases when not in UTF-8 mode; else
3544     ranges that lie entirely within 0-127 when there is UCP support; else
3545     for partial ranges without UCP support. */
3546    
3547 nigel 93 class_charcount += d - c + 1;
3548     class_lastchar = d;
3549    
3550     /* We can save a bit of time by skipping this in the pre-compile. */
3551    
3552     if (lengthptr == NULL) for (; c <= d; c++)
3553 nigel 77 {
3554     classbits[c/8] |= (1 << (c&7));
3555     if ((options & PCRE_CASELESS) != 0)
3556     {
3557     int uc = cd->fcc[c]; /* flip case */
3558     classbits[uc/8] |= (1 << (uc&7));
3559     }
3560     }
3561    
3562     continue; /* Go get the next char in the class */
3563     }
3564    
3565     /* Handle a lone single character - we can get here for a normal
3566     non-escape char, or after \ that introduces a single character or for an
3567     apparent range that isn't. */
3568    
3569     LONE_SINGLE_CHARACTER:
3570 ph10 231
3571 nigel 77 /* Handle a character that cannot go in the bit map */
3572    
3573     #ifdef SUPPORT_UTF8
3574     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3575     {
3576     class_utf8 = TRUE;
3577     *class_utf8data++ = XCL_SINGLE;
3578     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3579    
3580     #ifdef SUPPORT_UCP
3581     if ((options & PCRE_CASELESS) != 0)
3582     {
3583 nigel 93 unsigned int othercase;
3584 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3585 nigel 77 {
3586     *class_utf8data++ = XCL_SINGLE;
3587     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3588     }
3589     }
3590     #endif /* SUPPORT_UCP */
3591    
3592     }
3593     else
3594     #endif /* SUPPORT_UTF8 */
3595    
3596     /* Handle a single-byte character */
3597     {
3598     classbits[c/8] |= (1 << (c&7));
3599     if ((options & PCRE_CASELESS) != 0)
3600     {
3601     c = cd->fcc[c]; /* flip case */
3602     classbits[c/8] |= (1 << (c&7));
3603     }
3604     class_charcount++;
3605     class_lastchar = c;
3606     }
3607     }
3608    
3609 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3610 nigel 77
3611 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3612 nigel 77
3613 nigel 93 if (c == 0) /* Missing terminating ']' */
3614     {
3615     *errorcodeptr = ERR6;
3616     goto FAILED;
3617     }
3618 ph10 231
3619    
3620 ph10 230 /* This code has been disabled because it would mean that \s counts as
3621     an explicit \r or \n reference, and that's not really what is wanted. Now
3622     we set the flag only if there is a literal "\r" or "\n" in the class. */
3623 ph10 227
3624 ph10 230 #if 0
3625 ph10 226 /* Remember whether \r or \n are in this class */
3626 ph10 227
3627 ph10 226 if (negate_class)
3628     {
3629 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3630 ph10 226 }
3631     else
3632     {
3633 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3634 ph10 227 }
3635 ph10 230 #endif
3636 ph10 227
3637 ph10 231
3638 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3639 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3640     use of \p or \P, in other words, no use of any XCLASS features, we can
3641     optimize.
3642    
3643 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3644     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3645     operate on single-bytes only. This is an historical hangover. Maybe one day
3646     we can tidy these opcodes to handle multi-byte characters.
3647 nigel 77
3648     The optimization throws away the bit map. We turn the item into a
3649     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3650     that OP_NOT does not support multibyte characters. In the positive case, it
3651     can cause firstbyte to be set. Otherwise, there can be no first char if
3652     this item is first, whatever repeat count may follow. In the case of
3653     reqbyte, save the previous value for reinstating. */
3654    
3655     #ifdef SUPPORT_UTF8
3656 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3657 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3658 nigel 77 #else
3659     if (class_charcount == 1)
3660     #endif
3661     {
3662     zeroreqbyte = reqbyte;
3663    
3664     /* The OP_NOT opcode works on one-byte characters only. */
3665    
3666     if (negate_class)
3667     {
3668     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3669     zerofirstbyte = firstbyte;
3670     *code++ = OP_NOT;
3671     *code++ = class_lastchar;
3672     break;
3673     }
3674    
3675     /* For a single, positive character, get the value into mcbuffer, and
3676     then we can handle this with the normal one-character code. */
3677    
3678     #ifdef SUPPORT_UTF8
3679     if (utf8 && class_lastchar > 127)
3680     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3681     else
3682     #endif
3683     {
3684     mcbuffer[0] = class_lastchar;
3685     mclength = 1;
3686     }
3687     goto ONE_CHAR;
3688     } /* End of 1-char optimization */
3689    
3690     /* The general case - not the one-char optimization. If this is the first
3691     thing in the branch, there can be no first char setting, whatever the
3692     repeat count. Any reqbyte setting must remain unchanged after any kind of
3693     repeat. */
3694    
3695     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3696     zerofirstbyte = firstbyte;
3697     zeroreqbyte = reqbyte;
3698    
3699     /* If there are characters with values > 255, we have to compile an
3700 ph10 286 extended class, with its own opcode, unless there was a negated special
3701     such as \S in the class, because in that case all characters > 255 are in
3702     the class, so any that were explicitly given as well can be ignored. If
3703 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3704     characters < 256, we can omit the bitmap in the actual compiled code. */
3705 nigel 77
3706     #ifdef SUPPORT_UTF8
3707 ph10 264 if (class_utf8 && !should_flip_negation)
3708 nigel 77 {
3709     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3710     *code++ = OP_XCLASS;
3711     code += LINK_SIZE;
3712     *code = negate_class? XCL_NOT : 0;
3713    
3714 nigel 93 /* If the map is required, move up the extra data to make room for it;
3715     otherwise just move the code pointer to the end of the extra data. */
3716 nigel 77
3717     if (class_charcount > 0)
3718     {
3719     *code++ |= XCL_MAP;
3720 nigel 93 memmove(code + 32, code, class_utf8data - code);
3721 nigel 77 memcpy(code, classbits, 32);
3722 nigel 93 code = class_utf8data + 32;
3723 nigel 77 }
3724 nigel 93 else code = class_utf8data;
3725 nigel 77
3726     /* Now fill in the complete length of the item */
3727    
3728     PUT(previous, 1, code - previous);
3729     break; /* End of class handling */
3730     }
3731     #endif
3732    
3733 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3734     OP_NCLASS, depending on whether the whole class was negated and whether
3735     there were negative specials such as \S in the class. Then copy the 32-byte
3736 ph10 264 map into the code vector, negating it if necessary. */
3737 ph10 286
3738 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3739 nigel 77 if (negate_class)
3740     {
3741 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3742     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3743 nigel 77 }
3744     else
3745     {
3746     memcpy(code, classbits, 32);
3747     }
3748     code += 32;
3749     break;
3750    
3751 nigel 93
3752     /* ===================================================================*/
3753 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3754     has been tested above. */
3755    
3756 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3757 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3758     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3759     if (*errorcodeptr != 0) goto FAILED;
3760     goto REPEAT;
3761    
3762 ph10 391 case CHAR_ASTERISK:
3763 nigel 77 repeat_min = 0;
3764     repeat_max = -1;
3765     goto REPEAT;
3766    
3767 ph10 391 case CHAR_PLUS:
3768 nigel 77 repeat_min = 1;
3769     repeat_max = -1;
3770     goto REPEAT;
3771    
3772 ph10 391 case CHAR_QUESTION_MARK:
3773 nigel 77 repeat_min = 0;
3774     repeat_max = 1;
3775    
3776     REPEAT:
3777     if (previous == NULL)
3778     {
3779     *errorcodeptr = ERR9;
3780     goto FAILED;
3781     }
3782    
3783     if (repeat_min == 0)
3784     {
3785     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3786     reqbyte = zeroreqbyte; /* Ditto */
3787     }
3788    
3789     /* Remember whether this is a variable length repeat */
3790    
3791     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3792    
3793     op_type = 0; /* Default single-char op codes */
3794     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3795    
3796     /* Save start of previous item, in case we have to move it up to make space
3797     for an inserted OP_ONCE for the additional '+' extension. */
3798    
3799     tempcode = previous;
3800    
3801     /* If the next character is '+', we have a possessive quantifier. This
3802     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3803     If the next character is '?' this is a minimizing repeat, by default,
3804     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3805     repeat type to the non-default. */
3806    
3807 ph10 391 if (ptr[1] == CHAR_PLUS)
3808 nigel 77 {
3809     repeat_type = 0; /* Force greedy */
3810     possessive_quantifier = TRUE;
3811     ptr++;
3812     }
3813 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3814 nigel 77 {
3815     repeat_type = greedy_non_default;
3816     ptr++;
3817     }
3818     else repeat_type = greedy_default;
3819    
3820     /* If previous was a character match, abolish the item and generate a
3821     repeat item instead. If a char item has a minimum of more than one, ensure
3822     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3823     the first thing in a branch because the x will have gone into firstbyte
3824     instead. */
3825    
3826     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3827     {
3828     /* Deal with UTF-8 characters that take up more than one byte. It's
3829     easier to write this out separately than try to macrify it. Use c to
3830     hold the length of the character in bytes, plus 0x80 to flag that it's a
3831     length rather than a small character. */
3832    
3833     #ifdef SUPPORT_UTF8
3834     if (utf8 && (code[-1] & 0x80) != 0)
3835     {
3836     uschar *lastchar = code - 1;
3837     while((*lastchar & 0xc0) == 0x80) lastchar--;
3838     c = code - lastchar; /* Length of UTF-8 character */
3839     memcpy(utf8_char, lastchar, c); /* Save the char */
3840     c |= 0x80; /* Flag c as a length */
3841     }
3842     else
3843     #endif
3844    
3845     /* Handle the case of a single byte - either with no UTF8 support, or
3846     with UTF-8 disabled, or for a UTF-8 character < 128. */
3847    
3848     {
3849     c = code[-1];
3850     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3851     }
3852    
3853 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3854     the line is something that cannot possibly match this character. If so,
3855     automatically possessifying this item gains some performance in the case
3856     where the match fails. */
3857    
3858     if (!possessive_quantifier &&
3859     repeat_max < 0 &&
3860     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3861     options, cd))
3862     {
3863     repeat_type = 0; /* Force greedy */
3864     possessive_quantifier = TRUE;
3865     }
3866    
3867 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3868     }
3869    
3870     /* If previous was a single negated character ([^a] or similar), we use
3871     one of the special opcodes, replacing it. The code is shared with single-
3872     character repeats by setting op_type to add a suitable offset into
3873 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3874     currently used only for single-byte chars. */
3875 nigel 77
3876     else if (*previous == OP_NOT)
3877     {
3878     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3879     c = previous[1];
3880 nigel 93 if (!possessive_quantifier &&
3881     repeat_max < 0 &&
3882     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3883     {
3884     repeat_type = 0; /* Force greedy */
3885     possessive_quantifier = TRUE;
3886     }
3887 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3888     }
3889    
3890     /* If previous was a character type match (\d or similar), abolish it and
3891     create a suitable repeat item. The code is shared with single-character
3892     repeats by setting op_type to add a suitable offset into repeat_type. Note
3893     that the Unicode property types will be present only when SUPPORT_UCP is
3894     defined, but we don't wrap the little bits of code here because it just
3895     makes it horribly messy. */
3896    
3897     else if (*previous < OP_EODN)
3898     {
3899     uschar *oldcode;
3900 nigel 87 int prop_type, prop_value;
3901 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3902     c = *previous;
3903    
3904 nigel 93 if (!possessive_quantifier &&
3905     repeat_max < 0 &&
3906     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3907     {
3908     repeat_type = 0; /* Force greedy */
3909     possessive_quantifier = TRUE;
3910     }
3911    
3912 nigel 77 OUTPUT_SINGLE_REPEAT:
3913 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3914     {
3915     prop_type = previous[1];
3916     prop_value = previous[2];
3917     }
3918     else prop_type = prop_value = -1;
3919 nigel 77
3920     oldcode = code;
3921     code = previous; /* Usually overwrite previous item */
3922    
3923     /* If the maximum is zero then the minimum must also be zero; Perl allows
3924     this case, so we do too - by simply omitting the item altogether. */
3925    
3926     if (repeat_max == 0) goto END_REPEAT;
3927    
3928 ph10 461 /*--------------------------------------------------------------------*/
3929 ph10 426 /* This code is obsolete from release 8.00; the restriction was finally
3930     removed: */
3931 ph10 461
3932 nigel 77 /* All real repeats make it impossible to handle partial matching (maybe
3933     one day we will be able to remove this restriction). */
3934 ph10 461
3935 ph10 426 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3936 ph10 461 /*--------------------------------------------------------------------*/
3937 nigel 77
3938     /* Combine the op_type with the repeat_type */
3939    
3940     repeat_type += op_type;
3941    
3942     /* A minimum of zero is handled either as the special case * or ?, or as
3943     an UPTO, with the maximum given. */
3944