/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 487 - (hide annotations) (download)
Wed Jan 6 10:26:55 2010 UTC (4 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 220451 byte(s)
Tidying updates for 8.01-RC1 release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. It sets bit number b in
the byte array a, where each byte holds eight bits and bit 0 of a byte is its
least significant bit. Both arguments and the whole expansion are parenthesized
so that an expression such as "c + 1" can be passed as either argument without
being torn apart by operator precedence. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 nigel 93
96 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97     are simple data values; negative values are for special things like \d and so
98     on. Zero means further processing is needed (for things like \x), or the escape
99     is invalid. */
100    
101 ph10 391 #ifndef EBCDIC
102    
103     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
104 ph10 392 in UTF-8 mode. */
105 ph10 391
106 ph10 392 static const short int escapes[] = {
107 ph10 391 0, 0,
108     0, 0,
109 ph10 392 0, 0,
110     0, 0,
111     0, 0,
112 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
113 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
114 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
115 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
116     -ESC_B, -ESC_C,
117     -ESC_D, -ESC_E,
118     0, -ESC_G,
119     -ESC_H, 0,
120     0, -ESC_K,
121 ph10 391 0, 0,
122 ph10 392 0, 0,
123 ph10 391 -ESC_P, -ESC_Q,
124     -ESC_R, -ESC_S,
125 ph10 392 0, 0,
126     -ESC_V, -ESC_W,
127     -ESC_X, 0,
128     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
129 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
130 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
131 ph10 391 CHAR_GRAVE_ACCENT, 7,
132 ph10 392 -ESC_b, 0,
133     -ESC_d, ESC_e,
134 ph10 391 ESC_f, 0,
135     -ESC_h, 0,
136 ph10 392 0, -ESC_k,
137 ph10 391 0, 0,
138     ESC_n, 0,
139 ph10 392 -ESC_p, 0,
140     ESC_r, -ESC_s,
141 ph10 391 ESC_tee, 0,
142 ph10 392 -ESC_v, -ESC_w,
143     0, 0,
144 ph10 391 -ESC_z
145 nigel 77 };
146    
147 ph10 392 #else
148 ph10 391
149     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
150    
151 nigel 77 static const short int escapes[] = {
152     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
153     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
154     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
155     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
156     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
157     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
158     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
159     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
160 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
161 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
162 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
163 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
164 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
165     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
166     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
167     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
168 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
169 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
170 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
171 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
172 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
173     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
174     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
175     };
176     #endif
177    
178    
179 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
180     searched linearly. Put all the names into a single string, in order to reduce
181 ph10 392 the number of relocations when a shared library is dynamically linked. The
182     string is built from string macros so that it works in UTF-8 mode on EBCDIC
183 ph10 391 platforms. */
184 ph10 210
185     typedef struct verbitem {
186     int len;
187     int op;
188 ph10 211 } verbitem;
189 ph10 210
190 ph10 240 static const char verbnames[] =
191 ph10 391 STRING_ACCEPT0
192     STRING_COMMIT0
193     STRING_F0
194     STRING_FAIL0
195     STRING_PRUNE0
196     STRING_SKIP0
197     STRING_THEN;
198 ph10 240
199 ph10 327 static const verbitem verbs[] = {
200 ph10 240 { 6, OP_ACCEPT },
201     { 6, OP_COMMIT },
202     { 1, OP_FAIL },
203     { 4, OP_FAIL },
204     { 5, OP_PRUNE },
205     { 4, OP_SKIP },
206     { 4, OP_THEN }
207 ph10 210 };
208    
209 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
210 ph10 210
211    
212 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
213     now all in a single string, to reduce the number of relocations when a shared
214 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
215     length entry. The first three must be alpha, lower, upper, as this is assumed
216     for handling case independence. */
217 nigel 77
218 ph10 240 static const char posix_names[] =
219 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
220     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
221 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
222     STRING_word0 STRING_xdigit;
223 nigel 77
224     static const uschar posix_name_lengths[] = {
225     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
226    
227 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
228     base map, with an optional addition or removal of another map. Then, for some
229     classes, there is some additional tweaking: for [:blank:] the vertical space
230     characters are removed, and for [:alpha:] and [:alnum:] the underscore
231     character is removed. The triples in the table consist of the base map offset,
232     second map offset or -1 if no second map, and a non-negative value for map
233     addition or a negative value for map subtraction (if there are two maps). The
234     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
235     remove vertical space characters, 2 => remove underscore. */
236 nigel 77
237     static const int posix_class_maps[] = {
238 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
239     cbit_lower, -1, 0, /* lower */
240     cbit_upper, -1, 0, /* upper */
241     cbit_word, -1, 2, /* alnum - word without underscore */
242     cbit_print, cbit_cntrl, 0, /* ascii */
243     cbit_space, -1, 1, /* blank - a GNU extension */
244     cbit_cntrl, -1, 0, /* cntrl */
245     cbit_digit, -1, 0, /* digit */
246     cbit_graph, -1, 0, /* graph */
247     cbit_print, -1, 0, /* print */
248     cbit_punct, -1, 0, /* punct */
249     cbit_space, -1, 0, /* space */
250     cbit_word, -1, 0, /* word - a Perl extension */
251     cbit_xdigit,-1, 0 /* xdigit */
252 nigel 77 };
253    
254    
255 nigel 93 #define STRING(a) # a
256     #define XSTRING(s) STRING(s)
257    
258 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
259 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
260     they are documented. Always add a new error instead. Messages marked DEAD below
261 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
262     the number of relocations needed when a shared library is loaded dynamically,
263     it is now one long string. We cannot use a table of offsets, because the
264     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
265     simply count through to the one we want - this isn't a performance issue
266 ph10 240 because these strings are used only when there is a compilation error. */
267 nigel 77
268 ph10 240 static const char error_texts[] =
269     "no error\0"
270     "\\ at end of pattern\0"
271     "\\c at end of pattern\0"
272     "unrecognized character follows \\\0"
273     "numbers out of order in {} quantifier\0"
274 nigel 77 /* 5 */
275 ph10 240 "number too big in {} quantifier\0"
276     "missing terminating ] for character class\0"
277     "invalid escape sequence in character class\0"
278     "range out of order in character class\0"
279     "nothing to repeat\0"
280 nigel 77 /* 10 */
281 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
282     "internal error: unexpected repeat\0"
283 ph10 269 "unrecognized character after (? or (?-\0"
284 ph10 240 "POSIX named classes are supported only within a class\0"
285     "missing )\0"
286 nigel 77 /* 15 */
287 ph10 240 "reference to non-existent subpattern\0"
288     "erroffset passed as NULL\0"
289     "unknown option bit(s) set\0"
290     "missing ) after comment\0"
291     "parentheses nested too deeply\0" /** DEAD **/
292 nigel 77 /* 20 */
293 ph10 240 "regular expression is too large\0"
294     "failed to get memory\0"
295     "unmatched parentheses\0"
296     "internal error: code overflow\0"
297     "unrecognized character after (?<\0"
298 nigel 77 /* 25 */
299 ph10 240 "lookbehind assertion is not fixed length\0"
300     "malformed number or name after (?(\0"
301     "conditional group contains more than two branches\0"
302     "assertion expected after (?(\0"
303     "(?R or (?[+-]digits must be followed by )\0"
304 nigel 77 /* 30 */
305 ph10 240 "unknown POSIX class name\0"
306     "POSIX collating elements are not supported\0"
307     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
308     "spare error\0" /** DEAD **/
309     "character value in \\x{...} sequence is too large\0"
310 nigel 77 /* 35 */
311 ph10 240 "invalid condition (?(0)\0"
312     "\\C not allowed in lookbehind assertion\0"
313     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
314     "number after (?C is > 255\0"
315     "closing ) for (?C expected\0"
316 nigel 77 /* 40 */
317 ph10 240 "recursive call could loop indefinitely\0"
318     "unrecognized character after (?P\0"
319     "syntax error in subpattern name (missing terminator)\0"
320     "two named subpatterns have the same name\0"
321     "invalid UTF-8 string\0"
322 nigel 77 /* 45 */
323 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
324     "malformed \\P or \\p sequence\0"
325     "unknown property name after \\P or \\p\0"
326     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
327     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
328 nigel 91 /* 50 */
329 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
330     "octal value is greater than \\377 (not in UTF-8 mode)\0"
331     "internal error: overran compiling workspace\0"
332     "internal error: previously-checked referenced subpattern not found\0"
333     "DEFINE group contains more than one branch\0"
334 nigel 93 /* 55 */
335 ph10 240 "repeating a DEFINE group is not allowed\0"
336     "inconsistent NEWLINE options\0"
337 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
338     "a numbered reference must not be zero\0"
339 ph10 240 "(*VERB) with an argument is not supported\0"
340 ph10 211 /* 60 */
341 ph10 240 "(*VERB) not recognized\0"
342 ph10 268 "number is too big\0"
343 ph10 272 "subpattern name expected\0"
344 ph10 336 "digit expected after (?+\0"
345 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
346     /* 65 */
347 ph10 461 "different names for subpatterns of the same number are not allowed";
348 nigel 77
349    
350     /* Table to identify digits and hex digits. This is used when compiling
351     patterns. Note that the tables in chartables are dependent on the locale, and
352     may mark arbitrary characters as digits - but the PCRE compiling code expects
353     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
354     a private table here. It costs 256 bytes, but it is a lot faster than doing
355     character value tests (at least in some simple cases I timed), and in some
356     applications one wants PCRE to compile efficiently as well as match
357     efficiently.
358    
359     For convenience, we use the same bit definitions as in chartables:
360    
361     0x04 decimal digit
362     0x08 hexadecimal digit
363    
364     Then we can use ctype_digit and ctype_xdigit in the code. */
365    
366 ph10 392 #ifndef EBCDIC
367 ph10 391
368 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
369 ph10 391 UTF-8 mode. */
370    
/* 256 entries, one per character code, sixteen codes to a row. A value of 0x0c
marks a character that is both a decimal digit (0x04) and a hexadecimal digit
(0x08); 0x08 alone marks a character that is only a hexadecimal digit. */

static const unsigned char digitab[] =
  {
  /* 0x00-0x0f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10-0x1f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20-0x2f: space to / */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30-0x3f: 0 to ? (the ten digits come first) */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40-0x4f: @ to O (A to F are hex digits) */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50-0x5f: P to _ */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60-0x6f: ` to o (a to f are hex digits) */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70-0x7f: p to 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80-0x8f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90-0x9f */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xa0-0xaf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xb0-0xbf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xc0-0xcf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xd0-0xdf */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xe0-0xef */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xf0-0xff */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };
405    
406 ph10 392 #else
407 ph10 391
408     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
409    
410 nigel 77 static const unsigned char digitab[] =
411     {
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
414     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
423 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
424 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
427     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
428     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
431     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
436     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
442     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
443     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
444    
445     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
446     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
447     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
448     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
450     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
454     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
455     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
457 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
458 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
459     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
462     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
463     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
464     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
465     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
466     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
467     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
468     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
469     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
470     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
471     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
472     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
473     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
474     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
475     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
476     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
477     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
478     #endif
479    
480    
481     /* Definition to allow mutual recursion */
482    
483     static BOOL
484 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
485 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
486 nigel 77
487    
488    
489     /*************************************************
490 ph10 240 * Find an error text *
491     *************************************************/
492    
493 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
494     some of the text is of unknown length, we can't use a table of offsets.
495     Instead, just count through the strings. This is not a performance issue
496 ph10 240 because it happens only when there has been a compilation error.
497    
498     Argument: the error number
499     Returns: pointer to the error string
500     */
501    
502     static const char *
503     find_error_text(int n)
504     {
505     const char *s = error_texts;
506 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
507 ph10 240 return s;
508     }
509    
510    
511     /*************************************************
512 nigel 77 * Handle escapes *
513     *************************************************/
514    
515     /* This function is called when a \ has been encountered. It either returns a
516     positive value for a simple escape such as \n, or a negative value which
517 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
518     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
519     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
520     ptr is pointing at the \. On exit, it is on the final character of the escape
521     sequence.
522 nigel 77
523     Arguments:
524     ptrptr points to the pattern position pointer
525     errorcodeptr points to the errorcode variable
526     bracount number of previous extracting brackets
527     options the options bits
528     isclass TRUE if inside a character class
529    
530     Returns: zero or positive => a data character
531     negative => a special escape sequence
532 ph10 213 on error, errorcodeptr is set
533 nigel 77 */
534    
535     static int
536     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
537     int options, BOOL isclass)
538     {
539 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
540     const uschar *ptr = *ptrptr + 1;
541 nigel 77 int c, i;
542    
543 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
544     ptr--; /* Set pointer back to the last byte */
545    
546 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
547    
548     if (c == 0) *errorcodeptr = ERR1;
549    
550 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
551     in a table. A non-zero result is something that can be returned immediately.
552 nigel 77 Otherwise further processing may be required. */
553    
554 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
555     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
556     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
557 nigel 77
558 ph10 97 #else /* EBCDIC coding */
559 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
560 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
561     #endif
562    
563     /* Escapes that need further processing, or are illegal. */
564    
565     else
566     {
567     const uschar *oldptr;
568 nigel 93 BOOL braced, negated;
569    
570 nigel 77 switch (c)
571     {
572     /* A number of Perl escapes are not handled by PCRE. We give an explicit
573     error. */
574    
575 ph10 391 case CHAR_l:
576     case CHAR_L:
577     case CHAR_N:
578     case CHAR_u:
579     case CHAR_U:
580 nigel 77 *errorcodeptr = ERR37;
581     break;
582    
583 ph10 333 /* \g must be followed by one of a number of specific things:
584 ph10 345
585 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
586     backreference. If negative, it is a relative backreference. This is a Perl
587     5.10 feature.
588 ph10 345
589 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
590     is part of Perl's movement towards a unified syntax for back references. As
591     this is synonymous with \k{name}, we fudge it up by pretending it really
592     was \k.
593 ph10 345
594     (3) For Oniguruma compatibility we also support \g followed by a name or a
595     number either in angle brackets or in single quotes. However, these are
596     (possibly recursive) subroutine calls, _not_ backreferences. Just return
597 ph10 333 the -ESC_g code (cf \k). */
598 nigel 93
599 ph10 391 case CHAR_g:
600     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
601 ph10 333 {
602     c = -ESC_g;
603 ph10 345 break;
604     }
605 ph10 333
606     /* Handle the Perl-compatible cases */
607 ph10 345
608 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
609 nigel 93 {
610 ph10 171 const uschar *p;
611 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
612     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
613     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
614 ph10 171 {
615     c = -ESC_k;
616     break;
617 ph10 172 }
618 nigel 93 braced = TRUE;
619     ptr++;
620     }
621     else braced = FALSE;
622    
623 ph10 391 if (ptr[1] == CHAR_MINUS)
624 nigel 93 {
625     negated = TRUE;
626     ptr++;
627     }
628     else negated = FALSE;
629    
630     c = 0;
631     while ((digitab[ptr[1]] & ctype_digit) != 0)
632 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
633 ph10 220
634 ph10 333 if (c < 0) /* Integer overflow */
635 ph10 213 {
636     *errorcodeptr = ERR61;
637     break;
638 ph10 220 }
639 ph10 345
640 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
641 nigel 93 {
642     *errorcodeptr = ERR57;
643 ph10 213 break;
644 nigel 93 }
645 ph10 345
646 ph10 333 if (c == 0)
647     {
648     *errorcodeptr = ERR58;
649     break;
650 ph10 345 }
651 nigel 93
652     if (negated)
653     {
654     if (c > bracount)
655     {
656     *errorcodeptr = ERR15;
657 ph10 213 break;
658 nigel 93 }
659     c = bracount - (c - 1);
660     }
661    
662     c = -(ESC_REF + c);
663     break;
664    
665 nigel 77 /* The handling of escape sequences consisting of a string of digits
666     starting with one that is not zero is not straightforward. By experiment,
667     the way Perl works seems to be as follows:
668    
669     Outside a character class, the digits are read as a decimal number. If the
670     number is less than 10, or if there are that many previous extracting
671     left brackets, then it is a back reference. Otherwise, up to three octal
672     digits are read to form an escaped byte. Thus \123 is likely to be octal
673     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
674     value is greater than 377, the least significant 8 bits are taken. Inside a
675     character class, \ followed by a digit is always an octal number. */
676    
677 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
678     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
679 nigel 77
680     if (!isclass)
681     {
682     oldptr = ptr;
683 ph10 391 c -= CHAR_0;
684 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
685 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
686 ph10 333 if (c < 0) /* Integer overflow */
687 ph10 213 {
688     *errorcodeptr = ERR61;
689 ph10 220 break;
690     }
691 nigel 77 if (c < 10 || c <= bracount)
692     {
693     c = -(ESC_REF + c);
694     break;
695     }
696     ptr = oldptr; /* Put the pointer back and fall through */
697     }
698    
699     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
700     generates a binary zero byte and treats the digit as a following literal.
701     Thus we have to pull back the pointer by one. */
702    
703 ph10 391 if ((c = *ptr) >= CHAR_8)
704 nigel 77 {
705     ptr--;
706     c = 0;
707     break;
708     }
709    
710     /* \0 always starts an octal number, but we may drop through to here with a
711 nigel 91 larger first octal digit. The original code used just to take the least
712     significant 8 bits of octal numbers (I think this is what early Perls used
713     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
714     than 3 octal digits. */
715 nigel 77
716 ph10 391 case CHAR_0:
717     c -= CHAR_0;
718     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
719     c = c * 8 + *(++ptr) - CHAR_0;
720 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
721 nigel 77 break;
722    
723 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
724     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
725     treated as a data character. */
726 nigel 77
727 ph10 391 case CHAR_x:
728     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
729 nigel 77 {
730     const uschar *pt = ptr + 2;
731 nigel 87 int count = 0;
732    
733 nigel 77 c = 0;
734     while ((digitab[*pt] & ctype_xdigit) != 0)
735     {
736 nigel 87 register int cc = *pt++;
737 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
738 nigel 77 count++;
739 nigel 87
740 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
741     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
742     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
743 ph10 97 #else /* EBCDIC coding */
744 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
745     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
746 nigel 77 #endif
747     }
748 nigel 87
749 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
750 nigel 77 {
751 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
752 nigel 77 ptr = pt;
753     break;
754     }
755 nigel 87
756 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
757     recognize this construct; fall through to the normal \x handling. */
758     }
759    
760 nigel 87 /* Read just a single-byte hex-defined char */
761 nigel 77
762     c = 0;
763     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
764     {
765 ph10 391 int cc; /* Some compilers don't like */
766     cc = *(++ptr); /* ++ in initializers */
767     #ifndef EBCDIC /* ASCII/UTF-8 coding */
768     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
769     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
770 ph10 97 #else /* EBCDIC coding */
771 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
772     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
773 nigel 77 #endif
774     }
775     break;
776    
777 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
778     This coding is ASCII-specific, but then the whole concept of \cx is
779     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
780 nigel 77
781 ph10 391 case CHAR_c:
782 nigel 77 c = *(++ptr);
783     if (c == 0)
784     {
785     *errorcodeptr = ERR2;
786 ph10 213 break;
787 nigel 77 }
788    
789 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
790     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
791 nigel 77 c ^= 0x40;
792 ph10 97 #else /* EBCDIC coding */
793 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
794 nigel 77 c ^= 0xC0;
795     #endif
796     break;
797    
798     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
799 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
800     otherwise, for Perl compatibility, it is a literal. This code looks a bit
801     odd, but there used to be some cases other than the default, and there may
802     be again in future, so I haven't "optimized" it. */
803 nigel 77
804     default:
805     if ((options & PCRE_EXTRA) != 0) switch(c)
806     {
807     default:
808     *errorcodeptr = ERR3;
809     break;
810     }
811     break;
812     }
813     }
814    
815     *ptrptr = ptr;
816     return c;
817     }
818    
819    
820    
821     #ifdef SUPPORT_UCP
822     /*************************************************
823     * Handle \P and \p *
824     *************************************************/
825    
826     /* This function is called after \P or \p has been encountered, provided that
827     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
828     pointing at the P or p. On exit, it is pointing at the final character of the
829     escape sequence.
830    
831     Argument:
832     ptrptr points to the pattern position pointer
833     negptr points to a boolean that is set TRUE for negation else FALSE
834 nigel 87 dptr points to an int that is set to the detailed property value
835 nigel 77 errorcodeptr points to the error code variable
836    
837 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
838 nigel 77 */
839    
840     static int
841 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
842 nigel 77 {
843     int c, i, bot, top;
844     const uschar *ptr = *ptrptr;
845 nigel 87 char name[32];
846 nigel 77
847     c = *(++ptr);
848     if (c == 0) goto ERROR_RETURN;
849    
850     *negptr = FALSE;
851    
852 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
853     negation. */
854 nigel 77
855 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
856 nigel 77 {
857 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
858 nigel 77 {
859     *negptr = TRUE;
860     ptr++;
861     }
862 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
863 nigel 77 {
864     c = *(++ptr);
865     if (c == 0) goto ERROR_RETURN;
866 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
867 nigel 77 name[i] = c;
868     }
869 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
870 nigel 77 name[i] = 0;
871     }
872    
873     /* Otherwise there is just one following character */
874    
875     else
876     {
877     name[0] = c;
878     name[1] = 0;
879     }
880    
881     *ptrptr = ptr;
882    
883     /* Search for a recognized property name using binary chop */
884    
885     bot = 0;
886     top = _pcre_utt_size;
887    
888     while (bot < top)
889     {
890 nigel 87 i = (bot + top) >> 1;
891 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
892 nigel 87 if (c == 0)
893     {
894     *dptr = _pcre_utt[i].value;
895     return _pcre_utt[i].type;
896     }
897 nigel 77 if (c > 0) bot = i + 1; else top = i;
898     }
899    
900     *errorcodeptr = ERR47;
901     *ptrptr = ptr;
902     return -1;
903    
904     ERROR_RETURN:
905     *errorcodeptr = ERR46;
906     *ptrptr = ptr;
907     return -1;
908     }
909     #endif
910    
911    
912    
913    
914     /*************************************************
915     * Check for counted repeat *
916     *************************************************/
917    
918     /* This function is called when a '{' is encountered in a place where it might
919     start a quantifier. It looks ahead to see if it really is a quantifier or not.
920     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
921     where the ddds are digits.
922    
923     Arguments:
924     p pointer to the first char after '{'
925    
926     Returns: TRUE or FALSE
927     */
928    
929     static BOOL
930     is_counted_repeat(const uschar *p)
931     {
932     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
933     while ((digitab[*p] & ctype_digit) != 0) p++;
934 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 nigel 77
936 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
937     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
938 nigel 77
939     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
940     while ((digitab[*p] & ctype_digit) != 0) p++;
941    
942 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
943 nigel 77 }
944    
945    
946    
947     /*************************************************
948     * Read repeat counts *
949     *************************************************/
950    
951     /* Read an item of the form {n,m} and return the values. This is called only
952     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
953     so the syntax is guaranteed to be correct, but we need to check the values.
954    
955     Arguments:
956     p pointer to first char after '{'
957     minp pointer to int for min
958     maxp pointer to int for max
959     returned as -1 if no max
960     errorcodeptr points to error code variable
961    
962     Returns: pointer to '}' on success;
963     current ptr on error, with errorcodeptr set non-zero
964     */
965    
966     static const uschar *
967     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
968     {
969     int min = 0;
970     int max = -1;
971    
972 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
973     an integer overflow. */
974    
975 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
976 nigel 81 if (min < 0 || min > 65535)
977     {
978     *errorcodeptr = ERR5;
979     return p;
980     }
981 nigel 77
982 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
983     Also, max must not be less than min. */
984    
985 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
986 nigel 77 {
987 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
988 nigel 77 {
989     max = 0;
990 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
991 nigel 81 if (max < 0 || max > 65535)
992     {
993     *errorcodeptr = ERR5;
994     return p;
995     }
996 nigel 77 if (max < min)
997     {
998     *errorcodeptr = ERR4;
999     return p;
1000     }
1001     }
1002     }
1003    
1004 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1005     '}'. */
1006 nigel 77
1007 nigel 81 *minp = min;
1008     *maxp = max;
1009 nigel 77 return p;
1010     }
1011    
1012    
1013    
1014     /*************************************************
1015 ph10 408 * Subroutine for finding forward reference *
1016 nigel 91 *************************************************/
1017    
1018 ph10 408 /* This recursive function is called only from find_parens() below. The
1019     top-level call starts at the beginning of the pattern. All other calls must
1020     start at a parenthesis. It scans along a pattern's text looking for capturing
1021 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1022     name it is given, it returns its number. Alternatively, if the name is NULL, it
1023 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1024     encountered, the name will be terminated by '>' because that is checked in the
1025 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1026 ph10 408 capturing group numbers - the (?| feature.
1027 nigel 91
1028     Arguments:
1029 ph10 408 ptrptr address of the current character pointer (updated)
1030 ph10 345 cd compile background data
1031 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1032     lorn name length, or subpattern number if name is NULL
1033     xmode TRUE if we are in /x mode
1034 ph10 411 count pointer to the current capturing subpattern number (updated)
1035 nigel 91
1036     Returns: the number of the named subpattern, or -1 if not found
1037     */
1038    
1039     static int
1040 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1041     BOOL xmode, int *count)
1042 nigel 91 {
1043 ph10 408 uschar *ptr = *ptrptr;
1044     int start_count = *count;
1045     int hwm_count = start_count;
1046     BOOL dup_parens = FALSE;
1047 nigel 93
1048 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1049 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1050    
1051     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1052     {
1053     if (ptr[1] == CHAR_QUESTION_MARK &&
1054 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1055 ph10 408 {
1056     ptr += 3;
1057 ph10 411 dup_parens = TRUE;
1058     }
1059 ph10 408
1060     /* Handle a normal, unnamed capturing parenthesis */
1061 ph10 411
1062 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1063     {
1064     *count += 1;
1065     if (name == NULL && *count == lorn) return *count;
1066 ph10 411 ptr++;
1067 ph10 408 }
1068    
1069     /* Handle a condition. If it is an assertion, just carry on so that it
1070     is processed as normal. If not, skip to the closing parenthesis of the
1071 ph10 411 condition (there can't be any nested parens. */
1072    
1073 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1074     {
1075 ph10 411 ptr += 2;
1076 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1077     {
1078     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1079 ph10 411 if (*ptr != 0) ptr++;
1080 ph10 408 }
1081 ph10 411 }
1082    
1083 ph10 408 /* We have either (? or (* and not a condition */
1084    
1085     else
1086 ph10 411 {
1087 ph10 408 ptr += 2;
1088     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1089    
1090     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1091 ph10 411
1092 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1093     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1094     {
1095     int term;
1096     const uschar *thisname;
1097     *count += 1;
1098     if (name == NULL && *count == lorn) return *count;
1099     term = *ptr++;
1100     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1101     thisname = ptr;
1102     while (*ptr != term) ptr++;
1103     if (name != NULL && lorn == ptr - thisname &&
1104     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1105     return *count;
1106 ph10 461 term++;
1107 ph10 411 }
1108 ph10 408 }
1109 ph10 411 }
1110 ph10 408
1111 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1112 ph10 408 bars. */
1113    
1114 nigel 91 for (; *ptr != 0; ptr++)
1115     {
1116 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1117    
1118 ph10 391 if (*ptr == CHAR_BACKSLASH)
1119 nigel 93 {
1120 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1121 ph10 391 if (*ptr == CHAR_Q) for (;;)
1122 nigel 93 {
1123 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1124 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1125 ph10 391 if (*(++ptr) == CHAR_E) break;
1126 nigel 93 }
1127     continue;
1128     }
1129    
1130 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1131     are handled for real. If the first character is '^', skip it. Also, if the
1132     first few characters (either before or after ^) are \Q\E or \E we skip them
1133 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1134 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1135 nigel 93
1136 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1137 nigel 93 {
1138 ph10 340 BOOL negate_class = FALSE;
1139     for (;;)
1140     {
1141 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1142 ph10 340 {
1143 ph10 438 if (ptr[2] == CHAR_E)
1144     ptr+= 2;
1145     else if (strncmp((const char *)ptr+2,
1146 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1147 ph10 438 ptr += 4;
1148 ph10 392 else
1149 ph10 391 break;
1150 ph10 340 }
1151 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1152 ph10 461 {
1153 ph10 340 negate_class = TRUE;
1154 ph10 438 ptr++;
1155 ph10 461 }
1156 ph10 340 else break;
1157     }
1158    
1159     /* If the next character is ']', it is a data character that must be
1160 ph10 341 skipped, except in JavaScript compatibility mode. */
1161 ph10 345
1162 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1163 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1164 ph10 345 ptr++;
1165    
1166 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1167 nigel 93 {
1168 ph10 220 if (*ptr == 0) return -1;
1169 ph10 391 if (*ptr == CHAR_BACKSLASH)
1170 nigel 93 {
1171 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1172 ph10 391 if (*ptr == CHAR_Q) for (;;)
1173 nigel 93 {
1174 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1175 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1176 ph10 391 if (*(++ptr) == CHAR_E) break;
1177 nigel 93 }
1178     continue;
1179     }
1180     }
1181     continue;
1182     }
1183    
1184     /* Skip comments in /x mode */
1185    
1186 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1187 nigel 93 {
1188 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1189 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1190 nigel 93 continue;
1191     }
1192    
1193 ph10 408 /* Check for the special metacharacters */
1194 ph10 411
1195 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1196 nigel 93 {
1197 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1198     if (rc > 0) return rc;
1199     if (*ptr == 0) goto FAIL_EXIT;
1200 nigel 93 }
1201 ph10 411
1202 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1203     {
1204     if (dup_parens && *count < hwm_count) *count = hwm_count;
1205     *ptrptr = ptr;
1206     return -1;
1207     }
1208 ph10 411
1209     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1210 ph10 408 {
1211     if (*count > hwm_count) hwm_count = *count;
1212     *count = start_count;
1213 ph10 411 }
1214 ph10 408 }
1215 nigel 93
1216 ph10 408 FAIL_EXIT:
1217     *ptrptr = ptr;
1218     return -1;
1219     }
1220 nigel 93
1221    
1222    
1223    
1224 ph10 408 /*************************************************
1225     * Find forward referenced subpattern *
1226     *************************************************/
1227 nigel 93
1228 ph10 408 /* This function scans along a pattern's text looking for capturing
1229     subpatterns, and counting them. If it finds a named pattern that matches the
1230     name it is given, it returns its number. Alternatively, if the name is NULL, it
1231     returns when it reaches a given numbered subpattern. This is used for forward
1232     references to subpatterns. We used to be able to start this scan from the
1233     current compiling point, using the current count value from cd->bracount, and
1234     do it all in a single loop, but the addition of the possibility of duplicate
1235     subpattern numbers means that we have to scan from the very start, in order to
1236     take account of such duplicates, and to use a recursive function to keep track
1237     of the different types of group.
1238    
1239     Arguments:
1240     cd compile background data
1241     name name to seek, or NULL if seeking a numbered subpattern
1242     lorn name length, or subpattern number if name is NULL
1243     xmode TRUE if we are in /x mode
1244    
1245     Returns: the number of the found subpattern, or -1 if not found
1246     */
1247    
1248     static int
1249     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1250     {
1251     uschar *ptr = (uschar *)cd->start_pattern;
1252     int count = 0;
1253     int rc;
1254    
1255     /* If the pattern does not start with an opening parenthesis, the first call
1256     to find_parens_sub() will scan right to the end (if necessary). However, if it
1257     does start with a parenthesis, find_parens_sub() will return when it hits the
1258     matching closing parens. That is why we have to have a loop. */
1259    
1260 ph10 411 for (;;)
1261     {
1262 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1263 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1264     }
1265    
1266 ph10 408 return rc;
1267 nigel 91 }
1268    
1269    
1270    
1271 ph10 408
1272 nigel 91 /*************************************************
1273 nigel 77 * Find first significant op code *
1274     *************************************************/
1275    
1276     /* This is called by several functions that scan a compiled expression looking
1277     for a fixed first character, or an anchoring op code etc. It skips over things
1278     that do not influence this. For some calls, a change of option is important.
1279     For some calls, it makes sense to skip negative forward and all backward
1280     assertions, and also the \b assertion; for others it does not.
1281    
1282     Arguments:
1283     code pointer to the start of the group
1284     options pointer to external options
1285     optbit the option bit whose changing is significant, or
1286     zero if none are
1287     skipassert TRUE if certain assertions are to be skipped
1288    
1289     Returns: pointer to the first significant opcode
1290     */
1291    
1292     static const uschar*
1293     first_significant_code(const uschar *code, int *options, int optbit,
1294     BOOL skipassert)
1295     {
1296     for (;;)
1297     {
1298     switch ((int)*code)
1299     {
1300     case OP_OPT:
1301     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1302     *options = (int)code[1];
1303     code += 2;
1304     break;
1305    
1306     case OP_ASSERT_NOT:
1307     case OP_ASSERTBACK:
1308     case OP_ASSERTBACK_NOT:
1309     if (!skipassert) return code;
1310     do code += GET(code, 1); while (*code == OP_ALT);
1311     code += _pcre_OP_lengths[*code];
1312     break;
1313    
1314     case OP_WORD_BOUNDARY:
1315     case OP_NOT_WORD_BOUNDARY:
1316     if (!skipassert) return code;
1317     /* Fall through */
1318    
1319     case OP_CALLOUT:
1320     case OP_CREF:
1321 ph10 459 case OP_NCREF:
1322 nigel 93 case OP_RREF:
1323 ph10 459 case OP_NRREF:
1324 nigel 93 case OP_DEF:
1325 nigel 77 code += _pcre_OP_lengths[*code];
1326     break;
1327    
1328     default:
1329     return code;
1330     }
1331     }
1332     /* Control never reaches here */
1333     }
1334    
1335    
1336    
1337    
1338     /*************************************************
1339 ph10 454 * Find the fixed length of a branch *
1340 nigel 77 *************************************************/
1341    
1342 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1343 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1344 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1345 ph10 454 temporarily terminated with OP_END when this function is called.
1346 nigel 77
1347 ph10 461 This function is called when a backward assertion is encountered, so that if it
1348     fails, the error message can point to the correct place in the pattern.
1349 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1350 ph10 461 because they can be forward references. We solve this by remembering this case
1351 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1352    
1353 nigel 77 Arguments:
1354     code points to the start of the pattern (the bracket)
1355     options the compiling options
1356 ph10 461 atend TRUE if called when the pattern is complete
1357     cd the "compile data" structure
1358 nigel 77
1359 ph10 461 Returns: the fixed length,
1360 ph10 454 or -1 if there is no fixed length,
1361 nigel 77 or -2 if \C was encountered
1362 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1363 nigel 77 */
1364    
1365     static int
1366 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1367 nigel 77 {
1368     int length = -1;
1369    
1370     register int branchlength = 0;
1371     register uschar *cc = code + 1 + LINK_SIZE;
1372    
1373     /* Scan along the opcodes for this branch. If we get to the end of the
1374     branch, check the length against that of the other branches. */
1375    
1376     for (;;)
1377     {
1378     int d;
1379 ph10 454 uschar *ce, *cs;
1380 nigel 77 register int op = *cc;
1381     switch (op)
1382     {
1383 nigel 93 case OP_CBRA:
1384 nigel 77 case OP_BRA:
1385     case OP_ONCE:
1386     case OP_COND:
1387 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1388 nigel 77 if (d < 0) return d;
1389     branchlength += d;
1390     do cc += GET(cc, 1); while (*cc == OP_ALT);
1391     cc += 1 + LINK_SIZE;
1392     break;
1393    
1394     /* Reached end of a branch; if it's a ket it is the end of a nested
1395     call. If it's ALT it is an alternation in a nested call. If it is
1396     END it's the end of the outer call. All can be handled by the same code. */
1397    
1398     case OP_ALT:
1399     case OP_KET:
1400     case OP_KETRMAX:
1401     case OP_KETRMIN:
1402     case OP_END:
1403     if (length < 0) length = branchlength;
1404     else if (length != branchlength) return -1;
1405     if (*cc != OP_ALT) return length;
1406     cc += 1 + LINK_SIZE;
1407     branchlength = 0;
1408     break;
1409 ph10 461
1410 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1411     be OK. If the subroutine is a forward reference, we can't deal with
1412     it until the end of the pattern, so return -3. */
1413 ph10 461
1414 ph10 454 case OP_RECURSE:
1415     if (!atend) return -3;
1416     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1417     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1418     if (cc > cs && cc < ce) return -1; /* Recursion */
1419     d = find_fixedlength(cs + 2, options, atend, cd);
1420 ph10 461 if (d < 0) return d;
1421 ph10 454 branchlength += d;
1422     cc += 1 + LINK_SIZE;
1423 ph10 461 break;
1424 nigel 77
1425     /* Skip over assertive subpatterns */
1426    
1427     case OP_ASSERT:
1428     case OP_ASSERT_NOT:
1429     case OP_ASSERTBACK:
1430     case OP_ASSERTBACK_NOT:
1431     do cc += GET(cc, 1); while (*cc == OP_ALT);
1432     /* Fall through */
1433    
1434     /* Skip over things that don't match chars */
1435    
1436     case OP_REVERSE:
1437     case OP_CREF:
1438 ph10 459 case OP_NCREF:
1439 nigel 93 case OP_RREF:
1440 ph10 459 case OP_NRREF:
1441 nigel 93 case OP_DEF:
1442 nigel 77 case OP_OPT:
1443     case OP_CALLOUT:
1444     case OP_SOD:
1445     case OP_SOM:
1446     case OP_EOD:
1447     case OP_EODN:
1448     case OP_CIRC:
1449     case OP_DOLL:
1450     case OP_NOT_WORD_BOUNDARY:
1451     case OP_WORD_BOUNDARY:
1452     cc += _pcre_OP_lengths[*cc];
1453     break;
1454    
1455     /* Handle literal characters */
1456    
1457     case OP_CHAR:
1458     case OP_CHARNC:
1459 nigel 91 case OP_NOT:
1460 nigel 77 branchlength++;
1461     cc += 2;
1462     #ifdef SUPPORT_UTF8
1463 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1464 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1465 nigel 77 #endif
1466     break;
1467    
1468     /* Handle exact repetitions. The count is already in characters, but we
1469     need to skip over a multibyte character in UTF8 mode. */
1470    
1471     case OP_EXACT:
1472     branchlength += GET2(cc,1);
1473     cc += 4;
1474     #ifdef SUPPORT_UTF8
1475 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1476 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1477 nigel 77 #endif
1478     break;
1479    
1480     case OP_TYPEEXACT:
1481     branchlength += GET2(cc,1);
1482 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1483 nigel 77 cc += 4;
1484     break;
1485    
1486     /* Handle single-char matchers */
1487    
1488     case OP_PROP:
1489     case OP_NOTPROP:
1490 nigel 87 cc += 2;
1491 nigel 77 /* Fall through */
1492    
1493     case OP_NOT_DIGIT:
1494     case OP_DIGIT:
1495     case OP_NOT_WHITESPACE:
1496     case OP_WHITESPACE:
1497     case OP_NOT_WORDCHAR:
1498     case OP_WORDCHAR:
1499     case OP_ANY:
1500 ph10 342 case OP_ALLANY:
1501 nigel 77 branchlength++;
1502     cc++;
1503     break;
1504    
1505     /* The single-byte matcher isn't allowed */
1506    
1507     case OP_ANYBYTE:
1508     return -2;
1509    
1510     /* Check a class for variable quantification */
1511    
1512     #ifdef SUPPORT_UTF8
1513     case OP_XCLASS:
1514     cc += GET(cc, 1) - 33;
1515     /* Fall through */
1516     #endif
1517    
1518     case OP_CLASS:
1519     case OP_NCLASS:
1520     cc += 33;
1521    
1522     switch (*cc)
1523     {
1524     case OP_CRSTAR:
1525     case OP_CRMINSTAR:
1526     case OP_CRQUERY:
1527     case OP_CRMINQUERY:
1528     return -1;
1529    
1530     case OP_CRRANGE:
1531     case OP_CRMINRANGE:
1532     if (GET2(cc,1) != GET2(cc,3)) return -1;
1533     branchlength += GET2(cc,1);
1534     cc += 5;
1535     break;
1536    
1537     default:
1538     branchlength++;
1539     }
1540     break;
1541    
1542     /* Anything else is variable length */
1543    
1544     default:
1545     return -1;
1546     }
1547     }
1548     /* Control never gets here */
1549     }
1550    
1551    
1552    
1553    
1554     /*************************************************
1555 ph10 454 * Scan compiled regex for specific bracket *
1556 nigel 77 *************************************************/
1557    
1558     /* This little function scans through a compiled pattern until it finds a
1559 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1560 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1561     so that it can be called from pcre_study() when finding the minimum matching
1562 ph10 455 length.
1563 nigel 77
1564     Arguments:
1565     code points to start of expression
1566     utf8 TRUE in UTF-8 mode
1567 ph10 454 number the required bracket number or negative to find a lookbehind
1568 nigel 77
1569     Returns: pointer to the opcode for the bracket, or NULL if not found
1570     */
1571    
1572 ph10 455 const uschar *
1573     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1574 nigel 77 {
1575     for (;;)
1576     {
1577     register int c = *code;
1578     if (c == OP_END) return NULL;
1579 nigel 91
1580     /* XCLASS is used for classes that cannot be represented just by a bit
1581     map. This includes negated single high-valued characters. The length in
1582     the table is zero; the actual length is stored in the compiled code. */
1583    
1584     if (c == OP_XCLASS) code += GET(code, 1);
1585 ph10 461
1586 ph10 454 /* Handle recursion */
1587 ph10 461
1588 ph10 454 else if (c == OP_REVERSE)
1589     {
1590 ph10 461 if (number < 0) return (uschar *)code;
1591 ph10 454 code += _pcre_OP_lengths[c];
1592     }
1593 nigel 91
1594 nigel 93 /* Handle capturing bracket */
1595 nigel 91
1596 nigel 93 else if (c == OP_CBRA)
1597 nigel 77 {
1598 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1599 nigel 77 if (n == number) return (uschar *)code;
1600 nigel 93 code += _pcre_OP_lengths[c];
1601 nigel 77 }
1602 nigel 91
1603 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1604     repeated character types, we have to test for \p and \P, which have an extra
1605 ph10 218 two bytes of parameters. */
1606 nigel 91
1607 nigel 77 else
1608     {
1609 ph10 218 switch(c)
1610     {
1611     case OP_TYPESTAR:
1612     case OP_TYPEMINSTAR:
1613     case OP_TYPEPLUS:
1614     case OP_TYPEMINPLUS:
1615     case OP_TYPEQUERY:
1616     case OP_TYPEMINQUERY:
1617     case OP_TYPEPOSSTAR:
1618     case OP_TYPEPOSPLUS:
1619     case OP_TYPEPOSQUERY:
1620     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1621 ph10 220 break;
1622 ph10 221
1623     case OP_TYPEUPTO:
1624     case OP_TYPEMINUPTO:
1625     case OP_TYPEEXACT:
1626     case OP_TYPEPOSUPTO:
1627     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1628     break;
1629 ph10 220 }
1630    
1631 ph10 218 /* Add in the fixed length from the table */
1632 ph10 220
1633 nigel 77 code += _pcre_OP_lengths[c];
1634 ph10 220
1635 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1636     a multi-byte character. The length in the table is a minimum, so we have to
1637     arrange to skip the extra bytes. */
1638 ph10 220
1639 ph10 107 #ifdef SUPPORT_UTF8
1640 nigel 77 if (utf8) switch(c)
1641     {
1642     case OP_CHAR:
1643     case OP_CHARNC:
1644     case OP_EXACT:
1645     case OP_UPTO:
1646     case OP_MINUPTO:
1647 nigel 93 case OP_POSUPTO:
1648 nigel 77 case OP_STAR:
1649     case OP_MINSTAR:
1650 nigel 93 case OP_POSSTAR:
1651 nigel 77 case OP_PLUS:
1652     case OP_MINPLUS:
1653 nigel 93 case OP_POSPLUS:
1654 nigel 77 case OP_QUERY:
1655     case OP_MINQUERY:
1656 nigel 93 case OP_POSQUERY:
1657     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1658 nigel 77 break;
1659     }
1660 ph10 369 #else
1661     (void)(utf8); /* Keep compiler happy by referencing function argument */
1662 ph10 111 #endif
1663 nigel 77 }
1664     }
1665     }
1666    
1667    
1668    
1669     /*************************************************
1670     * Scan compiled regex for recursion reference *
1671     *************************************************/
1672    
1673     /* This little function scans through a compiled pattern until it finds an
1674     instance of OP_RECURSE.
1675    
1676     Arguments:
1677     code points to start of expression
1678     utf8 TRUE in UTF-8 mode
1679    
1680     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1681     */
1682    
1683     static const uschar *
1684     find_recurse(const uschar *code, BOOL utf8)
1685     {
1686     for (;;)
1687     {
1688     register int c = *code;
1689     if (c == OP_END) return NULL;
1690 nigel 91 if (c == OP_RECURSE) return code;
1691 ph10 220
1692 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1693     map. This includes negated single high-valued characters. The length in
1694     the table is zero; the actual length is stored in the compiled code. */
1695    
1696     if (c == OP_XCLASS) code += GET(code, 1);
1697    
1698 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1699     repeated character types, we have to test for \p and \P, which have an extra
1700 ph10 218 two bytes of parameters. */
1701 nigel 91
1702 nigel 77 else
1703     {
1704 ph10 218 switch(c)
1705     {
1706     case OP_TYPESTAR:
1707     case OP_TYPEMINSTAR:
1708     case OP_TYPEPLUS:
1709     case OP_TYPEMINPLUS:
1710     case OP_TYPEQUERY:
1711     case OP_TYPEMINQUERY:
1712     case OP_TYPEPOSSTAR:
1713     case OP_TYPEPOSPLUS:
1714     case OP_TYPEPOSQUERY:
1715     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1716 ph10 220 break;
1717 ph10 221
1718     case OP_TYPEPOSUPTO:
1719     case OP_TYPEUPTO:
1720     case OP_TYPEMINUPTO:
1721     case OP_TYPEEXACT:
1722     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1723     break;
1724 ph10 220 }
1725    
1726 ph10 218 /* Add in the fixed length from the table */
1727    
1728 nigel 77 code += _pcre_OP_lengths[c];
1729 ph10 220
1730 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1731     by a multi-byte character. The length in the table is a minimum, so we have
1732     to arrange to skip the extra bytes. */
1733 ph10 220
1734 ph10 107 #ifdef SUPPORT_UTF8
1735 nigel 77 if (utf8) switch(c)
1736     {
1737     case OP_CHAR:
1738     case OP_CHARNC:
1739     case OP_EXACT:
1740     case OP_UPTO:
1741     case OP_MINUPTO:
1742 nigel 93 case OP_POSUPTO:
1743 nigel 77 case OP_STAR:
1744     case OP_MINSTAR:
1745 nigel 93 case OP_POSSTAR:
1746 nigel 77 case OP_PLUS:
1747     case OP_MINPLUS:
1748 nigel 93 case OP_POSPLUS:
1749 nigel 77 case OP_QUERY:
1750     case OP_MINQUERY:
1751 nigel 93 case OP_POSQUERY:
1752     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1753 nigel 77 break;
1754     }
1755 ph10 369 #else
1756     (void)(utf8); /* Keep compiler happy by referencing function argument */
1757 ph10 111 #endif
1758 nigel 77 }
1759     }
1760     }
1761    
1762    
1763    
1764     /*************************************************
1765     * Scan compiled branch for non-emptiness *
1766     *************************************************/
1767    
1768     /* This function scans through a branch of a compiled pattern to see whether it
1769 nigel 93 can match the empty string or not. It is called from could_be_empty()
1770     below and from compile_branch() when checking for an unlimited repeat of a
1771     group that can match nothing. Note that first_significant_code() skips over
1772 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1773     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1774     bracket whose current branch will already have been scanned.
1775 nigel 77
1776     Arguments:
1777     code points to start of search
1778     endcode points to where to stop
1779     utf8 TRUE if in UTF8 mode
1780    
1781     Returns: TRUE if what is matched could be empty
1782     */
1783    
1784     static BOOL
1785     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1786     {
1787     register int c;
1788 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1789 nigel 77 code < endcode;
1790     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1791     {
1792     const uschar *ccode;
1793    
1794     c = *code;
1795 ph10 286
1796     /* Skip over forward assertions; the other assertions are skipped by
1797 ph10 282 first_significant_code() with a TRUE final argument. */
1798 ph10 286
1799 ph10 282 if (c == OP_ASSERT)
1800 ph10 286 {
1801 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1802     c = *code;
1803     continue;
1804 ph10 286 }
1805 ph10 172
1806 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1807 nigel 77
1808 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1809 ph10 170 {
1810 ph10 172 code += _pcre_OP_lengths[c];
1811 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1812     c = *code;
1813     continue;
1814     }
1815    
1816     /* For other groups, scan the branches. */
1817 ph10 172
1818 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1819 nigel 77 {
1820     BOOL empty_branch;
1821     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1822 ph10 406
1823     /* If a conditional group has only one branch, there is a second, implied,
1824 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1825     Otherwise, scan the individual branches of the group. */
1826 ph10 406
1827 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1828 nigel 77 code += GET(code, 1);
1829 ph10 395 else
1830 ph10 406 {
1831 ph10 395 empty_branch = FALSE;
1832     do
1833     {
1834     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1835     empty_branch = TRUE;
1836     code += GET(code, 1);
1837     }
1838     while (*code == OP_ALT);
1839     if (!empty_branch) return FALSE; /* All branches are non-empty */
1840 nigel 77 }
1841 ph10 406
1842 ph10 172 c = *code;
1843 nigel 93 continue;
1844 nigel 77 }
1845    
1846 nigel 93 /* Handle the other opcodes */
1847    
1848     switch (c)
1849 nigel 77 {
1850 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1851     cannot be represented just by a bit map. This includes negated single
1852     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1853 ph10 220 actual length is stored in the compiled code, so we must update "code"
1854 ph10 216 here. */
1855 nigel 77
1856     #ifdef SUPPORT_UTF8
1857     case OP_XCLASS:
1858 ph10 216 ccode = code += GET(code, 1);
1859 nigel 77 goto CHECK_CLASS_REPEAT;
1860     #endif
1861    
1862     case OP_CLASS:
1863     case OP_NCLASS:
1864     ccode = code + 33;
1865    
1866     #ifdef SUPPORT_UTF8
1867     CHECK_CLASS_REPEAT:
1868     #endif
1869    
1870     switch (*ccode)
1871     {
1872     case OP_CRSTAR: /* These could be empty; continue */
1873     case OP_CRMINSTAR:
1874     case OP_CRQUERY:
1875     case OP_CRMINQUERY:
1876     break;
1877    
1878     default: /* Non-repeat => class must match */
1879     case OP_CRPLUS: /* These repeats aren't empty */
1880     case OP_CRMINPLUS:
1881     return FALSE;
1882    
1883     case OP_CRRANGE:
1884     case OP_CRMINRANGE:
1885     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1886     break;
1887     }
1888     break;
1889    
1890     /* Opcodes that must match a character */
1891    
1892     case OP_PROP:
1893     case OP_NOTPROP:
1894     case OP_EXTUNI:
1895     case OP_NOT_DIGIT:
1896     case OP_DIGIT:
1897     case OP_NOT_WHITESPACE:
1898     case OP_WHITESPACE:
1899     case OP_NOT_WORDCHAR:
1900     case OP_WORDCHAR:
1901     case OP_ANY:
1902 ph10 345 case OP_ALLANY:
1903 nigel 77 case OP_ANYBYTE:
1904     case OP_CHAR:
1905     case OP_CHARNC:
1906     case OP_NOT:
1907     case OP_PLUS:
1908     case OP_MINPLUS:
1909 nigel 93 case OP_POSPLUS:
1910 nigel 77 case OP_EXACT:
1911     case OP_NOTPLUS:
1912     case OP_NOTMINPLUS:
1913 nigel 93 case OP_NOTPOSPLUS:
1914 nigel 77 case OP_NOTEXACT:
1915     case OP_TYPEPLUS:
1916     case OP_TYPEMINPLUS:
1917 nigel 93 case OP_TYPEPOSPLUS:
1918 nigel 77 case OP_TYPEEXACT:
1919     return FALSE;
1920 ph10 227
1921     /* These are going to continue, as they may be empty, but we have to
1922     fudge the length for the \p and \P cases. */
1923    
1924 ph10 224 case OP_TYPESTAR:
1925     case OP_TYPEMINSTAR:
1926     case OP_TYPEPOSSTAR:
1927     case OP_TYPEQUERY:
1928     case OP_TYPEMINQUERY:
1929     case OP_TYPEPOSQUERY:
1930     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1931 ph10 227 break;
1932    
1933 ph10 224 /* Same for these */
1934 ph10 227
1935 ph10 224 case OP_TYPEUPTO:
1936     case OP_TYPEMINUPTO:
1937     case OP_TYPEPOSUPTO:
1938     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1939     break;
1940 nigel 77
1941     /* End of branch */
1942    
1943     case OP_KET:
1944     case OP_KETRMAX:
1945     case OP_KETRMIN:
1946     case OP_ALT:
1947     return TRUE;
1948    
1949 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1950     MINUPTO, and POSUPTO may be followed by a multibyte character */
1951 nigel 77
1952     #ifdef SUPPORT_UTF8
1953     case OP_STAR:
1954     case OP_MINSTAR:
1955 nigel 93 case OP_POSSTAR:
1956 nigel 77 case OP_QUERY:
1957     case OP_MINQUERY:
1958 nigel 93 case OP_POSQUERY:
1959 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1960     break;
1961 ph10 461
1962 nigel 77 case OP_UPTO:
1963     case OP_MINUPTO:
1964 nigel 93 case OP_POSUPTO:
1965 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1966 nigel 77 break;
1967     #endif
1968     }
1969     }
1970    
1971     return TRUE;
1972     }
1973    
1974    
1975    
1976     /*************************************************
1977     * Scan compiled regex for non-emptiness *
1978     *************************************************/
1979    
1980     /* This function is called to check for left recursive calls. We want to check
1981     the current branch of the current pattern to see if it could match the empty
1982     string. If it could, we must look outwards for branches at other levels,
1983     stopping when we pass beyond the bracket which is the subject of the recursion.
1984    
1985     Arguments:
1986     code points to start of the recursion
1987     endcode points to where to stop (current RECURSE item)
1988     bcptr points to the chain of current (unclosed) branch starts
1989     utf8 TRUE if in UTF-8 mode
1990    
1991     Returns: TRUE if what is matched could be empty
1992     */
1993    
1994     static BOOL
1995     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1996     BOOL utf8)
1997     {
1998 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
1999 nigel 77 {
2000 ph10 487 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8))
2001 ph10 475 return FALSE;
2002 nigel 77 bcptr = bcptr->outer;
2003     }
2004     return TRUE;
2005     }
2006    
2007    
2008    
2009     /*************************************************
2010     * Check for POSIX class syntax *
2011     *************************************************/
2012    
2013     /* This function is called when the sequence "[:" or "[." or "[=" is
2014 ph10 295 encountered in a character class. It checks whether this is followed by a
2015 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2016 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2017 nigel 77
2018 ph10 298 Originally, this function only recognized a sequence of letters between the
2019     terminators, but it seems that Perl recognizes any sequence of characters,
2020     though of course unknown POSIX names are subsequently rejected. Perl gives an
2021     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2022     didn't consider this to be a POSIX class. Likewise for [:1234:].
2023 ph10 295
2024 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2025     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2026     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2027     below handles the special case of \], but does not try to do any other escape
2028     processing. This makes it different from Perl for cases such as [:l\ower:]
2029 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2030 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2031 ph10 295 I think.
2032    
2033     Arguments:
2034 nigel 77 ptr pointer to the initial [
2035     endptr where to return the end pointer
2036    
2037     Returns: TRUE or FALSE
2038     */
2039    
2040     static BOOL
2041 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2042 nigel 77 {
2043     int terminator; /* Don't combine these lines; the Solaris cc */
2044     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2045 ph10 295 for (++ptr; *ptr != 0; ptr++)
2046 nigel 77 {
2047 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2048 ph10 298 {
2049 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2050     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2051 ph10 295 {
2052     *endptr = ptr;
2053     return TRUE;
2054 ph10 298 }
2055     }
2056     }
2057 nigel 77 return FALSE;
2058     }
2059    
2060    
2061    
2062    
2063     /*************************************************
2064     * Check POSIX class name *
2065     *************************************************/
2066    
2067     /* This function is called to check the name given in a POSIX-style class entry
2068     such as [:alnum:].
2069    
2070     Arguments:
2071     ptr points to the first letter
2072     len the length of the name
2073    
2074     Returns: a value representing the name, or -1 if unknown
2075     */
2076    
2077     static int
2078     check_posix_name(const uschar *ptr, int len)
2079     {
2080 ph10 240 const char *pn = posix_names;
2081 nigel 77 register int yield = 0;
2082     while (posix_name_lengths[yield] != 0)
2083     {
2084     if (len == posix_name_lengths[yield] &&
2085 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2086 ph10 243 pn += posix_name_lengths[yield] + 1;
2087 nigel 77 yield++;
2088     }
2089     return -1;
2090     }
2091    
2092    
2093     /*************************************************
2094     * Adjust OP_RECURSE items in repeated group *
2095     *************************************************/
2096    
2097     /* OP_RECURSE items contain an offset from the start of the regex to the group
2098     that is referenced. This means that groups can be replicated for fixed
2099     repetition simply by copying (because the recursion is allowed to refer to
2100     earlier groups that are outside the current group). However, when a group is
2101 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2102     inserted before it, after it has been compiled. This means that any OP_RECURSE
2103     items within it that refer to the group itself or any contained groups have to
2104     have their offsets adjusted. That one of the jobs of this function. Before it
2105     is called, the partially compiled regex must be temporarily terminated with
2106     OP_END.
2107 nigel 77
2108 nigel 93 This function has been extended with the possibility of forward references for
2109     recursions and subroutine calls. It must also check the list of such references
2110     for the group we are dealing with. If it finds that one of the recursions in
2111     the current group is on this list, it adjusts the offset in the list, not the
2112     value in the reference (which is a group number).
2113    
2114 nigel 77 Arguments:
2115     group points to the start of the group
2116     adjust the amount by which the group is to be moved
2117     utf8 TRUE in UTF-8 mode
2118     cd contains pointers to tables etc.
2119 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2120 nigel 77
2121     Returns: nothing
2122     */
2123    
2124     static void
2125 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2126     uschar *save_hwm)
2127 nigel 77 {
2128     uschar *ptr = group;
2129 ph10 224
2130 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2131     {
2132 nigel 93 int offset;
2133     uschar *hc;
2134    
2135     /* See if this recursion is on the forward reference list. If so, adjust the
2136     reference. */
2137 ph10 345
2138 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2139     {
2140     offset = GET(hc, 0);
2141     if (cd->start_code + offset == ptr + 1)
2142     {
2143     PUT(hc, 0, offset + adjust);
2144     break;
2145     }
2146     }
2147    
2148     /* Otherwise, adjust the recursion offset if it's after the start of this
2149     group. */
2150    
2151     if (hc >= cd->hwm)
2152     {
2153     offset = GET(ptr, 1);
2154     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2155     }
2156    
2157 nigel 77 ptr += 1 + LINK_SIZE;
2158     }
2159     }
2160    
2161    
2162    
2163     /*************************************************
2164     * Insert an automatic callout point *
2165     *************************************************/
2166    
2167     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2168     callout points before each pattern item.
2169    
2170     Arguments:
2171     code current code pointer
2172     ptr current pattern pointer
2173     cd pointers to tables etc
2174    
2175     Returns: new code pointer
2176     */
2177    
2178     static uschar *
2179     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2180     {
2181     *code++ = OP_CALLOUT;
2182     *code++ = 255;
2183     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2184     PUT(code, LINK_SIZE, 0); /* Default length */
2185     return code + 2*LINK_SIZE;
2186     }
2187    
2188    
2189    
2190     /*************************************************
2191     * Complete a callout item *
2192     *************************************************/
2193    
2194     /* A callout item contains the length of the next item in the pattern, which
2195     we can't fill in till after we have reached the relevant point. This is used
2196     for both automatic and manual callouts.
2197    
2198     Arguments:
2199     previous_callout points to previous callout item
2200     ptr current pattern pointer
2201     cd pointers to tables etc
2202    
2203     Returns: nothing
2204     */
2205    
2206     static void
2207     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2208     {
2209     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2210     PUT(previous_callout, 2 + LINK_SIZE, length);
2211     }
2212    
2213    
2214    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), find
the next run of consecutive characters inside it whose "other case" values are
themselves consecutive. Characters that are their own other case are passed
over first. Each call yields one such run and moves the start value on, so the
function can be called repeatedly until it returns FALSE.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int othercase = 0;
unsigned int expected;

/* Find the first character that has a different other case */

while (c <= d)
  {
  othercase = UCD_OTHERCASE(c);
  if (othercase != c) break;
  c++;
  }

if (c > d) return FALSE;

*ocptr = othercase;
expected = othercase + 1;

/* Extend the run for as long as the other-case values stay contiguous */

c++;
while (c <= d && UCD_OTHERCASE(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2260    
2261    
2262 nigel 93
2263 nigel 77 /*************************************************
2264 nigel 93 * Check if auto-possessifying is possible *
2265     *************************************************/
2266    
2267     /* This function is called for unlimited repeats of certain items, to see
2268     whether the next thing could possibly match the repeated item. If not, it makes
2269     sense to automatically possessify the repeated item.
2270    
2271     Arguments:
2272     op_code the repeated op code
2273     this data for this item, depends on the opcode
2274     utf8 TRUE in UTF-8 mode
2275     utf8_char used for utf8 character bytes, NULL if not relevant
2276     ptr next character in pattern
2277     options options bits
2278     cd contains pointers to tables etc.
2279    
2280     Returns: TRUE if possessifying is wanted
2281     */
2282    
2283     static BOOL
2284     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2285     const uschar *ptr, int options, compile_data *cd)
2286     {
2287     int next;
2288    
2289     /* Skip whitespace and comments in extended mode */
2290    
2291     if ((options & PCRE_EXTENDED) != 0)
2292     {
2293     for (;;)
2294     {
2295     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2296 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2297 nigel 93 {
2298     while (*(++ptr) != 0)
2299     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2300     }
2301     else break;
2302     }
2303     }
2304    
2305     /* If the next item is one that we can handle, get its value. A non-negative
2306     value is a character, a negative value is an escape value. */
2307    
2308 ph10 391 if (*ptr == CHAR_BACKSLASH)
2309 nigel 93 {
2310     int temperrorcode = 0;
2311     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2312     if (temperrorcode != 0) return FALSE;
2313     ptr++; /* Point after the escape sequence */
2314     }
2315    
2316     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2317     {
2318     #ifdef SUPPORT_UTF8
2319     if (utf8) { GETCHARINC(next, ptr); } else
2320     #endif
2321     next = *ptr++;
2322     }
2323    
2324     else return FALSE;
2325    
2326     /* Skip whitespace and comments in extended mode */
2327    
2328     if ((options & PCRE_EXTENDED) != 0)
2329     {
2330     for (;;)
2331     {
2332     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2333 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2334 nigel 93 {
2335     while (*(++ptr) != 0)
2336     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2337     }
2338     else break;
2339     }
2340     }
2341    
2342     /* If the next thing is itself optional, we have to give up. */
2343    
2344 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2345 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2346     return FALSE;
2347 nigel 93
2348     /* Now compare the next item with the previous opcode. If the previous is a
2349     positive single character match, "item" either contains the character or, if
2350     "item" is greater than 127 in utf8 mode, the character's bytes are in
2351     utf8_char. */
2352    
2353    
2354     /* Handle cases when the next item is a character. */
2355    
2356     if (next >= 0) switch(op_code)
2357     {
2358     case OP_CHAR:
2359     #ifdef SUPPORT_UTF8
2360     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2361 ph10 369 #else
2362     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2363 nigel 93 #endif
2364     return item != next;
2365    
2366     /* For CHARNC (caseless character) we must check the other case. If we have
2367     Unicode property support, we can use it to test the other case of
2368     high-valued characters. */
2369    
2370     case OP_CHARNC:
2371     #ifdef SUPPORT_UTF8
2372     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2373     #endif
2374     if (item == next) return FALSE;
2375     #ifdef SUPPORT_UTF8
2376     if (utf8)
2377     {
2378     unsigned int othercase;
2379     if (next < 128) othercase = cd->fcc[next]; else
2380     #ifdef SUPPORT_UCP
2381 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2382 nigel 93 #else
2383     othercase = NOTACHAR;
2384     #endif
2385     return (unsigned int)item != othercase;
2386     }
2387     else
2388     #endif /* SUPPORT_UTF8 */
2389     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2390    
2391     /* For OP_NOT, "item" must be a single-byte character. */
2392    
2393     case OP_NOT:
2394     if (item == next) return TRUE;
2395     if ((options & PCRE_CASELESS) == 0) return FALSE;
2396     #ifdef SUPPORT_UTF8
2397     if (utf8)
2398     {
2399     unsigned int othercase;
2400     if (next < 128) othercase = cd->fcc[next]; else
2401     #ifdef SUPPORT_UCP
2402 ph10 349 othercase = UCD_OTHERCASE(next);
2403 nigel 93 #else
2404     othercase = NOTACHAR;
2405     #endif
2406     return (unsigned int)item == othercase;
2407     }
2408     else
2409     #endif /* SUPPORT_UTF8 */
2410     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2411    
2412     case OP_DIGIT:
2413     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2414    
2415     case OP_NOT_DIGIT:
2416     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2417    
2418     case OP_WHITESPACE:
2419     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2420    
2421     case OP_NOT_WHITESPACE:
2422     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2423    
2424     case OP_WORDCHAR:
2425     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2426    
2427     case OP_NOT_WORDCHAR:
2428     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2429    
2430 ph10 180 case OP_HSPACE:
2431     case OP_NOT_HSPACE:
2432     switch(next)
2433     {
2434     case 0x09:
2435     case 0x20:
2436     case 0xa0:
2437     case 0x1680:
2438     case 0x180e:
2439     case 0x2000:
2440     case 0x2001:
2441     case 0x2002:
2442     case 0x2003:
2443     case 0x2004:
2444     case 0x2005:
2445     case 0x2006:
2446     case 0x2007:
2447     case 0x2008:
2448     case 0x2009:
2449     case 0x200A:
2450     case 0x202f:
2451     case 0x205f:
2452     case 0x3000:
2453     return op_code != OP_HSPACE;
2454     default:
2455     return op_code == OP_HSPACE;
2456     }
2457    
2458     case OP_VSPACE:
2459     case OP_NOT_VSPACE:
2460     switch(next)
2461     {
2462     case 0x0a:
2463     case 0x0b:
2464     case 0x0c:
2465     case 0x0d:
2466     case 0x85:
2467     case 0x2028:
2468     case 0x2029:
2469     return op_code != OP_VSPACE;
2470     default:
2471     return op_code == OP_VSPACE;
2472     }
2473    
2474 nigel 93 default:
2475     return FALSE;
2476     }
2477    
2478    
2479     /* Handle the case when the next item is \d, \s, etc. */
2480    
2481     switch(op_code)
2482     {
2483     case OP_CHAR:
2484     case OP_CHARNC:
2485     #ifdef SUPPORT_UTF8
2486     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2487     #endif
2488     switch(-next)
2489     {
2490     case ESC_d:
2491     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2492    
2493     case ESC_D:
2494     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2495    
2496     case ESC_s:
2497     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2498    
2499     case ESC_S:
2500     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2501    
2502     case ESC_w:
2503     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2504    
2505     case ESC_W:
2506     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2507 ph10 182
2508 ph10 180 case ESC_h:
2509     case ESC_H:
2510     switch(item)
2511     {
2512     case 0x09:
2513     case 0x20:
2514     case 0xa0:
2515     case 0x1680:
2516     case 0x180e:
2517     case 0x2000:
2518     case 0x2001:
2519     case 0x2002:
2520     case 0x2003:
2521     case 0x2004:
2522     case 0x2005:
2523     case 0x2006:
2524     case 0x2007:
2525     case 0x2008:
2526     case 0x2009:
2527     case 0x200A:
2528     case 0x202f:
2529     case 0x205f:
2530     case 0x3000:
2531     return -next != ESC_h;
2532     default:
2533     return -next == ESC_h;
2534 ph10 182 }
2535    
2536 ph10 180 case ESC_v:
2537     case ESC_V:
2538     switch(item)
2539     {
2540     case 0x0a:
2541     case 0x0b:
2542     case 0x0c:
2543     case 0x0d:
2544     case 0x85:
2545     case 0x2028:
2546     case 0x2029:
2547     return -next != ESC_v;
2548     default:
2549     return -next == ESC_v;
2550 ph10 182 }
2551 nigel 93
2552     default:
2553     return FALSE;
2554     }
2555    
2556     case OP_DIGIT:
2557 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2558     next == -ESC_h || next == -ESC_v;
2559 nigel 93
2560     case OP_NOT_DIGIT:
2561     return next == -ESC_d;
2562    
2563     case OP_WHITESPACE:
2564     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2565    
2566     case OP_NOT_WHITESPACE:
2567 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2568 nigel 93
2569 ph10 180 case OP_HSPACE:
2570     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2571    
2572     case OP_NOT_HSPACE:
2573     return next == -ESC_h;
2574 ph10 182
2575 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2576 ph10 182 case OP_VSPACE:
2577 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2578    
2579     case OP_NOT_VSPACE:
2580 ph10 182 return next == -ESC_v;
2581 ph10 180
2582 nigel 93 case OP_WORDCHAR:
2583 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2584 nigel 93
2585     case OP_NOT_WORDCHAR:
2586     return next == -ESC_w || next == -ESC_d;
2587 ph10 182
2588 nigel 93 default:
2589     return FALSE;
2590     }
2591    
2592     /* Control does not reach here */
2593     }
2594    
2595    
2596    
2597     /*************************************************
2598 nigel 77 * Compile one branch *
2599     *************************************************/
2600    
2601 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2602 nigel 77 changed during the branch, the pointer is used to change the external options
2603 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2604     to find out the amount of memory needed, as well as during the real compile
2605     phase. The value of lengthptr distinguishes the two phases.
2606 nigel 77
2607     Arguments:
2608     optionsptr pointer to the option bits
2609     codeptr points to the pointer to the current code point
2610     ptrptr points to the current pattern pointer
2611     errorcodeptr points to error code variable
2612     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2613     reqbyteptr set to the last literal character required, else < 0
2614     bcptr points to current branch chain
2615     cd contains pointers to tables etc.
2616 nigel 93 lengthptr NULL during the real compile phase
2617     points to length accumulator during pre-compile phase
2618 nigel 77
2619     Returns: TRUE on success
2620     FALSE, with *errorcodeptr set non-zero on error
2621     */
2622    
2623     static BOOL
2624 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2625     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2626     compile_data *cd, int *lengthptr)
2627 nigel 77 {
2628     int repeat_type, op_type;
2629     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2630     int bravalue = 0;
2631     int greedy_default, greedy_non_default;
2632     int firstbyte, reqbyte;
2633     int zeroreqbyte, zerofirstbyte;
2634     int req_caseopt, reqvary, tempreqvary;
2635     int options = *optionsptr;
2636     int after_manual_callout = 0;
2637 nigel 93 int length_prevgroup = 0;
2638 nigel 77 register int c;
2639     register uschar *code = *codeptr;
2640 nigel 93 uschar *last_code = code;
2641     uschar *orig_code = code;
2642 nigel 77 uschar *tempcode;
2643     BOOL inescq = FALSE;
2644     BOOL groupsetfirstbyte = FALSE;
2645     const uschar *ptr = *ptrptr;
2646     const uschar *tempptr;
2647     uschar *previous = NULL;
2648     uschar *previous_callout = NULL;
2649 nigel 93 uschar *save_hwm = NULL;
2650 nigel 77 uschar classbits[32];
2651    
2652     #ifdef SUPPORT_UTF8
2653     BOOL class_utf8;
2654     BOOL utf8 = (options & PCRE_UTF8) != 0;
2655     uschar *class_utf8data;
2656 ph10 300 uschar *class_utf8data_base;
2657 nigel 77 uschar utf8_char[6];
2658     #else
2659     BOOL utf8 = FALSE;
2660 nigel 93 uschar *utf8_char = NULL;
2661 nigel 77 #endif
2662    
2663 ph10 475 #ifdef PCRE_DEBUG
2664 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2665     #endif
2666    
2667 nigel 77 /* Set up the default and non-default settings for greediness */
2668    
2669     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2670     greedy_non_default = greedy_default ^ 1;
2671    
2672     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2673     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2674     matches a non-fixed first char; reqbyte just remains unset if we never
2675     find one.
2676    
2677     When we hit a repeat whose minimum is zero, we may have to adjust these values
2678     to take the zero repeat into account. This is implemented by setting them to
2679     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2680     item types that can be repeated set these backoff variables appropriately. */
2681    
2682     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2683    
2684     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2685     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2686     value > 255. It is added into the firstbyte or reqbyte variables to record the
2687     case status of the value. This is used only for ASCII characters. */
2688    
2689     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2690    
2691     /* Switch on next character until the end of the branch */
2692    
2693     for (;; ptr++)
2694     {
2695     BOOL negate_class;
2696 ph10 286 BOOL should_flip_negation;
2697 nigel 77 BOOL possessive_quantifier;
2698     BOOL is_quantifier;
2699 nigel 93 BOOL is_recurse;
2700 ph10 180 BOOL reset_bracount;
2701 nigel 77 int class_charcount;
2702     int class_lastchar;
2703     int newoptions;
2704     int recno;
2705 ph10 172 int refsign;
2706 nigel 77 int skipbytes;
2707     int subreqbyte;
2708     int subfirstbyte;
2709 nigel 93 int terminator;
2710 nigel 77 int mclength;
2711     uschar mcbuffer[8];
2712    
2713 nigel 93 /* Get next byte in the pattern */
2714 nigel 77
2715     c = *ptr;
2716 ph10 345
2717 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2718     previous cycle of this loop. */
2719    
2720     if (lengthptr != NULL)
2721     {
2722 ph10 475 #ifdef PCRE_DEBUG
2723 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2724     #endif
2725     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2726     {
2727     *errorcodeptr = ERR52;
2728     goto FAILED;
2729     }
2730    
2731     /* There is at least one situation where code goes backwards: this is the
2732     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2733     the class is simply eliminated. However, it is created first, so we have to
2734     allow memory for it. Therefore, don't ever reduce the length at this point.
2735     */
2736    
2737     if (code < last_code) code = last_code;
2738 ph10 202
2739     /* Paranoid check for integer overflow */
2740    
2741     if (OFLOW_MAX - *lengthptr < code - last_code)
2742     {
2743     *errorcodeptr = ERR20;
2744     goto FAILED;
2745     }
2746    
2747 nigel 93 *lengthptr += code - last_code;
2748     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2749    
2750     /* If "previous" is set and it is not at the start of the work space, move
2751     it back to there, in order to avoid filling up the work space. Otherwise,
2752     if "previous" is NULL, reset the current code pointer to the start. */
2753    
2754     if (previous != NULL)
2755     {
2756     if (previous > orig_code)
2757     {
2758     memmove(orig_code, previous, code - previous);
2759     code -= previous - orig_code;
2760     previous = orig_code;
2761     }
2762     }
2763     else code = orig_code;
2764    
2765     /* Remember where this code item starts so we can pick up the length
2766     next time round. */
2767    
2768     last_code = code;
2769     }
2770    
2771     /* In the real compile phase, just check the workspace used by the forward
2772     reference list. */
2773    
2774     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2775     {
2776     *errorcodeptr = ERR52;
2777     goto FAILED;
2778     }
2779    
2780 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2781    
2782     if (inescq && c != 0)
2783     {
2784 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2785 nigel 77 {
2786     inescq = FALSE;
2787     ptr++;
2788     continue;
2789     }
2790     else
2791     {
2792     if (previous_callout != NULL)
2793     {
2794 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2795     complete_callout(previous_callout, ptr, cd);
2796 nigel 77 previous_callout = NULL;
2797     }
2798     if ((options & PCRE_AUTO_CALLOUT) != 0)
2799     {
2800     previous_callout = code;
2801     code = auto_callout(code, ptr, cd);
2802     }
2803     goto NORMAL_CHAR;
2804     }
2805     }
2806    
2807     /* Fill in length of a previous callout, except when the next thing is
2808     a quantifier. */
2809    
2810 ph10 392 is_quantifier =
2811 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2812     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2813 nigel 77
2814     if (!is_quantifier && previous_callout != NULL &&
2815     after_manual_callout-- <= 0)
2816     {
2817 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2818     complete_callout(previous_callout, ptr, cd);
2819 nigel 77 previous_callout = NULL;
2820     }
2821    
2822     /* In extended mode, skip white space and comments */
2823    
2824     if ((options & PCRE_EXTENDED) != 0)
2825     {
2826     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2827 ph10 391 if (c == CHAR_NUMBER_SIGN)
2828 nigel 77 {
2829 nigel 93 while (*(++ptr) != 0)
2830 nigel 91 {
2831 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2832 nigel 91 }
2833 nigel 93 if (*ptr != 0) continue;
2834    
2835 nigel 91 /* Else fall through to handle end of string */
2836     c = 0;
2837 nigel 77 }
2838     }
2839    
2840     /* No auto callout for quantifiers. */
2841    
2842     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2843     {
2844     previous_callout = code;
2845     code = auto_callout(code, ptr, cd);
2846     }
2847    
2848     switch(c)
2849     {
2850 nigel 93 /* ===================================================================*/
2851     case 0: /* The branch terminates at string end */
2852 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2853     case CHAR_RIGHT_PARENTHESIS:
2854 nigel 77 *firstbyteptr = firstbyte;
2855     *reqbyteptr = reqbyte;
2856     *codeptr = code;
2857     *ptrptr = ptr;
2858 nigel 93 if (lengthptr != NULL)
2859     {
2860 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2861     {
2862     *errorcodeptr = ERR20;
2863     goto FAILED;
2864     }
2865 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2866     DPRINTF((">> end branch\n"));
2867     }
2868 nigel 77 return TRUE;
2869    
2870 nigel 93
2871     /* ===================================================================*/
2872 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2873     the setting of any following char as a first character. */
2874    
2875 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2876 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2877     {
2878     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2879     }
2880     previous = NULL;
2881     *code++ = OP_CIRC;
2882     break;
2883    
2884 ph10 391 case CHAR_DOLLAR_SIGN:
2885 nigel 77 previous = NULL;
2886     *code++ = OP_DOLL;
2887     break;
2888    
2889     /* There can never be a first char if '.' is first, whatever happens about
2890     repeats. The value of reqbyte doesn't change either. */
2891    
2892 ph10 391 case CHAR_DOT:
2893 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2894     zerofirstbyte = firstbyte;
2895     zeroreqbyte = reqbyte;
2896     previous = code;
2897 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2898 nigel 77 break;
2899    
2900 nigel 93
2901     /* ===================================================================*/
2902 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2903     32-byte bitmap of the permitted characters, except in the special case
2904     where there is only one such character. For negated classes, we build the
2905     map as usual, then invert it at the end. However, we use a different opcode
2906     so that data characters > 255 can be handled correctly.
2907 nigel 77
2908     If the class contains characters outside the 0-255 range, a different
2909     opcode is compiled. It may optionally have a bit map for characters < 256,
2910     but those above are explicitly listed afterwards. A flag byte tells
2911     whether the bitmap is present, and whether this is a negated class or not.
2912 ph10 345
2913 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2914     default (Perl) mode, it is treated as a data character. */
2915 ph10 345
2916 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2917 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2918     {
2919     *errorcodeptr = ERR64;
2920 ph10 345 goto FAILED;
2921 ph10 336 }
2922 ph10 345 goto NORMAL_CHAR;
2923 nigel 77
2924 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2925 nigel 77 previous = code;
2926    
2927     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2928     they are encountered at the top level, so we'll do that too. */
2929    
2930 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2931 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2932 ph10 295 check_posix_syntax(ptr, &tempptr))
2933 nigel 77 {
2934 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2935 nigel 77 goto FAILED;
2936     }
2937    
2938 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2939 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2940 ph10 205 skip them too. This makes for compatibility with Perl. */
2941 ph10 208
2942 ph10 205 negate_class = FALSE;
2943     for (;;)
2944 nigel 77 {
2945     c = *(++ptr);
2946 ph10 391 if (c == CHAR_BACKSLASH)
2947 ph10 205 {
2948 ph10 392 if (ptr[1] == CHAR_E)
2949 ph10 391 ptr++;
2950 ph10 392 else if (strncmp((const char *)ptr+1,
2951     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2952 ph10 391 ptr += 3;
2953 ph10 392 else
2954 ph10 391 break;
2955 ph10 205 }
2956 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2957 ph10 205 negate_class = TRUE;
2958     else break;
2959 ph10 208 }
2960 ph10 345
2961     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2962     an initial ']' is taken as a data character -- the code below handles
2963 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2964     [^] must match any character, so generate OP_ALLANY. */
2965 ph10 345
2966 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2967 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2968 ph10 341 {
2969     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2970     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2971     zerofirstbyte = firstbyte;
2972     break;
2973 ph10 345 }
2974 nigel 77
2975 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2976     negation flag at the end, so that support for characters > 255 works
2977 ph10 264 correctly (they are all included in the class). */
2978    
2979     should_flip_negation = FALSE;
2980    
2981 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2982 nigel 93 of just a single character (as long as it's < 256). However, for higher
2983     valued UTF-8 characters, we don't yet do any optimization. */
2984 nigel 77
2985     class_charcount = 0;
2986     class_lastchar = -1;
2987    
2988 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2989     temporary bit of memory, in case the class contains only 1 character (less
2990     than 256), because in that case the compiled code doesn't use the bit map.
2991     */
2992    
2993     memset(classbits, 0, 32 * sizeof(uschar));
2994    
2995 nigel 77 #ifdef SUPPORT_UTF8
2996     class_utf8 = FALSE; /* No chars >= 256 */
2997 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2998 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2999 nigel 77 #endif
3000    
3001     /* Process characters until ] is reached. By writing this as a "do" it
3002 nigel 93 means that an initial ] is taken as a data character. At the start of the
3003     loop, c contains the first byte of the character. */
3004 nigel 77
3005 nigel 93 if (c != 0) do
3006 nigel 77 {
3007 nigel 93 const uschar *oldptr;
3008    
3009 nigel 77 #ifdef SUPPORT_UTF8
3010     if (utf8 && c > 127)
3011     { /* Braces are required because the */
3012     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3013     }
3014 ph10 309
3015 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3016 ph10 309 data and reset the pointer. This is so that very large classes that
3017 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3018 ph10 309 (which is on the stack). */
3019    
3020 ph10 300 if (lengthptr != NULL)
3021     {
3022     *lengthptr += class_utf8data - class_utf8data_base;
3023 ph10 309 class_utf8data = class_utf8data_base;
3024     }
3025    
3026 nigel 77 #endif
3027    
3028     /* Inside \Q...\E everything is literal except \E */
3029    
3030     if (inescq)
3031     {
3032 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3033 nigel 77 {
3034 nigel 93 inescq = FALSE; /* Reset literal state */
3035     ptr++; /* Skip the 'E' */
3036     continue; /* Carry on with next */
3037 nigel 77 }
3038 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3039 nigel 77 }
3040    
3041     /* Handle POSIX class names. Perl allows a negation extension of the
3042     form [:^name:]. A square bracket that doesn't match the syntax is
3043     treated as a literal. We also recognize the POSIX constructions
3044     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3045     5.6 and 5.8 do. */
3046    
3047 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3048 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3049 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3050 nigel 77 {
3051     BOOL local_negate = FALSE;
3052 nigel 87 int posix_class, taboffset, tabopt;
3053 nigel 77 register const uschar *cbits = cd->cbits;
3054 nigel 87 uschar pbits[32];
3055 nigel 77
3056 ph10 391 if (ptr[1] != CHAR_COLON)
3057 nigel 77 {
3058     *errorcodeptr = ERR31;
3059     goto FAILED;
3060     }
3061    
3062     ptr += 2;
3063 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3064 nigel 77 {
3065     local_negate = TRUE;
3066 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3067 nigel 77 ptr++;
3068     }
3069    
3070     posix_class = check_posix_name(ptr, tempptr - ptr);
3071     if (posix_class < 0)
3072     {
3073     *errorcodeptr = ERR30;
3074     goto FAILED;
3075     }
3076    
3077     /* If matching is caseless, upper and lower are converted to
3078     alpha. This relies on the fact that the class table starts with
3079     alpha, lower, upper as the first 3 entries. */
3080    
3081     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3082     posix_class = 0;
3083    
3084 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3085     because we may be adding and subtracting from it, and we don't want to
3086     subtract bits that may be in the main map already. At the end we or the
3087     result into the bit map that is being built. */
3088 nigel 77
3089     posix_class *= 3;
3090 nigel 87
3091     /* Copy in the first table (always present) */
3092    
3093     memcpy(pbits, cbits + posix_class_maps[posix_class],
3094     32 * sizeof(uschar));
3095    
3096     /* If there is a second table, add or remove it as required. */
3097    
3098     taboffset = posix_class_maps[posix_class + 1];
3099     tabopt = posix_class_maps[posix_class + 2];
3100    
3101     if (taboffset >= 0)
3102 nigel 77 {
3103 nigel 87 if (tabopt >= 0)
3104     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3105 nigel 77 else
3106 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3107 nigel 77 }
3108    
3109 nigel 87 /* Now see if we need to remove any special characters. An option
3110     value of 1 removes vertical space and 2 removes underscore. */
3111    
3112     if (tabopt < 0) tabopt = -tabopt;
3113     if (tabopt == 1) pbits[1] &= ~0x3c;
3114     else if (tabopt == 2) pbits[11] &= 0x7f;
3115    
3116     /* Add the POSIX table or its complement into the main table that is
3117     being built and we are done. */
3118    
3119     if (local_negate)
3120     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3121     else
3122     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3123    
3124 nigel 77 ptr = tempptr + 1;
3125     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3126     continue; /* End of POSIX syntax handling */
3127     }
3128    
3129     /* Backslash may introduce a single character, or it may introduce one
3130 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3131     case. Inside a class (and only there) it is treated as backspace.
3132     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3133 ph10 205 to 'or' into the one we are building. We assume they have more than one
3134 nigel 77 character in them, so set class_charcount bigger than one. */
3135    
3136 ph10 391 if (c == CHAR_BACKSLASH)
3137 nigel 77 {
3138 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3139     if (*errorcodeptr != 0) goto FAILED;
3140 nigel 77
3141 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3142     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3143     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3144 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3145     {
3146 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3147 nigel 77 {
3148     ptr += 2; /* avoid empty string */
3149     }
3150     else inescq = TRUE;
3151     continue;
3152     }
3153 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3154 nigel 77
3155     if (c < 0)
3156     {
3157     register const uschar *cbits = cd->cbits;
3158     class_charcount += 2; /* Greater than 1 is what matters */
3159 nigel 93
3160     /* Save time by not doing this in the pre-compile phase. */
3161    
3162     if (lengthptr == NULL) switch (-c)
3163 nigel 77 {
3164     case ESC_d:
3165     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3166     continue;
3167    
3168     case ESC_D:
3169 ph10 286 should_flip_negation = TRUE;
3170 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3171     continue;
3172    
3173     case ESC_w:
3174     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3175     continue;
3176    
3177     case ESC_W:
3178 ph10 286 should_flip_negation = TRUE;
3179 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3180     continue;
3181    
3182     case ESC_s:
3183     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3184     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3185     continue;
3186    
3187     case ESC_S:
3188 ph10 286 should_flip_negation = TRUE;
3189 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3190     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3191     continue;
3192    
3193 nigel 93 default: /* Not recognized; fall through */
3194     break; /* Need "default" setting to stop compiler warning. */
3195     }
3196    
3197     /* In the pre-compile phase, just do the recognition. */
3198    
3199     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3200     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3201 ph10 180
3202 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3203     they use extra memory. */
3204 ph10 180
3205 ph10 178 if (-c == ESC_h)
3206     {
3207     SETBIT(classbits, 0x09); /* HT */
3208     SETBIT(classbits, 0x20); /* SPACE */
3209 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3210 ph10 178 #ifdef SUPPORT_UTF8
3211     if (utf8)
3212 ph10 180 {
3213 ph10 178 class_utf8 = TRUE;
3214     *class_utf8data++ = XCL_SINGLE;
3215 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3216 ph10 178 *class_utf8data++ = XCL_SINGLE;
3217 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3218     *class_utf8data++ = XCL_RANGE;
3219     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3220     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3221 ph10 178 *class_utf8data++ = XCL_SINGLE;
3222 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3223 ph10 178 *class_utf8data++ = XCL_SINGLE;
3224 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3225 ph10 178 *class_utf8data++ = XCL_SINGLE;
3226 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3227     }
3228     #endif
3229     continue;
3230     }
3231 nigel 93
3232 ph10 178 if (-c == ESC_H)
3233     {
3234     for (c = 0; c < 32; c++)
3235     {
3236     int x = 0xff;
3237     switch (c)
3238 ph10 180 {
3239 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3240     case 0x20/8: x ^= 1 << (0x20%8); break;
3241     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3242     default: break;
3243     }
3244     classbits[c] |= x;
3245 ph10 180 }
3246    
3247 ph10 178 #ifdef SUPPORT_UTF8
3248     if (utf8)
3249 ph10 180 {
3250 ph10 178 class_utf8 = TRUE;
3251 ph10 180 *class_utf8data++ = XCL_RANGE;
3252     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3253     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3254     *class_utf8data++ = XCL_RANGE;
3255     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3256     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3257     *class_utf8data++ = XCL_RANGE;
3258     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3259     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3260     *class_utf8data++ = XCL_RANGE;
3261     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3262     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3263     *class_utf8data++ = XCL_RANGE;
3264     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3265     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3266     *class_utf8data++ = XCL_RANGE;
3267     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3268     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3269     *class_utf8data++ = XCL_RANGE;
3270     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3271     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3272     }
3273     #endif
3274     continue;
3275     }
3276 ph10 178
3277     if (-c == ESC_v)
3278     {
3279     SETBIT(classbits, 0x0a); /* LF */
3280     SETBIT(classbits, 0x0b); /* VT */
3281 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3282     SETBIT(classbits, 0x0d); /* CR */
3283     SETBIT(classbits, 0x85); /* NEL */
3284 ph10 178 #ifdef SUPPORT_UTF8
3285     if (utf8)
3286 ph10 180 {
3287 ph10 178 class_utf8 = TRUE;
3288 ph10 180 *class_utf8data++ = XCL_RANGE;
3289     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3290     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3291     }
3292     #endif
3293     continue;
3294     }
3295 ph10 178
3296     if (-c == ESC_V)
3297     {
3298     for (c = 0; c < 32; c++)
3299     {
3300     int x = 0xff;
3301     switch (c)
3302 ph10 180 {
3303 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3304     x ^= 1 << (0x0b%8);
3305     x ^= 1 << (0x0c%8);
3306 ph10 180 x ^= 1 << (0x0d%8);
3307 ph10 178 break;
3308     case 0x85/8: x ^= 1 << (0x85%8); break;
3309     default: break;
3310     }
3311     classbits[c] |= x;
3312 ph10 180 }
3313    
3314 ph10 178 #ifdef SUPPORT_UTF8
3315     if (utf8)
3316 ph10 180 {
3317 ph10 178 class_utf8 = TRUE;
3318 ph10 180 *class_utf8data++ = XCL_RANGE;
3319     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3320     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3321     *class_utf8data++ = XCL_RANGE;
3322     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3323     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3324     }
3325     #endif
3326     continue;
3327     }
3328 ph10 178
3329 nigel 93 /* We need to deal with \P and \p in both phases. */
3330    
3331 nigel 77 #ifdef SUPPORT_UCP
3332 nigel 93 if (-c == ESC_p || -c == ESC_P)
3333     {
3334     BOOL negated;
3335     int pdata;
3336     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3337     if (ptype < 0) goto FAILED;
3338     class_utf8 = TRUE;
3339     *class_utf8data++ = ((-c == ESC_p) != negated)?
3340     XCL_PROP : XCL_NOTPROP;
3341     *class_utf8data++ = ptype;
3342     *class_utf8data++ = pdata;
3343     class_charcount -= 2; /* Not a < 256 character */
3344 nigel 77 continue;
3345 nigel 93 }
3346 nigel 77 #endif
3347 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3348     strict mode. By default, for compatibility with Perl, they are
3349     treated as literals. */
3350 nigel 77
3351 nigel 93 if ((options & PCRE_EXTRA) != 0)
3352     {
3353     *errorcodeptr = ERR7;
3354     goto FAILED;
3355     }
3356 nigel 77
3357 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3358     c = *ptr; /* Get the final character and fall through */
3359 nigel 77 }
3360    
3361     /* Fall through if we have a single character (c >= 0). This may be
3362 nigel 93 greater than 256 in UTF-8 mode. */
3363 nigel 77
3364     } /* End of backslash handling */
3365    
3366     /* A single character may be followed by '-' to form a range. However,
3367     Perl does not permit ']' to be the end of the range. A '-' character
3368 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3369     entirely. The code for handling \Q and \E is messy. */
3370 nigel 77
3371 nigel 93 CHECK_RANGE:
3372 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3373 nigel 77 {
3374 nigel 93 inescq = FALSE;
3375     ptr += 2;
3376     }
3377    
3378     oldptr = ptr;
3379 ph10 231
3380 ph10 230 /* Remember \r or \n */
3381 ph10 231
3382 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3383 ph10 231
3384 ph10 230 /* Check for range */
3385 nigel 93
3386 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3387 nigel 93 {
3388 nigel 77 int d;
3389     ptr += 2;
3390 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3391 nigel 77
3392 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3393     mode. */
3394    
3395 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3396 nigel 93 {
3397     ptr += 2;
3398 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3399 ph10 391 { ptr += 2; continue; }
3400 nigel 93 inescq = TRUE;
3401     break;
3402     }
3403    
3404 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3405 nigel 93 {
3406     ptr = oldptr;
3407     goto LONE_SINGLE_CHARACTER;
3408     }
3409    
3410 nigel 77 #ifdef SUPPORT_UTF8
3411     if (utf8)
3412     { /* Braces are required because the */
3413     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3414     }
3415     else
3416     #endif
3417     d = *ptr; /* Not UTF-8 mode */
3418    
3419     /* The second part of a range can be a single-character escape, but
3420     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3421     in such circumstances. */
3422    
3423 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3424 nigel 77 {
3425 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3426     if (*errorcodeptr != 0) goto FAILED;
3427 nigel 77
3428 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3429 nigel 93 special means the '-' was literal */
3430 nigel 77
3431     if (d < 0)
3432     {
3433 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3434     else if (d == -ESC_X) d = CHAR_X;
3435     else if (d == -ESC_R) d = CHAR_R; else
3436 nigel 77 {
3437 nigel 93 ptr = oldptr;
3438 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3439     }
3440     }
3441     }
3442    
3443 nigel 93 /* Check that the two values are in the correct order. Optimize
3444     one-character ranges */
3445 nigel 77
3446 nigel 93 if (d < c)
3447     {
3448     *errorcodeptr = ERR8;
3449     goto FAILED;
3450     }
3451    
3452 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3453    
3454 ph10 230 /* Remember \r or \n */
3455 ph10 231
3456 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3457 ph10 231
3458 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3459     matching, we have to use an XCLASS with extra data items. Caseless
3460     matching for characters > 127 is available only if UCP support is
3461     available. */
3462    
3463     #ifdef SUPPORT_UTF8
3464     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3465     {
3466     class_utf8 = TRUE;
3467    
3468     /* With UCP support, we can find the other case equivalents of
3469     the relevant characters. There may be several ranges. Optimize how
3470     they fit with the basic range. */
3471    
3472     #ifdef SUPPORT_UCP
3473     if ((options & PCRE_CASELESS) != 0)
3474     {
3475 nigel 93 unsigned int occ, ocd;
3476     unsigned int cc = c;
3477     unsigned int origd = d;
3478 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3479     {
3480 ph10 180 if (occ >= (unsigned int)c &&
3481     ocd <= (unsigned int)d)
3482 ph10 176 continue; /* Skip embedded ranges */
3483 nigel 77
3484 ph10 180 if (occ < (unsigned int)c &&
3485 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3486 nigel 77 { /* if there is overlap, */
3487     c = occ; /* noting that if occ < c */
3488     continue; /* we can't have ocd > d */
3489     } /* because a subrange is */
3490 ph10 180 if (ocd > (unsigned int)d &&
3491 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3492 nigel 77 { /* the basic range. */
3493     d = ocd;
3494     continue;
3495     }
3496    
3497     if (occ == ocd)
3498     {
3499     *class_utf8data++ = XCL_SINGLE;
3500     }
3501     else
3502     {
3503     *class_utf8data++ = XCL_RANGE;
3504     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3505     }
3506     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3507     }
3508     }
3509     #endif /* SUPPORT_UCP */
3510    
3511     /* Now record the original range, possibly modified for UCP caseless
3512     overlapping ranges. */
3513    
3514     *class_utf8data++ = XCL_RANGE;
3515     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3516     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3517    
3518     /* With UCP support, we are done. Without UCP support, there is no
3519     caseless matching for UTF-8 characters > 127; we can use the bit map
3520     for the smaller ones. */
3521    
3522     #ifdef SUPPORT_UCP
3523     continue; /* With next character in the class */
3524     #else
3525     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3526    
3527     /* Adjust upper limit and fall through to set up the map */
3528    
3529     d = 127;
3530    
3531     #endif /* SUPPORT_UCP */
3532     }
3533     #endif /* SUPPORT_UTF8 */
3534    
3535     /* We use the bit map for all cases when not in UTF-8 mode; else
3536     ranges that lie entirely within 0-127 when there is UCP support; else
3537     for partial ranges without UCP support. */
3538    
3539 nigel 93 class_charcount += d - c + 1;
3540     class_lastchar = d;
3541    
3542     /* We can save a bit of time by skipping this in the pre-compile. */
3543    
3544     if (lengthptr == NULL) for (; c <= d; c++)
3545 nigel 77 {
3546     classbits[c/8] |= (1 << (c&7));
3547     if ((options & PCRE_CASELESS) != 0)
3548     {
3549     int uc = cd->fcc[c]; /* flip case */
3550     classbits[uc/8] |= (1 << (uc&7));
3551     }
3552     }
3553    
3554     continue; /* Go get the next char in the class */
3555     }
3556    
3557     /* Handle a lone single character - we can get here for a normal
3558     non-escape char, or after \ that introduces a single character or for an
3559     apparent range that isn't. */
3560    
3561     LONE_SINGLE_CHARACTER:
3562 ph10 231
3563 nigel 77 /* Handle a character that cannot go in the bit map */
3564    
3565     #ifdef SUPPORT_UTF8
3566     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3567     {
3568     class_utf8 = TRUE;
3569     *class_utf8data++ = XCL_SINGLE;
3570     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3571    
3572     #ifdef SUPPORT_UCP
3573     if ((options & PCRE_CASELESS) != 0)
3574     {
3575 nigel 93 unsigned int othercase;
3576 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3577 nigel 77 {
3578     *class_utf8data++ = XCL_SINGLE;
3579     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3580     }
3581     }
3582     #endif /* SUPPORT_UCP */
3583    
3584     }
3585     else
3586     #endif /* SUPPORT_UTF8 */
3587    
3588     /* Handle a single-byte character */
3589     {
3590     classbits[c/8] |= (1 << (c&7));
3591     if ((options & PCRE_CASELESS) != 0)
3592     {
3593     c = cd->fcc[c]; /* flip case */
3594     classbits[c/8] |= (1 << (c&7));
3595     }
3596     class_charcount++;
3597     class_lastchar = c;
3598     }
3599     }
3600    
3601 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3602 nigel 77
3603 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3604 nigel 77
3605 nigel 93 if (c == 0) /* Missing terminating ']' */
3606     {
3607     *errorcodeptr = ERR6;
3608     goto FAILED;
3609     }
3610 ph10 231
3611    
3612 ph10 230 /* This code has been disabled because it would mean that \s counts as
3613     an explicit \r or \n reference, and that's not really what is wanted. Now
3614     we set the flag only if there is a literal "\r" or "\n" in the class. */
3615 ph10 227
3616 ph10 230 #if 0
3617 ph10 226 /* Remember whether \r or \n are in this class */
3618 ph10 227
3619 ph10 226 if (negate_class)
3620     {
3621 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3622 ph10 226 }
3623     else
3624     {
3625 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3626 ph10 227 }
3627 ph10 230 #endif
3628 ph10 227
3629 ph10 231
3630 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3631 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3632     use of \p or \P, in other words, no use of any XCLASS features, we can
3633     optimize.
3634    
3635 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3636     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3637     operate on single-bytes only. This is an historical hangover. Maybe one day
3638     we can tidy these opcodes to handle multi-byte characters.
3639 nigel 77
3640     The optimization throws away the bit map. We turn the item into a
3641     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3642     that OP_NOT does not support multibyte characters. In the positive case, it
3643     can cause firstbyte to be set. Otherwise, there can be no first char if
3644     this item is first, whatever repeat count may follow. In the case of
3645     reqbyte, save the previous value for reinstating. */
3646    
3647     #ifdef SUPPORT_UTF8
3648 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3649 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3650 nigel 77 #else
3651     if (class_charcount == 1)
3652     #endif
3653     {
3654     zeroreqbyte = reqbyte;
3655    
3656     /* The OP_NOT opcode works on one-byte characters only. */
3657    
3658     if (negate_class)
3659     {
3660     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3661     zerofirstbyte = firstbyte;
3662     *code++ = OP_NOT;
3663     *code++ = class_lastchar;
3664     break;
3665     }
3666    
3667     /* For a single, positive character, get the value into mcbuffer, and
3668     then we can handle this with the normal one-character code. */
3669    
3670     #ifdef SUPPORT_UTF8
3671     if (utf8 && class_lastchar > 127)
3672     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3673     else
3674     #endif
3675     {
3676     mcbuffer[0] = class_lastchar;
3677     mclength = 1;
3678     }
3679     goto ONE_CHAR;
3680     } /* End of 1-char optimization */
3681    
3682     /* The general case - not the one-char optimization. If this is the first
3683     thing in the branch, there can be no first char setting, whatever the
3684     repeat count. Any reqbyte setting must remain unchanged after any kind of
3685     repeat. */
3686    
3687     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3688     zerofirstbyte = firstbyte;
3689     zeroreqbyte = reqbyte;
3690    
3691     /* If there are characters with values > 255, we have to compile an
3692 ph10 286 extended class, with its own opcode, unless there was a negated special
3693     such as \S in the class, because in that case all characters > 255 are in
3694     the class, so any that were explicitly given as well can be ignored. If
3695 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3696     characters < 256, we can omit the bitmap in the actual compiled code. */
3697 nigel 77
3698     #ifdef SUPPORT_UTF8
3699 ph10 264 if (class_utf8 && !should_flip_negation)
3700 nigel 77 {
3701     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3702     *code++ = OP_XCLASS;
3703     code += LINK_SIZE;
3704     *code = negate_class? XCL_NOT : 0;
3705    
3706 nigel 93 /* If the map is required, move up the extra data to make room for it;
3707     otherwise just move the code pointer to the end of the extra data. */
3708 nigel 77
3709     if (class_charcount > 0)
3710     {
3711     *code++ |= XCL_MAP;
3712 nigel 93 memmove(code + 32, code, class_utf8data - code);
3713 nigel 77 memcpy(code, classbits, 32);
3714 nigel 93 code = class_utf8data + 32;
3715 nigel 77 }
3716 nigel 93 else code = class_utf8data;
3717 nigel 77
3718     /* Now fill in the complete length of the item */
3719    
3720     PUT(previous, 1, code - previous);
3721     break; /* End of class handling */
3722     }
3723     #endif
3724    
3725 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3726     OP_NCLASS, depending on whether the whole class was negated and whether
3727     there were negative specials such as \S in the class. Then copy the 32-byte
3728 ph10 264 map into the code vector, negating it if necessary. */
3729 ph10 286
3730 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3731 nigel 77 if (negate_class)
3732     {
3733 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3734     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3735 nigel 77 }
3736     else
3737     {
3738     memcpy(code, classbits, 32);
3739     }
3740     code += 32;
3741     break;
3742    
3743 nigel 93
3744     /* ===================================================================*/
3745 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3746     has been tested above. */
3747    
3748 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3749 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3750     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3751     if (*errorcodeptr != 0) goto FAILED;
3752     goto REPEAT;
3753    
3754 ph10 391 case CHAR_ASTERISK:
3755 nigel 77 repeat_min = 0;
3756     repeat_max = -1;
3757     goto REPEAT;
3758    
3759 ph10 391 case CHAR_PLUS:
3760 nigel 77 repeat_min = 1;
3761     repeat_max = -1;
3762     goto REPEAT;
3763    
3764 ph10 391 case CHAR_QUESTION_MARK:
3765 nigel 77 repeat_min = 0;
3766     repeat_max = 1;
3767    
3768     REPEAT:
3769     if (previous == NULL)
3770     {
3771     *errorcodeptr = ERR9;
3772     goto FAILED;
3773     }
3774    
3775     if (repeat_min == 0)
3776     {
3777     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3778     reqbyte = zeroreqbyte; /* Ditto */
3779     }
3780    
3781     /* Remember whether this is a variable length repeat */
3782    
3783     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3784    
3785     op_type = 0; /* Default single-char op codes */
3786     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3787    
3788     /* Save start of previous item, in case we have to move it up to make space
3789     for an inserted OP_ONCE for the additional '+' extension. */
3790    
3791     tempcode = previous;
3792    
3793     /* If the next character is '+', we have a possessive quantifier. This
3794     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3795     If the next character is '?' this is a minimizing repeat, by default,
3796     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3797     repeat type to the non-default. */
3798    
3799 ph10 391 if (ptr[1] == CHAR_PLUS)
3800 nigel 77 {
3801     repeat_type = 0; /* Force greedy */
3802     possessive_quantifier = TRUE;
3803     ptr++;
3804     }
3805 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3806 nigel 77 {
3807     repeat_type = greedy_non_default;
3808     ptr++;
3809     }
3810     else repeat_type = greedy_default;
3811    
3812     /* If previous was a character match, abolish the item and generate a
3813     repeat item instead. If a char item has a minimum of more than one, ensure
3814     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3815     the first thing in a branch because the x will have gone into firstbyte
3816     instead. */
3817    
3818     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3819     {
3820     /* Deal with UTF-8 characters that take up more than one byte. It's
3821     easier to write this out separately than try to macrify it. Use c to
3822     hold the length of the character in bytes, plus 0x80 to flag that it's a
3823     length rather than a small character. */
3824    
3825     #ifdef SUPPORT_UTF8
3826     if (utf8 && (code[-1] & 0x80) != 0)
3827     {
3828     uschar *lastchar = code - 1;
3829     while((*lastchar & 0xc0) == 0x80) lastchar--;
3830     c = code - lastchar; /* Length of UTF-8 character */
3831     memcpy(utf8_char, lastchar, c); /* Save the char */
3832     c |= 0x80; /* Flag c as a length */
3833     }
3834     else
3835     #endif
3836    
3837     /* Handle the case of a single byte - either with no UTF8 support, or
3838     with UTF-8 disabled, or for a UTF-8 character < 128. */
3839    
3840     {
3841     c = code[-1];
3842     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3843     }
3844    
3845 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3846     the line is something that cannot possibly match this character. If so,
3847     automatically possessifying this item gains some performance in the case
3848     where the match fails. */
3849    
3850     if (!possessive_quantifier &&
3851     repeat_max < 0 &&
3852     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3853     options, cd))
3854     {
3855     repeat_type = 0; /* Force greedy */
3856     possessive_quantifier = TRUE;
3857     }
3858    
3859 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3860     }
3861    
3862     /* If previous was a single negated character ([^a] or similar), we use
3863     one of the special opcodes, replacing it. The code is shared with single-
3864     character repeats by setting op_type to add a suitable offset into
3865 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3866     currently used only for single-byte chars. */
3867 nigel 77
3868     else if (*previous == OP_NOT)
3869     {
3870     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3871     c = previous[1];
3872 nigel 93 if (!possessive_quantifier &&
3873     repeat_max < 0 &&
3874     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3875     {
3876     repeat_type = 0; /* Force greedy */
3877     possessive_quantifier = TRUE;
3878     }
3879 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3880     }
3881    
3882     /* If previous was a character type match (\d or similar), abolish it and
3883     create a suitable repeat item. The code is shared with single-character
3884     repeats by setting op_type to add a suitable offset into repeat_type. Note
3885     that the Unicode property types will be present only when SUPPORT_UCP is
3886     defined, but we don't wrap the little bits of code here because it just
3887     makes it horribly messy. */
3888    
3889     else if (*previous < OP_EODN)
3890     {
3891     uschar *oldcode;
3892 nigel 87 int prop_type, prop_value;
3893 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3894     c = *previous;
3895    
3896 nigel 93 if (!possessive_quantifier &&
3897     repeat_max < 0 &&
3898     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3899     {
3900     repeat_type = 0; /* Force greedy */
3901     possessive_quantifier = TRUE;
3902     }
3903    
3904 nigel 77 OUTPUT_SINGLE_REPEAT:
3905 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3906     {
3907     prop_type = previous[1];
3908     prop_value = previous[2];
3909     }
3910     else prop_type = prop_value = -1;
3911 nigel 77
3912     oldcode = code;
3913     code = previous; /* Usually overwrite previous item */
3914    
3915     /* If the maximum is zero then the minimum must also be zero; Perl allows
3916     this case, so we do too - by simply omitting the item altogether. */
3917    
3918     if (repeat_max == 0) goto END_REPEAT;
3919    
3920 ph10 461 /*--------------------------------------------------------------------*/
3921 ph10 426 /* This code is obsolete from release 8.00; the restriction was finally
3922     removed: */
3923 ph10 461
3924 nigel 77 /* All real repeats make it impossible to handle partial matching (maybe
3925     one day we will be able to remove this restriction). */
3926 ph10 461
3927 ph10 426 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3928 ph10 461 /*--------------------------------------------------------------------*/
3929 nigel 77
3930     /* Combine the op_type with the repeat_type */
3931    
3932     repeat_type += op_type;
3933    
3934     /* A minimum of zero is handled either as the special case * or ?, or as
3935     an UPTO, with the maximum given. */
3936    
3937     if (repeat_min == 0)
3938     {
3939     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3940     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3941     else
3942     {
3943     *code++ = OP_UPTO + repeat_type;
3944     PUT2INC(code, 0, repeat_max);
3945     }
3946     }
3947    
3948     /* A repeat minimum of 1 is optimized into some special cases. If the
3949 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3950 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3951     one less than the maximum. */
3952    
3953     else if (repeat_min == 1)
3954     {
3955