/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 457 - (hide annotations) (download)
Sat Oct 3 16:24:08 2009 UTC (4 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 219485 byte(s)
Allow duplicate names for same-numbered groups; forbid different names.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. Argument a is the
bitmap (an array of bytes) and b is the number of the bit to set. Both
arguments and the whole expansion are parenthesized so that expression
arguments such as c+1 select the intended byte and bit. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 ph10 392 in UTF-8 mode. */
104 ph10 391
105 ph10 392 static const short int escapes[] = {
106 ph10 391 0, 0,
107     0, 0,
108 ph10 392 0, 0,
109     0, 0,
110     0, 0,
111 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
112 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
115     -ESC_B, -ESC_C,
116     -ESC_D, -ESC_E,
117     0, -ESC_G,
118     -ESC_H, 0,
119     0, -ESC_K,
120 ph10 391 0, 0,
121 ph10 392 0, 0,
122 ph10 391 -ESC_P, -ESC_Q,
123     -ESC_R, -ESC_S,
124 ph10 392 0, 0,
125     -ESC_V, -ESC_W,
126     -ESC_X, 0,
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130 ph10 391 CHAR_GRAVE_ACCENT, 7,
131 ph10 392 -ESC_b, 0,
132     -ESC_d, ESC_e,
133 ph10 391 ESC_f, 0,
134     -ESC_h, 0,
135 ph10 392 0, -ESC_k,
136 ph10 391 0, 0,
137     ESC_n, 0,
138 ph10 392 -ESC_p, 0,
139     ESC_r, -ESC_s,
140 ph10 391 ESC_tee, 0,
141 ph10 392 -ESC_v, -ESC_w,
142     0, 0,
143 ph10 391 -ESC_z
144 nigel 77 };
145    
146 ph10 392 #else
147 ph10 391
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150 nigel 77 static const short int escapes[] = {
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 392 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 ph10 391 platforms. */
183 ph10 210
184     typedef struct verbitem {
185     int len;
186     int op;
187 ph10 211 } verbitem;
188 ph10 210
189 ph10 240 static const char verbnames[] =
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
198 ph10 327 static const verbitem verbs[] = {
199 ph10 240 { 6, OP_ACCEPT },
200     { 6, OP_COMMIT },
201     { 1, OP_FAIL },
202     { 4, OP_FAIL },
203     { 5, OP_PRUNE },
204     { 4, OP_SKIP },
205     { 4, OP_THEN }
206 ph10 210 };
207    
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
217 ph10 240 static const char posix_names[] =
218 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
223     static const uschar posix_name_lengths[] = {
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
236     static const int posix_class_maps[] = {
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
254 nigel 93 #define STRING(a) # a
255     #define XSTRING(s) STRING(s)
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
267 ph10 240 static const char error_texts[] =
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
345     /* 65 */
346     "different names for subpatterns of the same number are not allowed";
347 nigel 77
348    
349     /* Table to identify digits and hex digits. This is used when compiling
350     patterns. Note that the tables in chartables are dependent on the locale, and
351     may mark arbitrary characters as digits - but the PCRE compiling code expects
352     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
353     a private table here. It costs 256 bytes, but it is a lot faster than doing
354     character value tests (at least in some simple cases I timed), and in some
355     applications one wants PCRE to compile efficiently as well as match
356     efficiently.
357    
358     For convenience, we use the same bit definitions as in chartables:
359    
360     0x04 decimal digit
361     0x08 hexadecimal digit
362    
363     Then we can use ctype_digit and ctype_xdigit in the code. */
364    
365 ph10 392 #ifndef EBCDIC
366 ph10 391
367 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
368 ph10 391 UTF-8 mode. */
369    
370 nigel 77 static const unsigned char digitab[] =
371     {
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
375     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
378     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
379     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
380     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
384     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
404    
405 ph10 392 #else
406 ph10 391
407     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
408    
409 nigel 77 static const unsigned char digitab[] =
410     {
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
414     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
422 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
423 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
427     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
431     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
435     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
441     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
442     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
443    
444     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
445     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
446     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
447     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
449     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
453     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
454     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
456 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
457 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
458     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
461     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
462     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
463     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
464     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
465     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
466     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
467     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
468     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
469     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
471     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
472     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
473     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
474     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
475     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
476     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
477     #endif
478    
479    
480     /* Definition to allow mutual recursion */
481    
482     static BOOL
483 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
484 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
485 nigel 77
486    
487    
488     /*************************************************
489 ph10 240 * Find an error text *
490     *************************************************/
491    
492 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
493     some of the text is of unknown length, we can't use a table of offsets.
494     Instead, just count through the strings. This is not a performance issue
495 ph10 240 because it happens only when there has been a compilation error.
496    
497     Argument: the error number
498     Returns: pointer to the error string
499     */
500    
501     static const char *
502     find_error_text(int n)
503     {
504     const char *s = error_texts;
505 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
506 ph10 240 return s;
507     }
508    
509    
510     /*************************************************
511 nigel 77 * Handle escapes *
512     *************************************************/
513    
514     /* This function is called when a \ has been encountered. It either returns a
515     positive value for a simple escape such as \n, or a negative value which
516 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
517     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
518     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
519     ptr is pointing at the \. On exit, it is on the final character of the escape
520     sequence.
521 nigel 77
522     Arguments:
523     ptrptr points to the pattern position pointer
524     errorcodeptr points to the errorcode variable
525     bracount number of previous extracting brackets
526     options the options bits
527     isclass TRUE if inside a character class
528    
529     Returns: zero or positive => a data character
530     negative => a special escape sequence
531 ph10 213 on error, errorcodeptr is set
532 nigel 77 */
533    
534     static int
535     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
536     int options, BOOL isclass)
537     {
538 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
539     const uschar *ptr = *ptrptr + 1;
540 nigel 77 int c, i;
541    
542 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
543     ptr--; /* Set pointer back to the last byte */
544    
545 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
546    
547     if (c == 0) *errorcodeptr = ERR1;
548    
549 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
550     in a table. A non-zero result is something that can be returned immediately.
551 nigel 77 Otherwise further processing may be required. */
552    
553 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
554     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
555     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
556 nigel 77
557 ph10 97 #else /* EBCDIC coding */
558 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
559 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
560     #endif
561    
562     /* Escapes that need further processing, or are illegal. */
563    
564     else
565     {
566     const uschar *oldptr;
567 nigel 93 BOOL braced, negated;
568    
569 nigel 77 switch (c)
570     {
571     /* A number of Perl escapes are not handled by PCRE. We give an explicit
572     error. */
573    
574 ph10 391 case CHAR_l:
575     case CHAR_L:
576     case CHAR_N:
577     case CHAR_u:
578     case CHAR_U:
579 nigel 77 *errorcodeptr = ERR37;
580     break;
581    
582 ph10 333 /* \g must be followed by one of a number of specific things:
583 ph10 345
584 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
585     backreference. If negative, it is a relative backreference. This is a Perl
586     5.10 feature.
587 ph10 345
588 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
589     is part of Perl's movement towards a unified syntax for back references. As
590     this is synonymous with \k{name}, we fudge it up by pretending it really
591     was \k.
592 ph10 345
593     (3) For Oniguruma compatibility we also support \g followed by a name or a
594     number either in angle brackets or in single quotes. However, these are
595     (possibly recursive) subroutine calls, _not_ backreferences. Just return
596 ph10 333 the -ESC_g code (cf \k). */
597 nigel 93
598 ph10 391 case CHAR_g:
599     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
600 ph10 333 {
601     c = -ESC_g;
602 ph10 345 break;
603     }
604 ph10 333
605     /* Handle the Perl-compatible cases */
606 ph10 345
607 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
608 nigel 93 {
609 ph10 171 const uschar *p;
610 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
611     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
612     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
613 ph10 171 {
614     c = -ESC_k;
615     break;
616 ph10 172 }
617 nigel 93 braced = TRUE;
618     ptr++;
619     }
620     else braced = FALSE;
621    
622 ph10 391 if (ptr[1] == CHAR_MINUS)
623 nigel 93 {
624     negated = TRUE;
625     ptr++;
626     }
627     else negated = FALSE;
628    
629     c = 0;
630     while ((digitab[ptr[1]] & ctype_digit) != 0)
631 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
632 ph10 220
633 ph10 333 if (c < 0) /* Integer overflow */
634 ph10 213 {
635     *errorcodeptr = ERR61;
636     break;
637 ph10 220 }
638 ph10 345
639 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
640 nigel 93 {
641     *errorcodeptr = ERR57;
642 ph10 213 break;
643 nigel 93 }
644 ph10 345
645 ph10 333 if (c == 0)
646     {
647     *errorcodeptr = ERR58;
648     break;
649 ph10 345 }
650 nigel 93
651     if (negated)
652     {
653     if (c > bracount)
654     {
655     *errorcodeptr = ERR15;
656 ph10 213 break;
657 nigel 93 }
658     c = bracount - (c - 1);
659     }
660    
661     c = -(ESC_REF + c);
662     break;
663    
664 nigel 77 /* The handling of escape sequences consisting of a string of digits
665     starting with one that is not zero is not straightforward. By experiment,
666     the way Perl works seems to be as follows:
667    
668     Outside a character class, the digits are read as a decimal number. If the
669     number is less than 10, or if there are that many previous extracting
670     left brackets, then it is a back reference. Otherwise, up to three octal
671     digits are read to form an escaped byte. Thus \123 is likely to be octal
672     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
673     value is greater than 377, the least significant 8 bits are taken. Inside a
674     character class, \ followed by a digit is always an octal number. */
675    
676 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
677     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
678 nigel 77
679     if (!isclass)
680     {
681     oldptr = ptr;
682 ph10 391 c -= CHAR_0;
683 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
684 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
685 ph10 333 if (c < 0) /* Integer overflow */
686 ph10 213 {
687     *errorcodeptr = ERR61;
688 ph10 220 break;
689     }
690 nigel 77 if (c < 10 || c <= bracount)
691     {
692     c = -(ESC_REF + c);
693     break;
694     }
695     ptr = oldptr; /* Put the pointer back and fall through */
696     }
697    
698     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
699     generates a binary zero byte and treats the digit as a following literal.
700     Thus we have to pull back the pointer by one. */
701    
702 ph10 391 if ((c = *ptr) >= CHAR_8)
703 nigel 77 {
704     ptr--;
705     c = 0;
706     break;
707     }
708    
709     /* \0 always starts an octal number, but we may drop through to here with a
710 nigel 91 larger first octal digit. The original code used just to take the least
711     significant 8 bits of octal numbers (I think this is what early Perls used
712     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
713     than 3 octal digits. */
714 nigel 77
715 ph10 391 case CHAR_0:
716     c -= CHAR_0;
717     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
718     c = c * 8 + *(++ptr) - CHAR_0;
719 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
720 nigel 77 break;
721    
722 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
723     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
724     treated as a data character. */
725 nigel 77
726 ph10 391 case CHAR_x:
727     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
728 nigel 77 {
729     const uschar *pt = ptr + 2;
730 nigel 87 int count = 0;
731    
732 nigel 77 c = 0;
733     while ((digitab[*pt] & ctype_xdigit) != 0)
734     {
735 nigel 87 register int cc = *pt++;
736 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
737 nigel 77 count++;
738 nigel 87
739 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
740     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
741     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
742 ph10 97 #else /* EBCDIC coding */
743 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
744     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
745 nigel 77 #endif
746     }
747 nigel 87
748 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
749 nigel 77 {
750 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
751 nigel 77 ptr = pt;
752     break;
753     }
754 nigel 87
755 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
756     recognize this construct; fall through to the normal \x handling. */
757     }
758    
759 nigel 87 /* Read just a single-byte hex-defined char */
760 nigel 77
761     c = 0;
762     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
763     {
764 ph10 391 int cc; /* Some compilers don't like */
765     cc = *(++ptr); /* ++ in initializers */
766     #ifndef EBCDIC /* ASCII/UTF-8 coding */
767     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
768     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
769 ph10 97 #else /* EBCDIC coding */
770 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
771     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
772 nigel 77 #endif
773     }
774     break;
775    
776 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
777     This coding is ASCII-specific, but then the whole concept of \cx is
778     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
779 nigel 77
780 ph10 391 case CHAR_c:
781 nigel 77 c = *(++ptr);
782     if (c == 0)
783     {
784     *errorcodeptr = ERR2;
785 ph10 213 break;
786 nigel 77 }
787    
788 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
789     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
790 nigel 77 c ^= 0x40;
791 ph10 97 #else /* EBCDIC coding */
792 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
793 nigel 77 c ^= 0xC0;
794     #endif
795     break;
796    
797     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
798 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
799     otherwise, for Perl compatibility, it is a literal. This code looks a bit
800     odd, but there used to be some cases other than the default, and there may
801     be again in future, so I haven't "optimized" it. */
802 nigel 77
803     default:
804     if ((options & PCRE_EXTRA) != 0) switch(c)
805     {
806     default:
807     *errorcodeptr = ERR3;
808     break;
809     }
810     break;
811     }
812     }
813    
814     *ptrptr = ptr;
815     return c;
816     }
817    
818    
819    
820     #ifdef SUPPORT_UCP
821     /*************************************************
822     * Handle \P and \p *
823     *************************************************/
824    
825     /* This function is called after \P or \p has been encountered, provided that
826     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
827     pointing at the P or p. On exit, it is pointing at the final character of the
828     escape sequence.
829    
830     Argument:
831     ptrptr points to the pattern position pointer
832     negptr points to a boolean that is set TRUE for negation else FALSE
833 nigel 87 dptr points to an int that is set to the detailed property value
834 nigel 77 errorcodeptr points to the error code variable
835    
836 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
837 nigel 77 */
838    
839     static int
840 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
841 nigel 77 {
842     int c, i, bot, top;
843     const uschar *ptr = *ptrptr;
844 nigel 87 char name[32];
845 nigel 77
846     c = *(++ptr);
847     if (c == 0) goto ERROR_RETURN;
848    
849     *negptr = FALSE;
850    
851 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
852     negation. */
853 nigel 77
854 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
855 nigel 77 {
856 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
857 nigel 77 {
858     *negptr = TRUE;
859     ptr++;
860     }
861 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
862 nigel 77 {
863     c = *(++ptr);
864     if (c == 0) goto ERROR_RETURN;
865 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
866 nigel 77 name[i] = c;
867     }
868 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
869 nigel 77 name[i] = 0;
870     }
871    
872     /* Otherwise there is just one following character */
873    
874     else
875     {
876     name[0] = c;
877     name[1] = 0;
878     }
879    
880     *ptrptr = ptr;
881    
882     /* Search for a recognized property name using binary chop */
883    
884     bot = 0;
885     top = _pcre_utt_size;
886    
887     while (bot < top)
888     {
889 nigel 87 i = (bot + top) >> 1;
890 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
891 nigel 87 if (c == 0)
892     {
893     *dptr = _pcre_utt[i].value;
894     return _pcre_utt[i].type;
895     }
896 nigel 77 if (c > 0) bot = i + 1; else top = i;
897     }
898    
899     *errorcodeptr = ERR47;
900     *ptrptr = ptr;
901     return -1;
902    
903     ERROR_RETURN:
904     *errorcodeptr = ERR46;
905     *ptrptr = ptr;
906     return -1;
907     }
908     #endif
909    
910    
911    
912    
913     /*************************************************
914     * Check for counted repeat *
915     *************************************************/
916    
917     /* This function is called when a '{' is encountered in a place where it might
918     start a quantifier. It looks ahead to see if it really is a quantifier or not.
919     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
920     where the ddds are digits.
921    
922     Arguments:
923     p pointer to the first char after '{'
924    
925     Returns: TRUE or FALSE
926     */
927    
928     static BOOL
929     is_counted_repeat(const uschar *p)
930     {
931     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
932     while ((digitab[*p] & ctype_digit) != 0) p++;
933 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
934 nigel 77
935 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
936     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937 nigel 77
938     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
939     while ((digitab[*p] & ctype_digit) != 0) p++;
940    
941 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
942 nigel 77 }
943    
944    
945    
946     /*************************************************
947     * Read repeat counts *
948     *************************************************/
949    
950     /* Read an item of the form {n,m} and return the values. This is called only
951     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
952     so the syntax is guaranteed to be correct, but we need to check the values.
953    
954     Arguments:
955     p pointer to first char after '{'
956     minp pointer to int for min
957     maxp pointer to int for max
958     returned as -1 if no max
959     errorcodeptr points to error code variable
960    
961     Returns: pointer to '}' on success;
962     current ptr on error, with errorcodeptr set non-zero
963     */
964    
965     static const uschar *
966     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
967     {
968     int min = 0;
969     int max = -1;
970    
971 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
972     an integer overflow. */
973    
974 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
975 nigel 81 if (min < 0 || min > 65535)
976     {
977     *errorcodeptr = ERR5;
978     return p;
979     }
980 nigel 77
981 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
982     Also, max must not be less than min. */
983    
984 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
985 nigel 77 {
986 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
987 nigel 77 {
988     max = 0;
989 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
990 nigel 81 if (max < 0 || max > 65535)
991     {
992     *errorcodeptr = ERR5;
993     return p;
994     }
995 nigel 77 if (max < min)
996     {
997     *errorcodeptr = ERR4;
998     return p;
999     }
1000     }
1001     }
1002    
1003 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1004     '}'. */
1005 nigel 77
1006 nigel 81 *minp = min;
1007     *maxp = max;
1008 nigel 77 return p;
1009     }
1010    
1011    
1012    
1013     /*************************************************
1014 ph10 408 * Subroutine for finding forward reference *
1015 nigel 91 *************************************************/
1016    
1017 ph10 408 /* This recursive function is called only from find_parens() below. The
1018     top-level call starts at the beginning of the pattern. All other calls must
1019     start at a parenthesis. It scans along a pattern's text looking for capturing
1020 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1021     name it is given, it returns its number. Alternatively, if the name is NULL, it
1022 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1023     encountered, the name will be terminated by '>' because that is checked in the
1024 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1025 ph10 408 capturing group numbers - the (?| feature.
1026 nigel 91
1027     Arguments:
1028 ph10 408 ptrptr address of the current character pointer (updated)
1029 ph10 345 cd compile background data
1030 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1031     lorn name length, or subpattern number if name is NULL
1032     xmode TRUE if we are in /x mode
1033 ph10 411 count pointer to the current capturing subpattern number (updated)
1034 nigel 91
1035     Returns: the number of the named subpattern, or -1 if not found
1036     */
1037    
1038     static int
1039 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1040     BOOL xmode, int *count)
1041 nigel 91 {
1042 ph10 408 uschar *ptr = *ptrptr;
1043     int start_count = *count;
1044     int hwm_count = start_count;
1045     BOOL dup_parens = FALSE;
1046 nigel 93
1047 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1048 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1049    
1050     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1051     {
1052     if (ptr[1] == CHAR_QUESTION_MARK &&
1053 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1054 ph10 408 {
1055     ptr += 3;
1056 ph10 411 dup_parens = TRUE;
1057     }
1058 ph10 408
1059     /* Handle a normal, unnamed capturing parenthesis */
1060 ph10 411
1061 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1062     {
1063     *count += 1;
1064     if (name == NULL && *count == lorn) return *count;
1065 ph10 411 ptr++;
1066 ph10 408 }
1067    
1068     /* Handle a condition. If it is an assertion, just carry on so that it
1069     is processed as normal. If not, skip to the closing parenthesis of the
1070 ph10 411 condition (there can't be any nested parens. */
1071    
1072 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1073     {
1074 ph10 411 ptr += 2;
1075 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1076     {
1077     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1078 ph10 411 if (*ptr != 0) ptr++;
1079 ph10 408 }
1080 ph10 411 }
1081    
1082 ph10 408 /* We have either (? or (* and not a condition */
1083    
1084     else
1085 ph10 411 {
1086 ph10 408 ptr += 2;
1087     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1088    
1089     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1090 ph10 411
1091 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1092     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1093     {
1094     int term;
1095     const uschar *thisname;
1096     *count += 1;
1097     if (name == NULL && *count == lorn) return *count;
1098     term = *ptr++;
1099     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1100     thisname = ptr;
1101     while (*ptr != term) ptr++;
1102     if (name != NULL && lorn == ptr - thisname &&
1103     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1104     return *count;
1105 ph10 438 term++;
1106 ph10 411 }
1107 ph10 408 }
1108 ph10 411 }
1109 ph10 408
1110 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1111 ph10 408 bars. */
1112    
1113 nigel 91 for (; *ptr != 0; ptr++)
1114     {
1115 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1116    
1117 ph10 391 if (*ptr == CHAR_BACKSLASH)
1118 nigel 93 {
1119 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1120 ph10 391 if (*ptr == CHAR_Q) for (;;)
1121 nigel 93 {
1122 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1123 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1124 ph10 391 if (*(++ptr) == CHAR_E) break;
1125 nigel 93 }
1126     continue;
1127     }
1128    
1129 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1130     are handled for real. If the first character is '^', skip it. Also, if the
1131     first few characters (either before or after ^) are \Q\E or \E we skip them
1132 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1133 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1134 nigel 93
1135 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1136 nigel 93 {
1137 ph10 340 BOOL negate_class = FALSE;
1138     for (;;)
1139     {
1140 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1141 ph10 340 {
1142 ph10 438 if (ptr[2] == CHAR_E)
1143     ptr+= 2;
1144     else if (strncmp((const char *)ptr+2,
1145 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1146 ph10 438 ptr += 4;
1147 ph10 392 else
1148 ph10 391 break;
1149 ph10 340 }
1150 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1151     {
1152 ph10 340 negate_class = TRUE;
1153 ph10 438 ptr++;
1154     }
1155 ph10 340 else break;
1156     }
1157    
1158     /* If the next character is ']', it is a data character that must be
1159 ph10 341 skipped, except in JavaScript compatibility mode. */
1160 ph10 345
1161 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1162 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1163 ph10 345 ptr++;
1164    
1165 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1166 nigel 93 {
1167 ph10 220 if (*ptr == 0) return -1;
1168 ph10 391 if (*ptr == CHAR_BACKSLASH)
1169 nigel 93 {
1170 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1171 ph10 391 if (*ptr == CHAR_Q) for (;;)
1172 nigel 93 {
1173 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1174 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1175 ph10 391 if (*(++ptr) == CHAR_E) break;
1176 nigel 93 }
1177     continue;
1178     }
1179     }
1180     continue;
1181     }
1182    
1183     /* Skip comments in /x mode */
1184    
1185 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1186 nigel 93 {
1187 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1188 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1189 nigel 93 continue;
1190     }
1191    
1192 ph10 408 /* Check for the special metacharacters */
1193 ph10 411
1194 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1195 nigel 93 {
1196 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1197     if (rc > 0) return rc;
1198     if (*ptr == 0) goto FAIL_EXIT;
1199 nigel 93 }
1200 ph10 411
1201 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1202     {
1203     if (dup_parens && *count < hwm_count) *count = hwm_count;
1204     *ptrptr = ptr;
1205     return -1;
1206     }
1207 ph10 411
1208     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1209 ph10 408 {
1210     if (*count > hwm_count) hwm_count = *count;
1211     *count = start_count;
1212 ph10 411 }
1213 ph10 408 }
1214 nigel 93
1215 ph10 408 FAIL_EXIT:
1216     *ptrptr = ptr;
1217     return -1;
1218     }
1219 nigel 93
1220    
1221    
1222    
1223 ph10 408 /*************************************************
1224     * Find forward referenced subpattern *
1225     *************************************************/
1226 nigel 93
1227 ph10 408 /* This function scans along a pattern's text looking for capturing
1228     subpatterns, and counting them. If it finds a named pattern that matches the
1229     name it is given, it returns its number. Alternatively, if the name is NULL, it
1230     returns when it reaches a given numbered subpattern. This is used for forward
1231     references to subpatterns. We used to be able to start this scan from the
1232     current compiling point, using the current count value from cd->bracount, and
1233     do it all in a single loop, but the addition of the possibility of duplicate
1234     subpattern numbers means that we have to scan from the very start, in order to
1235     take account of such duplicates, and to use a recursive function to keep track
1236     of the different types of group.
1237    
1238     Arguments:
1239     cd compile background data
1240     name name to seek, or NULL if seeking a numbered subpattern
1241     lorn name length, or subpattern number if name is NULL
1242     xmode TRUE if we are in /x mode
1243    
1244     Returns: the number of the found subpattern, or -1 if not found
1245     */
1246    
1247     static int
1248     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1249     {
1250     uschar *ptr = (uschar *)cd->start_pattern;
1251     int count = 0;
1252     int rc;
1253    
1254     /* If the pattern does not start with an opening parenthesis, the first call
1255     to find_parens_sub() will scan right to the end (if necessary). However, if it
1256     does start with a parenthesis, find_parens_sub() will return when it hits the
1257     matching closing parens. That is why we have to have a loop. */
1258    
1259 ph10 411 for (;;)
1260     {
1261 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1262 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1263     }
1264    
1265 ph10 408 return rc;
1266 nigel 91 }
1267    
1268    
1269    
1270 ph10 408
1271 nigel 91 /*************************************************
1272 nigel 77 * Find first significant op code *
1273     *************************************************/
1274    
1275     /* This is called by several functions that scan a compiled expression looking
1276     for a fixed first character, or an anchoring op code etc. It skips over things
1277     that do not influence this. For some calls, a change of option is important.
1278     For some calls, it makes sense to skip negative forward and all backward
1279     assertions, and also the \b assertion; for others it does not.
1280    
1281     Arguments:
1282     code pointer to the start of the group
1283     options pointer to external options
1284     optbit the option bit whose changing is significant, or
1285     zero if none are
1286     skipassert TRUE if certain assertions are to be skipped
1287    
1288     Returns: pointer to the first significant opcode
1289     */
1290    
1291     static const uschar*
1292     first_significant_code(const uschar *code, int *options, int optbit,
1293     BOOL skipassert)
1294     {
1295     for (;;)
1296     {
1297     switch ((int)*code)
1298     {
1299     case OP_OPT:
1300     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1301     *options = (int)code[1];
1302     code += 2;
1303     break;
1304    
1305     case OP_ASSERT_NOT:
1306     case OP_ASSERTBACK:
1307     case OP_ASSERTBACK_NOT:
1308     if (!skipassert) return code;
1309     do code += GET(code, 1); while (*code == OP_ALT);
1310     code += _pcre_OP_lengths[*code];
1311     break;
1312    
1313     case OP_WORD_BOUNDARY:
1314     case OP_NOT_WORD_BOUNDARY:
1315     if (!skipassert) return code;
1316     /* Fall through */
1317    
1318     case OP_CALLOUT:
1319     case OP_CREF:
1320 nigel 93 case OP_RREF:
1321     case OP_DEF:
1322 nigel 77 code += _pcre_OP_lengths[*code];
1323     break;
1324    
1325     default:
1326     return code;
1327     }
1328     }
1329     /* Control never reaches here */
1330     }
1331    
1332    
1333    
1334    
1335     /*************************************************
1336 ph10 454 * Find the fixed length of a branch *
1337 nigel 77 *************************************************/
1338    
1339 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1340 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1341 ph10 454 In UTF8 mode, the result is in characters rather than bytes. The branch is
1342     temporarily terminated with OP_END when this function is called.
1343 nigel 77
1344 ph10 454 This function is called when a backward assertion is encountered, so that if it
1345     fails, the error message can point to the correct place in the pattern.
1346     However, we cannot do this when the assertion contains subroutine calls,
1347     because they can be forward references. We solve this by remembering this case
1348     and doing the check at the end; a flag specifies which mode we are running in.
1349    
1350 nigel 77 Arguments:
1351     code points to the start of the pattern (the bracket)
1352     options the compiling options
1353 ph10 454 atend TRUE if called when the pattern is complete
1354     cd the "compile data" structure
1355 nigel 77
1356 ph10 454 Returns: the fixed length,
1357     or -1 if there is no fixed length,
1358 nigel 77 or -2 if \C was encountered
1359 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1360 nigel 77 */
1361    
1362     static int
1363 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1364 nigel 77 {
1365     int length = -1;
1366    
1367     register int branchlength = 0;
1368     register uschar *cc = code + 1 + LINK_SIZE;
1369    
1370     /* Scan along the opcodes for this branch. If we get to the end of the
1371     branch, check the length against that of the other branches. */
1372    
1373     for (;;)
1374     {
1375     int d;
1376 ph10 454 uschar *ce, *cs;
1377 nigel 77 register int op = *cc;
1378     switch (op)
1379     {
1380 nigel 93 case OP_CBRA:
1381 nigel 77 case OP_BRA:
1382     case OP_ONCE:
1383     case OP_COND:
1384 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1385 nigel 77 if (d < 0) return d;
1386     branchlength += d;
1387     do cc += GET(cc, 1); while (*cc == OP_ALT);
1388     cc += 1 + LINK_SIZE;
1389     break;
1390    
1391     /* Reached end of a branch; if it's a ket it is the end of a nested
1392     call. If it's ALT it is an alternation in a nested call. If it is
1393     END it's the end of the outer call. All can be handled by the same code. */
1394    
1395     case OP_ALT:
1396     case OP_KET:
1397     case OP_KETRMAX:
1398     case OP_KETRMIN:
1399     case OP_END:
1400     if (length < 0) length = branchlength;
1401     else if (length != branchlength) return -1;
1402     if (*cc != OP_ALT) return length;
1403     cc += 1 + LINK_SIZE;
1404     branchlength = 0;
1405     break;
1406 ph10 454
1407     /* A true recursion implies not fixed length, but a subroutine call may
1408     be OK. If the subroutine is a forward reference, we can't deal with
1409     it until the end of the pattern, so return -3. */
1410    
1411     case OP_RECURSE:
1412     if (!atend) return -3;
1413     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1414     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1415     if (cc > cs && cc < ce) return -1; /* Recursion */
1416     d = find_fixedlength(cs + 2, options, atend, cd);
1417     if (d < 0) return d;
1418     branchlength += d;
1419     cc += 1 + LINK_SIZE;
1420     break;
1421 nigel 77
1422     /* Skip over assertive subpatterns */
1423    
1424     case OP_ASSERT:
1425     case OP_ASSERT_NOT:
1426     case OP_ASSERTBACK:
1427     case OP_ASSERTBACK_NOT:
1428     do cc += GET(cc, 1); while (*cc == OP_ALT);
1429     /* Fall through */
1430    
1431     /* Skip over things that don't match chars */
1432    
1433     case OP_REVERSE:
1434     case OP_CREF:
1435 nigel 93 case OP_RREF:
1436     case OP_DEF:
1437 nigel 77 case OP_OPT:
1438     case OP_CALLOUT:
1439     case OP_SOD:
1440     case OP_SOM:
1441     case OP_EOD:
1442     case OP_EODN:
1443     case OP_CIRC:
1444     case OP_DOLL:
1445     case OP_NOT_WORD_BOUNDARY:
1446     case OP_WORD_BOUNDARY:
1447     cc += _pcre_OP_lengths[*cc];
1448     break;
1449    
1450     /* Handle literal characters */
1451    
1452     case OP_CHAR:
1453     case OP_CHARNC:
1454 nigel 91 case OP_NOT:
1455 nigel 77 branchlength++;
1456     cc += 2;
1457     #ifdef SUPPORT_UTF8
1458 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1459     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1460 nigel 77 #endif
1461     break;
1462    
1463     /* Handle exact repetitions. The count is already in characters, but we
1464     need to skip over a multibyte character in UTF8 mode. */
1465    
1466     case OP_EXACT:
1467     branchlength += GET2(cc,1);
1468     cc += 4;
1469     #ifdef SUPPORT_UTF8
1470 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1471     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1472 nigel 77 #endif
1473     break;
1474    
1475     case OP_TYPEEXACT:
1476     branchlength += GET2(cc,1);
1477 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1478 nigel 77 cc += 4;
1479     break;
1480    
1481     /* Handle single-char matchers */
1482    
1483     case OP_PROP:
1484     case OP_NOTPROP:
1485 nigel 87 cc += 2;
1486 nigel 77 /* Fall through */
1487    
1488     case OP_NOT_DIGIT:
1489     case OP_DIGIT:
1490     case OP_NOT_WHITESPACE:
1491     case OP_WHITESPACE:
1492     case OP_NOT_WORDCHAR:
1493     case OP_WORDCHAR:
1494     case OP_ANY:
1495 ph10 342 case OP_ALLANY:
1496 nigel 77 branchlength++;
1497     cc++;
1498     break;
1499    
1500     /* The single-byte matcher isn't allowed */
1501    
1502     case OP_ANYBYTE:
1503     return -2;
1504    
1505     /* Check a class for variable quantification */
1506    
1507     #ifdef SUPPORT_UTF8
1508     case OP_XCLASS:
1509     cc += GET(cc, 1) - 33;
1510     /* Fall through */
1511     #endif
1512    
1513     case OP_CLASS:
1514     case OP_NCLASS:
1515     cc += 33;
1516    
1517     switch (*cc)
1518     {
1519     case OP_CRSTAR:
1520     case OP_CRMINSTAR:
1521     case OP_CRQUERY:
1522     case OP_CRMINQUERY:
1523     return -1;
1524    
1525     case OP_CRRANGE:
1526     case OP_CRMINRANGE:
1527     if (GET2(cc,1) != GET2(cc,3)) return -1;
1528     branchlength += GET2(cc,1);
1529     cc += 5;
1530     break;
1531    
1532     default:
1533     branchlength++;
1534     }
1535     break;
1536    
1537     /* Anything else is variable length */
1538    
1539     default:
1540     return -1;
1541     }
1542     }
1543     /* Control never gets here */
1544     }
1545    
1546    
1547    
1548    
1549     /*************************************************
1550 ph10 454 * Scan compiled regex for specific bracket *
1551 nigel 77 *************************************************/
1552    
1553     /* This little function scans through a compiled pattern until it finds a
1554 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1555 ph10 455 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1556     so that it can be called from pcre_study() when finding the minimum matching
1557     length.
1558 nigel 77
1559     Arguments:
1560     code points to start of expression
1561     utf8 TRUE in UTF-8 mode
1562 ph10 454 number the required bracket number or negative to find a lookbehind
1563 nigel 77
1564     Returns: pointer to the opcode for the bracket, or NULL if not found
1565     */
1566    
1567 ph10 455 const uschar *
1568     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1569 nigel 77 {
1570     for (;;)
1571     {
1572     register int c = *code;
1573     if (c == OP_END) return NULL;
1574 nigel 91
1575     /* XCLASS is used for classes that cannot be represented just by a bit
1576     map. This includes negated single high-valued characters. The length in
1577     the table is zero; the actual length is stored in the compiled code. */
1578    
1579     if (c == OP_XCLASS) code += GET(code, 1);
1580 ph10 454
1581     /* Handle recursion */
1582    
1583     else if (c == OP_REVERSE)
1584     {
1585     if (number < 0) return (uschar *)code;
1586     code += _pcre_OP_lengths[c];
1587     }
1588 nigel 91
1589 nigel 93 /* Handle capturing bracket */
1590 nigel 91
1591 nigel 93 else if (c == OP_CBRA)
1592 nigel 77 {
1593 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1594 nigel 77 if (n == number) return (uschar *)code;
1595 nigel 93 code += _pcre_OP_lengths[c];
1596 nigel 77 }
1597 nigel 91
1598 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1599     repeated character types, we have to test for \p and \P, which have an extra
1600 ph10 218 two bytes of parameters. */
1601 nigel 91
1602 nigel 77 else
1603     {
1604 ph10 218 switch(c)
1605     {
1606     case OP_TYPESTAR:
1607     case OP_TYPEMINSTAR:
1608     case OP_TYPEPLUS:
1609     case OP_TYPEMINPLUS:
1610     case OP_TYPEQUERY:
1611     case OP_TYPEMINQUERY:
1612     case OP_TYPEPOSSTAR:
1613     case OP_TYPEPOSPLUS:
1614     case OP_TYPEPOSQUERY:
1615     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1616 ph10 220 break;
1617 ph10 221
1618     case OP_TYPEUPTO:
1619     case OP_TYPEMINUPTO:
1620     case OP_TYPEEXACT:
1621     case OP_TYPEPOSUPTO:
1622     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1623     break;
1624 ph10 220 }
1625    
1626 ph10 218 /* Add in the fixed length from the table */
1627 ph10 220
1628 nigel 77 code += _pcre_OP_lengths[c];
1629 ph10 220
1630 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1631     a multi-byte character. The length in the table is a minimum, so we have to
1632     arrange to skip the extra bytes. */
1633 ph10 220
1634 ph10 107 #ifdef SUPPORT_UTF8
1635 nigel 77 if (utf8) switch(c)
1636     {
1637     case OP_CHAR:
1638     case OP_CHARNC:
1639     case OP_EXACT:
1640     case OP_UPTO:
1641     case OP_MINUPTO:
1642 nigel 93 case OP_POSUPTO:
1643 nigel 77 case OP_STAR:
1644     case OP_MINSTAR:
1645 nigel 93 case OP_POSSTAR:
1646 nigel 77 case OP_PLUS:
1647     case OP_MINPLUS:
1648 nigel 93 case OP_POSPLUS:
1649 nigel 77 case OP_QUERY:
1650     case OP_MINQUERY:
1651 nigel 93 case OP_POSQUERY:
1652     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1653 nigel 77 break;
1654     }
1655 ph10 369 #else
1656     (void)(utf8); /* Keep compiler happy by referencing function argument */
1657 ph10 111 #endif
1658 nigel 77 }
1659     }
1660     }
1661    
1662    
1663    
1664     /*************************************************
1665     * Scan compiled regex for recursion reference *
1666     *************************************************/
1667    
1668     /* This little function scans through a compiled pattern until it finds an
1669     instance of OP_RECURSE.
1670    
1671     Arguments:
1672     code points to start of expression
1673     utf8 TRUE in UTF-8 mode
1674    
1675     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1676     */
1677    
1678     static const uschar *
1679     find_recurse(const uschar *code, BOOL utf8)
1680     {
1681     for (;;)
1682     {
1683     register int c = *code;
1684     if (c == OP_END) return NULL;
1685 nigel 91 if (c == OP_RECURSE) return code;
1686 ph10 220
1687 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1688     map. This includes negated single high-valued characters. The length in
1689     the table is zero; the actual length is stored in the compiled code. */
1690    
1691     if (c == OP_XCLASS) code += GET(code, 1);
1692    
1693 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1694     repeated character types, we have to test for \p and \P, which have an extra
1695 ph10 218 two bytes of parameters. */
1696 nigel 91
1697 nigel 77 else
1698     {
1699 ph10 218 switch(c)
1700     {
1701     case OP_TYPESTAR:
1702     case OP_TYPEMINSTAR:
1703     case OP_TYPEPLUS:
1704     case OP_TYPEMINPLUS:
1705     case OP_TYPEQUERY:
1706     case OP_TYPEMINQUERY:
1707     case OP_TYPEPOSSTAR:
1708     case OP_TYPEPOSPLUS:
1709     case OP_TYPEPOSQUERY:
1710     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1711 ph10 220 break;
1712 ph10 221
1713     case OP_TYPEPOSUPTO:
1714     case OP_TYPEUPTO:
1715     case OP_TYPEMINUPTO:
1716     case OP_TYPEEXACT:
1717     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1718     break;
1719 ph10 220 }
1720    
1721 ph10 218 /* Add in the fixed length from the table */
1722    
1723 nigel 77 code += _pcre_OP_lengths[c];
1724 ph10 220
1725 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1726     by a multi-byte character. The length in the table is a minimum, so we have
1727     to arrange to skip the extra bytes. */
1728 ph10 220
1729 ph10 107 #ifdef SUPPORT_UTF8
1730 nigel 77 if (utf8) switch(c)
1731     {
1732     case OP_CHAR:
1733     case OP_CHARNC:
1734     case OP_EXACT:
1735     case OP_UPTO:
1736     case OP_MINUPTO:
1737 nigel 93 case OP_POSUPTO:
1738 nigel 77 case OP_STAR:
1739     case OP_MINSTAR:
1740 nigel 93 case OP_POSSTAR:
1741 nigel 77 case OP_PLUS:
1742     case OP_MINPLUS:
1743 nigel 93 case OP_POSPLUS:
1744 nigel 77 case OP_QUERY:
1745     case OP_MINQUERY:
1746 nigel 93 case OP_POSQUERY:
1747     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1748 nigel 77 break;
1749     }
1750 ph10 369 #else
1751     (void)(utf8); /* Keep compiler happy by referencing function argument */
1752 ph10 111 #endif
1753 nigel 77 }
1754     }
1755     }
1756    
1757    
1758    
1759     /*************************************************
1760     * Scan compiled branch for non-emptiness *
1761     *************************************************/
1762    
1763     /* This function scans through a branch of a compiled pattern to see whether it
1764 nigel 93 can match the empty string or not. It is called from could_be_empty()
1765     below and from compile_branch() when checking for an unlimited repeat of a
1766     group that can match nothing. Note that first_significant_code() skips over
1767 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1768     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1769     bracket whose current branch will already have been scanned.
1770 nigel 77
1771     Arguments:
1772     code points to start of search
1773     endcode points to where to stop
1774     utf8 TRUE if in UTF8 mode
1775    
1776     Returns: TRUE if what is matched could be empty
1777     */
1778    
1779     static BOOL
1780     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1781     {
1782     register int c;
1783 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1784 nigel 77 code < endcode;
1785     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1786     {
1787     const uschar *ccode;
1788    
1789     c = *code;
1790 ph10 286
1791     /* Skip over forward assertions; the other assertions are skipped by
1792 ph10 282 first_significant_code() with a TRUE final argument. */
1793 ph10 286
1794 ph10 282 if (c == OP_ASSERT)
1795 ph10 286 {
1796 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1797     c = *code;
1798     continue;
1799 ph10 286 }
1800 ph10 172
1801 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1802 nigel 77
1803 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1804 ph10 170 {
1805 ph10 172 code += _pcre_OP_lengths[c];
1806 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1807     c = *code;
1808     continue;
1809     }
1810    
1811     /* For other groups, scan the branches. */
1812 ph10 172
1813 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1814 nigel 77 {
1815     BOOL empty_branch;
1816     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1817 ph10 406
1818     /* If a conditional group has only one branch, there is a second, implied,
1819 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1820     Otherwise, scan the individual branches of the group. */
1821 ph10 406
1822 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1823 nigel 77 code += GET(code, 1);
1824 ph10 395 else
1825 ph10 406 {
1826 ph10 395 empty_branch = FALSE;
1827     do
1828     {
1829     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1830     empty_branch = TRUE;
1831     code += GET(code, 1);
1832     }
1833     while (*code == OP_ALT);
1834     if (!empty_branch) return FALSE; /* All branches are non-empty */
1835 nigel 77 }
1836 ph10 406
1837 ph10 172 c = *code;
1838 nigel 93 continue;
1839 nigel 77 }
1840    
1841 nigel 93 /* Handle the other opcodes */
1842    
1843     switch (c)
1844 nigel 77 {
1845 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1846     cannot be represented just by a bit map. This includes negated single
1847     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1848 ph10 220 actual length is stored in the compiled code, so we must update "code"
1849 ph10 216 here. */
1850 nigel 77
1851     #ifdef SUPPORT_UTF8
1852     case OP_XCLASS:
1853 ph10 216 ccode = code += GET(code, 1);
1854 nigel 77 goto CHECK_CLASS_REPEAT;
1855     #endif
1856    
1857     case OP_CLASS:
1858     case OP_NCLASS:
1859     ccode = code + 33;
1860    
1861     #ifdef SUPPORT_UTF8
1862     CHECK_CLASS_REPEAT:
1863     #endif
1864    
1865     switch (*ccode)
1866     {
1867     case OP_CRSTAR: /* These could be empty; continue */
1868     case OP_CRMINSTAR:
1869     case OP_CRQUERY:
1870     case OP_CRMINQUERY:
1871     break;
1872    
1873     default: /* Non-repeat => class must match */
1874     case OP_CRPLUS: /* These repeats aren't empty */
1875     case OP_CRMINPLUS:
1876     return FALSE;
1877    
1878     case OP_CRRANGE:
1879     case OP_CRMINRANGE:
1880     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1881     break;
1882     }
1883     break;
1884    
1885     /* Opcodes that must match a character */
1886    
1887     case OP_PROP:
1888     case OP_NOTPROP:
1889     case OP_EXTUNI:
1890     case OP_NOT_DIGIT:
1891     case OP_DIGIT:
1892     case OP_NOT_WHITESPACE:
1893     case OP_WHITESPACE:
1894     case OP_NOT_WORDCHAR:
1895     case OP_WORDCHAR:
1896     case OP_ANY:
1897 ph10 345 case OP_ALLANY:
1898 nigel 77 case OP_ANYBYTE:
1899     case OP_CHAR:
1900     case OP_CHARNC:
1901     case OP_NOT:
1902     case OP_PLUS:
1903     case OP_MINPLUS:
1904 nigel 93 case OP_POSPLUS:
1905 nigel 77 case OP_EXACT:
1906     case OP_NOTPLUS:
1907     case OP_NOTMINPLUS:
1908 nigel 93 case OP_NOTPOSPLUS:
1909 nigel 77 case OP_NOTEXACT:
1910     case OP_TYPEPLUS:
1911     case OP_TYPEMINPLUS:
1912 nigel 93 case OP_TYPEPOSPLUS:
1913 nigel 77 case OP_TYPEEXACT:
1914     return FALSE;
1915 ph10 227
1916     /* These are going to continue, as they may be empty, but we have to
1917     fudge the length for the \p and \P cases. */
1918    
1919 ph10 224 case OP_TYPESTAR:
1920     case OP_TYPEMINSTAR:
1921     case OP_TYPEPOSSTAR:
1922     case OP_TYPEQUERY:
1923     case OP_TYPEMINQUERY:
1924     case OP_TYPEPOSQUERY:
1925     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1926 ph10 227 break;
1927    
1928 ph10 224 /* Same for these */
1929 ph10 227
1930 ph10 224 case OP_TYPEUPTO:
1931     case OP_TYPEMINUPTO:
1932     case OP_TYPEPOSUPTO:
1933     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1934     break;
1935 nigel 77
1936     /* End of branch */
1937    
1938     case OP_KET:
1939     case OP_KETRMAX:
1940     case OP_KETRMIN:
1941     case OP_ALT:
1942     return TRUE;
1943    
1944 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1945     MINUPTO, and POSUPTO may be followed by a multibyte character */
1946 nigel 77
1947     #ifdef SUPPORT_UTF8
1948     case OP_STAR:
1949     case OP_MINSTAR:
1950 nigel 93 case OP_POSSTAR:
1951 nigel 77 case OP_QUERY:
1952     case OP_MINQUERY:
1953 nigel 93 case OP_POSQUERY:
1954 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1955     break;
1956    
1957 nigel 77 case OP_UPTO:
1958     case OP_MINUPTO:
1959 nigel 93 case OP_POSUPTO:
1960 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1961 nigel 77 break;
1962     #endif
1963     }
1964     }
1965    
1966     return TRUE;
1967     }
1968    
1969    
1970    
1971     /*************************************************
1972     * Scan compiled regex for non-emptiness *
1973     *************************************************/
1974    
1975     /* This function is called to check for left recursive calls. We want to check
1976     the current branch of the current pattern to see if it could match the empty
1977     string. If it could, we must look outwards for branches at other levels,
1978     stopping when we pass beyond the bracket which is the subject of the recursion.
1979    
1980     Arguments:
1981     code points to start of the recursion
1982     endcode points to where to stop (current RECURSE item)
1983     bcptr points to the chain of current (unclosed) branch starts
1984     utf8 TRUE if in UTF-8 mode
1985    
1986     Returns: TRUE if what is matched could be empty
1987     */
1988    
1989     static BOOL
1990     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1991     BOOL utf8)
1992     {
1993     while (bcptr != NULL && bcptr->current >= code)
1994     {
1995     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1996     bcptr = bcptr->outer;
1997     }
1998     return TRUE;
1999     }
2000    
2001    
2002    
2003     /*************************************************
2004     * Check for POSIX class syntax *
2005     *************************************************/
2006    
2007     /* This function is called when the sequence "[:" or "[." or "[=" is
2008 ph10 295 encountered in a character class. It checks whether this is followed by a
2009 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2010 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2011 nigel 77
2012 ph10 298 Originally, this function only recognized a sequence of letters between the
2013     terminators, but it seems that Perl recognizes any sequence of characters,
2014     though of course unknown POSIX names are subsequently rejected. Perl gives an
2015     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2016     didn't consider this to be a POSIX class. Likewise for [:1234:].
2017 ph10 295
2018 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2019     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2020     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2021     below handles the special case of \], but does not try to do any other escape
2022     processing. This makes it different from Perl for cases such as [:l\ower:]
2023 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2024 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2025 ph10 295 I think.
2026    
2027     Arguments:
2028 nigel 77 ptr pointer to the initial [
2029     endptr where to return the end pointer
2030    
2031     Returns: TRUE or FALSE
2032     */
2033    
2034     static BOOL
2035 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2036 nigel 77 {
2037     int terminator; /* Don't combine these lines; the Solaris cc */
2038     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2039 ph10 295 for (++ptr; *ptr != 0; ptr++)
2040 nigel 77 {
2041 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2042 ph10 298 {
2043 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2044     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2045 ph10 295 {
2046     *endptr = ptr;
2047     return TRUE;
2048 ph10 298 }
2049     }
2050     }
2051 nigel 77 return FALSE;
2052     }
2053    
2054    
2055    
2056    
2057     /*************************************************
2058     * Check POSIX class name *
2059     *************************************************/
2060    
2061     /* This function is called to check the name given in a POSIX-style class entry
2062     such as [:alnum:].
2063    
2064     Arguments:
2065     ptr points to the first letter
2066     len the length of the name
2067    
2068     Returns: a value representing the name, or -1 if unknown
2069     */
2070    
2071     static int
2072     check_posix_name(const uschar *ptr, int len)
2073     {
2074 ph10 240 const char *pn = posix_names;
2075 nigel 77 register int yield = 0;
2076     while (posix_name_lengths[yield] != 0)
2077     {
2078     if (len == posix_name_lengths[yield] &&
2079 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2080 ph10 243 pn += posix_name_lengths[yield] + 1;
2081 nigel 77 yield++;
2082     }
2083     return -1;
2084     }
2085    
2086    
2087     /*************************************************
2088     * Adjust OP_RECURSE items in repeated group *
2089     *************************************************/
2090    
2091     /* OP_RECURSE items contain an offset from the start of the regex to the group
2092     that is referenced. This means that groups can be replicated for fixed
2093     repetition simply by copying (because the recursion is allowed to refer to
2094     earlier groups that are outside the current group). However, when a group is
2095 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2096     inserted before it, after it has been compiled. This means that any OP_RECURSE
2097     items within it that refer to the group itself or any contained groups have to
2098     have their offsets adjusted. That is one of the jobs of this function. Before it
2099     is called, the partially compiled regex must be temporarily terminated with
2100     OP_END.
2101 nigel 77
2102 nigel 93 This function has been extended with the possibility of forward references for
2103     recursions and subroutine calls. It must also check the list of such references
2104     for the group we are dealing with. If it finds that one of the recursions in
2105     the current group is on this list, it adjusts the offset in the list, not the
2106     value in the reference (which is a group number).
2107    
2108 nigel 77 Arguments:
2109     group points to the start of the group
2110     adjust the amount by which the group is to be moved
2111     utf8 TRUE in UTF-8 mode
2112     cd contains pointers to tables etc.
2113 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2114 nigel 77
2115     Returns: nothing
2116     */
2117    
2118     static void
2119 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2120     uschar *save_hwm)
2121 nigel 77 {
2122     uschar *ptr = group;
2123 ph10 224
2124 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2125     {
2126 nigel 93 int offset;
2127     uschar *hc;
2128    
2129     /* See if this recursion is on the forward reference list. If so, adjust the
2130     reference. */
2131 ph10 345
2132 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2133     {
2134     offset = GET(hc, 0);
2135     if (cd->start_code + offset == ptr + 1)
2136     {
2137     PUT(hc, 0, offset + adjust);
2138     break;
2139     }
2140     }
2141    
2142     /* Otherwise, adjust the recursion offset if it's after the start of this
2143     group. */
2144    
2145     if (hc >= cd->hwm)
2146     {
2147     offset = GET(ptr, 1);
2148     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2149     }
2150    
2151 nigel 77 ptr += 1 + LINK_SIZE;
2152     }
2153     }
2154    
2155    
2156    
2157     /*************************************************
2158     * Insert an automatic callout point *
2159     *************************************************/
2160    
2161     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2162     callout points before each pattern item.
2163    
2164     Arguments:
2165     code current code pointer
2166     ptr current pattern pointer
2167     cd pointers to tables etc
2168    
2169     Returns: new code pointer
2170     */
2171    
2172     static uschar *
2173     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2174     {
2175     *code++ = OP_CALLOUT;
2176     *code++ = 255;
2177     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2178     PUT(code, LINK_SIZE, 0); /* Default length */
2179     return code + 2*LINK_SIZE;
2180     }
2181    
2182    
2183    
2184     /*************************************************
2185     * Complete a callout item *
2186     *************************************************/
2187    
2188     /* A callout item contains the length of the next item in the pattern, which
2189     we can't fill in till after we have reached the relevant point. This is used
2190     for both automatic and manual callouts.
2191    
2192     Arguments:
2193     previous_callout points to previous callout item
2194     ptr current pattern pointer
2195     cd pointers to tables etc
2196    
2197     Returns: nothing
2198     */
2199    
2200     static void
2201     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2202     {
2203     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2204     PUT(previous_callout, 2 + LINK_SIZE, length);
2205     }
2206    
2207    
2208    
2209     #ifdef SUPPORT_UCP
2210     /*************************************************
2211     * Get othercase range *
2212     *************************************************/
2213    
2214     /* This function is passed the start and end of a class range, in UTF-8 mode
2215     with UCP support. It searches up the characters, looking for internal ranges of
2216     characters in the "other" case. Each call returns the next one, updating the
2217     start address.
2218    
2219     Arguments:
2220     cptr points to starting character value; updated
2221     d end value
2222     ocptr where to put start of othercase range
2223     odptr where to put end of othercase range
2224    
2225     Yield: TRUE when range returned; FALSE when no more
2226     */
2227    
2228     static BOOL
2229 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2230     unsigned int *odptr)
2231 nigel 77 {
2232 nigel 93 unsigned int c, othercase, next;
2233 nigel 77
2234     for (c = *cptr; c <= d; c++)
2235 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2236 nigel 77
2237     if (c > d) return FALSE;
2238    
2239     *ocptr = othercase;
2240     next = othercase + 1;
2241    
2242     for (++c; c <= d; c++)
2243     {
2244 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2245 nigel 77 next++;
2246     }
2247    
2248     *odptr = next - 1;
2249     *cptr = c;
2250    
2251     return TRUE;
2252     }
2253     #endif /* SUPPORT_UCP */
2254    
2255    
2256 nigel 93
2257 nigel 77 /*************************************************
2258 nigel 93 * Check if auto-possessifying is possible *
2259     *************************************************/
2260    
2261     /* This function is called for unlimited repeats of certain items, to see
2262     whether the next thing could possibly match the repeated item. If not, it makes
2263     sense to automatically possessify the repeated item.
2264    
2265     Arguments:
2266     op_code the repeated op code
2267     item data for this item, depends on the opcode
2268     utf8 TRUE in UTF-8 mode
2269     utf8_char used for utf8 character bytes, NULL if not relevant
2270     ptr next character in pattern
2271     options options bits
2272     cd contains pointers to tables etc.
2273    
2274     Returns: TRUE if possessifying is wanted
2275     */
2276    
2277     static BOOL
2278     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2279     const uschar *ptr, int options, compile_data *cd)
2280     {
2281     int next;
2282    
2283     /* Skip whitespace and comments in extended mode */
2284    
2285     if ((options & PCRE_EXTENDED) != 0)
2286     {
2287     for (;;)
2288     {
2289     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2290 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2291 nigel 93 {
2292     while (*(++ptr) != 0)
2293     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2294     }
2295     else break;
2296     }
2297     }
2298    
2299     /* If the next item is one that we can handle, get its value. A non-negative
2300     value is a character, a negative value is an escape value. */
2301    
2302 ph10 391 if (*ptr == CHAR_BACKSLASH)
2303 nigel 93 {
2304     int temperrorcode = 0;
2305     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2306     if (temperrorcode != 0) return FALSE;
2307     ptr++; /* Point after the escape sequence */
2308     }
2309    
2310     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2311     {
2312     #ifdef SUPPORT_UTF8
2313     if (utf8) { GETCHARINC(next, ptr); } else
2314     #endif
2315     next = *ptr++;
2316     }
2317    
2318     else return FALSE;
2319    
2320     /* Skip whitespace and comments in extended mode */
2321    
2322     if ((options & PCRE_EXTENDED) != 0)
2323     {
2324     for (;;)
2325     {
2326     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2327 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2328 nigel 93 {
2329     while (*(++ptr) != 0)
2330     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2331     }
2332     else break;
2333     }
2334     }
2335    
2336     /* If the next thing is itself optional, we have to give up. */
2337    
2338 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2339 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2340     return FALSE;
2341 nigel 93
2342     /* Now compare the next item with the previous opcode. If the previous is a
2343     positive single character match, "item" either contains the character or, if
2344     "item" is greater than 127 in utf8 mode, the character's bytes are in
2345     utf8_char. */
2346    
2347    
2348     /* Handle cases when the next item is a character. */
2349    
2350     if (next >= 0) switch(op_code)
2351     {
2352     case OP_CHAR:
2353     #ifdef SUPPORT_UTF8
2354     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2355 ph10 369 #else
2356     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2357 nigel 93 #endif
2358     return item != next;
2359    
2360     /* For CHARNC (caseless character) we must check the other case. If we have
2361     Unicode property support, we can use it to test the other case of
2362     high-valued characters. */
2363    
2364     case OP_CHARNC:
2365     #ifdef SUPPORT_UTF8
2366     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2367     #endif
2368     if (item == next) return FALSE;
2369     #ifdef SUPPORT_UTF8
2370     if (utf8)
2371     {
2372     unsigned int othercase;
2373     if (next < 128) othercase = cd->fcc[next]; else
2374     #ifdef SUPPORT_UCP
2375 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2376 nigel 93 #else
2377     othercase = NOTACHAR;
2378     #endif
2379     return (unsigned int)item != othercase;
2380     }
2381     else
2382     #endif /* SUPPORT_UTF8 */
2383     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2384    
2385     /* For OP_NOT, "item" must be a single-byte character. */
2386    
2387     case OP_NOT:
2388     if (item == next) return TRUE;
2389     if ((options & PCRE_CASELESS) == 0) return FALSE;
2390     #ifdef SUPPORT_UTF8
2391     if (utf8)
2392     {
2393     unsigned int othercase;
2394     if (next < 128) othercase = cd->fcc[next]; else
2395     #ifdef SUPPORT_UCP
2396 ph10 349 othercase = UCD_OTHERCASE(next);
2397 nigel 93 #else
2398     othercase = NOTACHAR;
2399     #endif
2400     return (unsigned int)item == othercase;
2401     }
2402     else
2403     #endif /* SUPPORT_UTF8 */
2404     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2405    
2406     case OP_DIGIT:
2407     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2408    
2409     case OP_NOT_DIGIT:
2410     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2411    
2412     case OP_WHITESPACE:
2413     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2414    
2415     case OP_NOT_WHITESPACE:
2416     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2417    
2418     case OP_WORDCHAR:
2419     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2420    
2421     case OP_NOT_WORDCHAR:
2422     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2423    
2424 ph10 180 case OP_HSPACE:
2425     case OP_NOT_HSPACE:
2426     switch(next)
2427     {
2428     case 0x09:
2429     case 0x20:
2430     case 0xa0:
2431     case 0x1680:
2432     case 0x180e:
2433     case 0x2000:
2434     case 0x2001:
2435     case 0x2002:
2436     case 0x2003:
2437     case 0x2004:
2438     case 0x2005:
2439     case 0x2006:
2440     case 0x2007:
2441     case 0x2008:
2442     case 0x2009:
2443     case 0x200A:
2444     case 0x202f:
2445     case 0x205f:
2446     case 0x3000:
2447     return op_code != OP_HSPACE;
2448     default:
2449     return op_code == OP_HSPACE;
2450     }
2451    
2452     case OP_VSPACE:
2453     case OP_NOT_VSPACE:
2454     switch(next)
2455     {
2456     case 0x0a:
2457     case 0x0b:
2458     case 0x0c:
2459     case 0x0d:
2460     case 0x85:
2461     case 0x2028:
2462     case 0x2029:
2463     return op_code != OP_VSPACE;
2464     default:
2465     return op_code == OP_VSPACE;
2466     }
2467    
2468 nigel 93 default:
2469     return FALSE;
2470     }
2471    
2472    
2473     /* Handle the case when the next item is \d, \s, etc. */
2474    
2475     switch(op_code)
2476     {
2477     case OP_CHAR:
2478     case OP_CHARNC:
2479     #ifdef SUPPORT_UTF8
2480     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2481     #endif
2482     switch(-next)
2483     {
2484     case ESC_d:
2485     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2486    
2487     case ESC_D:
2488     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2489    
2490     case ESC_s:
2491     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2492    
2493     case ESC_S:
2494     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2495    
2496     case ESC_w:
2497     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2498    
2499     case ESC_W:
2500     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2501 ph10 182
2502 ph10 180 case ESC_h:
2503     case ESC_H:
2504     switch(item)
2505     {
2506     case 0x09:
2507     case 0x20:
2508     case 0xa0:
2509     case 0x1680:
2510     case 0x180e:
2511     case 0x2000:
2512     case 0x2001:
2513     case 0x2002:
2514     case 0x2003:
2515     case 0x2004:
2516     case 0x2005:
2517     case 0x2006:
2518     case 0x2007:
2519     case 0x2008:
2520     case 0x2009:
2521     case 0x200A:
2522     case 0x202f:
2523     case 0x205f:
2524     case 0x3000:
2525     return -next != ESC_h;
2526     default:
2527     return -next == ESC_h;
2528 ph10 182 }
2529    
2530 ph10 180 case ESC_v:
2531     case ESC_V:
2532     switch(item)
2533     {
2534     case 0x0a:
2535     case 0x0b:
2536     case 0x0c:
2537     case 0x0d:
2538     case 0x85:
2539     case 0x2028:
2540     case 0x2029:
2541     return -next != ESC_v;
2542     default:
2543     return -next == ESC_v;
2544 ph10 182 }
2545 nigel 93
2546     default:
2547     return FALSE;
2548     }
2549    
2550     case OP_DIGIT:
2551 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2552     next == -ESC_h || next == -ESC_v;
2553 nigel 93
2554     case OP_NOT_DIGIT:
2555     return next == -ESC_d;
2556    
2557     case OP_WHITESPACE:
2558     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2559    
2560     case OP_NOT_WHITESPACE:
2561 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2562 nigel 93
2563 ph10 180 case OP_HSPACE:
2564     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2565    
2566     case OP_NOT_HSPACE:
2567     return next == -ESC_h;
2568 ph10 182
2569 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2570 ph10 182 case OP_VSPACE:
2571 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2572    
2573     case OP_NOT_VSPACE:
2574 ph10 182 return next == -ESC_v;
2575 ph10 180
2576 nigel 93 case OP_WORDCHAR:
2577 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2578 nigel 93
2579     case OP_NOT_WORDCHAR:
2580     return next == -ESC_w || next == -ESC_d;
2581 ph10 182
2582 nigel 93 default:
2583     return FALSE;
2584     }
2585    
2586     /* Control does not reach here */
2587     }
2588    
2589    
2590    
2591     /*************************************************
2592 nigel 77 * Compile one branch *
2593     *************************************************/
2594    
2595 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2596 nigel 77 changed during the branch, the pointer is used to change the external options
2597 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2598     to find out the amount of memory needed, as well as during the real compile
2599     phase. The value of lengthptr distinguishes the two phases.
2600 nigel 77
2601     Arguments:
2602     optionsptr pointer to the option bits
2603     codeptr points to the pointer to the current code point
2604     ptrptr points to the current pattern pointer
2605     errorcodeptr points to error code variable
2606     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2607     reqbyteptr set to the last literal character required, else < 0
2608     bcptr points to current branch chain
2609     cd contains pointers to tables etc.
2610 nigel 93 lengthptr NULL during the real compile phase
2611     points to length accumulator during pre-compile phase
2612 nigel 77
2613     Returns: TRUE on success
2614     FALSE, with *errorcodeptr set non-zero on error
2615     */
2616    
2617     static BOOL
2618 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2619     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2620     compile_data *cd, int *lengthptr)
2621 nigel 77 {
2622     int repeat_type, op_type;
2623     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2624     int bravalue = 0;
2625     int greedy_default, greedy_non_default;
2626     int firstbyte, reqbyte;
2627     int zeroreqbyte, zerofirstbyte;
2628     int req_caseopt, reqvary, tempreqvary;
2629     int options = *optionsptr;
2630     int after_manual_callout = 0;
2631 nigel 93 int length_prevgroup = 0;
2632 nigel 77 register int c;
2633     register uschar *code = *codeptr;
2634 nigel 93 uschar *last_code = code;
2635     uschar *orig_code = code;
2636 nigel 77 uschar *tempcode;
2637     BOOL inescq = FALSE;
2638     BOOL groupsetfirstbyte = FALSE;
2639     const uschar *ptr = *ptrptr;
2640     const uschar *tempptr;
2641     uschar *previous = NULL;
2642     uschar *previous_callout = NULL;
2643 nigel 93 uschar *save_hwm = NULL;
2644 nigel 77 uschar classbits[32];
2645    
2646     #ifdef SUPPORT_UTF8
2647     BOOL class_utf8;
2648     BOOL utf8 = (options & PCRE_UTF8) != 0;
2649     uschar *class_utf8data;
2650 ph10 300 uschar *class_utf8data_base;
2651 nigel 77 uschar utf8_char[6];
2652     #else
2653     BOOL utf8 = FALSE;
2654 nigel 93 uschar *utf8_char = NULL;
2655 nigel 77 #endif
2656    
2657 nigel 93 #ifdef DEBUG
2658     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2659     #endif
2660    
2661 nigel 77 /* Set up the default and non-default settings for greediness */
2662    
2663     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2664     greedy_non_default = greedy_default ^ 1;
2665    
2666     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2667     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2668     matches a non-fixed char first char; reqbyte just remains unset if we never
2669     find one.
2670    
2671     When we hit a repeat whose minimum is zero, we may have to adjust these values
2672     to take the zero repeat into account. This is implemented by setting them to
2673     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2674     item types that can be repeated set these backoff variables appropriately. */
2675    
2676     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2677    
2678     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2679     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2680     value > 255. It is added into the firstbyte or reqbyte variables to record the
2681     case status of the value. This is used only for ASCII characters. */
2682    
2683     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2684    
2685     /* Switch on next character until the end of the branch */
2686    
2687     for (;; ptr++)
2688     {
2689     BOOL negate_class;
2690 ph10 286 BOOL should_flip_negation;
2691 nigel 77 BOOL possessive_quantifier;
2692     BOOL is_quantifier;
2693 nigel 93 BOOL is_recurse;
2694 ph10 180 BOOL reset_bracount;
2695 nigel 77 int class_charcount;
2696     int class_lastchar;
2697     int newoptions;
2698     int recno;
2699 ph10 172 int refsign;
2700 nigel 77 int skipbytes;
2701     int subreqbyte;
2702     int subfirstbyte;
2703 nigel 93 int terminator;
2704 nigel 77 int mclength;
2705     uschar mcbuffer[8];
2706    
2707 nigel 93 /* Get next byte in the pattern */
2708 nigel 77
2709     c = *ptr;
2710 ph10 345
2711 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2712     previous cycle of this loop. */
2713    
2714     if (lengthptr != NULL)
2715     {
2716     #ifdef DEBUG
2717     if (code > cd->hwm) cd->hwm = code; /* High water info */
2718     #endif
2719     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2720     {
2721     *errorcodeptr = ERR52;
2722     goto FAILED;
2723     }
2724    
2725     /* There is at least one situation where code goes backwards: this is the
2726     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2727     the class is simply eliminated. However, it is created first, so we have to
2728     allow memory for it. Therefore, don't ever reduce the length at this point.
2729     */
2730    
2731     if (code < last_code) code = last_code;
2732 ph10 202
2733     /* Paranoid check for integer overflow */
2734    
2735     if (OFLOW_MAX - *lengthptr < code - last_code)
2736     {
2737     *errorcodeptr = ERR20;
2738     goto FAILED;
2739     }
2740    
2741 nigel 93 *lengthptr += code - last_code;
2742     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2743    
2744     /* If "previous" is set and it is not at the start of the work space, move
2745     it back to there, in order to avoid filling up the work space. Otherwise,
2746     if "previous" is NULL, reset the current code pointer to the start. */
2747    
2748     if (previous != NULL)
2749     {
2750     if (previous > orig_code)
2751     {
2752     memmove(orig_code, previous, code - previous);
2753     code -= previous - orig_code;
2754     previous = orig_code;
2755     }
2756     }
2757     else code = orig_code;
2758    
2759     /* Remember where this code item starts so we can pick up the length
2760     next time round. */
2761    
2762     last_code = code;
2763     }
2764    
2765     /* In the real compile phase, just check the workspace used by the forward
2766     reference list. */
2767    
2768     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2769     {
2770     *errorcodeptr = ERR52;
2771     goto FAILED;
2772     }
2773    
2774 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2775    
2776     if (inescq && c != 0)
2777     {
2778 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2779 nigel 77 {
2780     inescq = FALSE;
2781     ptr++;
2782     continue;
2783     }
2784     else
2785     {
2786     if (previous_callout != NULL)
2787     {
2788 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2789     complete_callout(previous_callout, ptr, cd);
2790 nigel 77 previous_callout = NULL;
2791     }
2792     if ((options & PCRE_AUTO_CALLOUT) != 0)
2793     {
2794     previous_callout = code;
2795     code = auto_callout(code, ptr, cd);
2796     }
2797     goto NORMAL_CHAR;
2798     }
2799     }
2800    
2801     /* Fill in length of a previous callout, except when the next thing is
2802     a quantifier. */
2803    
2804 ph10 392 is_quantifier =
2805 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2806     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2807 nigel 77
2808     if (!is_quantifier && previous_callout != NULL &&
2809     after_manual_callout-- <= 0)
2810     {
2811 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2812     complete_callout(previous_callout, ptr, cd);
2813 nigel 77 previous_callout = NULL;
2814     }
2815    
2816     /* In extended mode, skip white space and comments */
2817    
2818     if ((options & PCRE_EXTENDED) != 0)
2819     {
2820     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2821 ph10 391 if (c == CHAR_NUMBER_SIGN)
2822 nigel 77 {
2823 nigel 93 while (*(++ptr) != 0)
2824 nigel 91 {
2825 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2826 nigel 91 }
2827 nigel 93 if (*ptr != 0) continue;
2828    
2829 nigel 91 /* Else fall through to handle end of string */
2830     c = 0;
2831 nigel 77 }
2832     }
2833    
2834     /* No auto callout for quantifiers. */
2835    
2836     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2837     {
2838     previous_callout = code;
2839     code = auto_callout(code, ptr, cd);
2840     }
2841    
2842     switch(c)
2843     {
2844 nigel 93 /* ===================================================================*/
2845     case 0: /* The branch terminates at string end */
2846 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2847     case CHAR_RIGHT_PARENTHESIS:
2848 nigel 77 *firstbyteptr = firstbyte;
2849     *reqbyteptr = reqbyte;
2850     *codeptr = code;
2851     *ptrptr = ptr;
2852 nigel 93 if (lengthptr != NULL)
2853     {
2854 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2855     {
2856     *errorcodeptr = ERR20;
2857     goto FAILED;
2858     }
2859 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2860     DPRINTF((">> end branch\n"));
2861     }
2862 nigel 77 return TRUE;
2863    
2864 nigel 93
2865     /* ===================================================================*/
2866 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2867     the setting of any following char as a first character. */
2868    
2869 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2870 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2871     {
2872     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2873     }
2874     previous = NULL;
2875     *code++ = OP_CIRC;
2876     break;
2877    
2878 ph10 391 case CHAR_DOLLAR_SIGN:
2879 nigel 77 previous = NULL;
2880     *code++ = OP_DOLL;
2881     break;
2882    
2883     /* There can never be a first char if '.' is first, whatever happens about
2884     repeats. The value of reqbyte doesn't change either. */
2885    
2886 ph10 391 case CHAR_DOT:
2887 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2888     zerofirstbyte = firstbyte;
2889     zeroreqbyte = reqbyte;
2890     previous = code;
2891 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2892 nigel 77 break;
2893    
2894 nigel 93
2895     /* ===================================================================*/
2896 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2897     32-byte bitmap of the permitted characters, except in the special case
2898     where there is only one such character. For negated classes, we build the
2899     map as usual, then invert it at the end. However, we use a different opcode
2900     so that data characters > 255 can be handled correctly.
2901 nigel 77
2902     If the class contains characters outside the 0-255 range, a different
2903     opcode is compiled. It may optionally have a bit map for characters < 256,
2904     but those above are explicitly listed afterwards. A flag byte tells
2905     whether the bitmap is present, and whether this is a negated class or not.
2906 ph10 345
2907 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2908     default (Perl) mode, it is treated as a data character. */
2909 ph10 345
2910 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2911 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2912     {
2913     *errorcodeptr = ERR64;
2914 ph10 345 goto FAILED;
2915 ph10 336 }
2916 ph10 345 goto NORMAL_CHAR;
2917 nigel 77
2918 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2919 nigel 77 previous = code;
2920    
2921     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2922     they are encountered at the top level, so we'll do that too. */
2923    
2924 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2925 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2926 ph10 295 check_posix_syntax(ptr, &tempptr))
2927 nigel 77 {
2928 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2929 nigel 77 goto FAILED;
2930     }
2931    
2932 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2933 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2934 ph10 205 skip them too. This makes for compatibility with Perl. */
2935 ph10 208
2936 ph10 205 negate_class = FALSE;
2937     for (;;)
2938 nigel 77 {
2939     c = *(++ptr);
2940 ph10 391 if (c == CHAR_BACKSLASH)
2941 ph10 205 {
2942 ph10 392 if (ptr[1] == CHAR_E)
2943 ph10 391 ptr++;
2944 ph10 392 else if (strncmp((const char *)ptr+1,
2945     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2946 ph10 391 ptr += 3;
2947 ph10 392 else
2948 ph10 391 break;
2949 ph10 205 }
2950 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2951 ph10 205 negate_class = TRUE;
2952     else break;
2953 ph10 208 }
2954 ph10 345
2955     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2956     an initial ']' is taken as a data character -- the code below handles
2957 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2958     [^] must match any character, so generate OP_ALLANY. */
2959 ph10 345
2960 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2961 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2962 ph10 341 {
2963     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2964     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2965     zerofirstbyte = firstbyte;
2966     break;
2967 ph10 345 }
2968 nigel 77
2969 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2970     negation flag at the end, so that support for characters > 255 works
2971 ph10 264 correctly (they are all included in the class). */
2972    
2973     should_flip_negation = FALSE;
2974    
2975 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2976 nigel 93 of just a single character (as long as it's < 256). However, for higher
2977     valued UTF-8 characters, we don't yet do any optimization. */
2978 nigel 77
2979     class_charcount = 0;
2980     class_lastchar = -1;
2981    
2982 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2983     temporary bit of memory, in case the class contains only 1 character (less
2984     than 256), because in that case the compiled code doesn't use the bit map.
2985     */
2986    
2987     memset(classbits, 0, 32 * sizeof(uschar));
2988    
2989 nigel 77 #ifdef SUPPORT_UTF8
2990     class_utf8 = FALSE; /* No chars >= 256 */
2991 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2992 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2993 nigel 77 #endif
2994    
2995     /* Process characters until ] is reached. By writing this as a "do" it
2996 nigel 93 means that an initial ] is taken as a data character. At the start of the
2997     loop, c contains the first byte of the character. */
2998 nigel 77
2999 nigel 93 if (c != 0) do
3000 nigel 77 {
3001 nigel 93 const uschar *oldptr;
3002    
3003 nigel 77 #ifdef SUPPORT_UTF8
3004     if (utf8 && c > 127)
3005     { /* Braces are required because the */
3006     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3007     }
3008 ph10 309
3009 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3010 ph10 309 data and reset the pointer. This is so that very large classes that
3011 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3012 ph10 309 (which is on the stack). */
3013    
3014 ph10 300 if (lengthptr != NULL)
3015     {
3016     *lengthptr += class_utf8data - class_utf8data_base;
3017 ph10 309 class_utf8data = class_utf8data_base;
3018     }
3019    
3020 nigel 77 #endif
3021    
3022     /* Inside \Q...\E everything is literal except \E */
3023    
3024     if (inescq)
3025     {
3026 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3027 nigel 77 {
3028 nigel 93 inescq = FALSE; /* Reset literal state */
3029     ptr++; /* Skip the 'E' */
3030     continue; /* Carry on with next */
3031 nigel 77 }
3032 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3033 nigel 77 }
3034    
3035     /* Handle POSIX class names. Perl allows a negation extension of the
3036     form [:^name:]. A square bracket that doesn't match the syntax is
3037     treated as a literal. We also recognize the POSIX constructions
3038     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3039     5.6 and 5.8 do. */
3040    
3041 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3042 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3043 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3044 nigel 77 {
3045     BOOL local_negate = FALSE;
3046 nigel 87 int posix_class, taboffset, tabopt;
3047 nigel 77 register const uschar *cbits = cd->cbits;
3048 nigel 87 uschar pbits[32];
3049 nigel 77
3050 ph10 391 if (ptr[1] != CHAR_COLON)
3051 nigel 77 {
3052     *errorcodeptr = ERR31;
3053     goto FAILED;
3054     }
3055    
3056     ptr += 2;
3057 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3058 nigel 77 {
3059     local_negate = TRUE;
3060 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3061 nigel 77 ptr++;
3062     }
3063    
3064     posix_class = check_posix_name(ptr, tempptr - ptr);
3065     if (posix_class < 0)
3066     {
3067     *errorcodeptr = ERR30;
3068     goto FAILED;
3069     }
3070    
3071     /* If matching is caseless, upper and lower are converted to
3072     alpha. This relies on the fact that the class table starts with
3073     alpha, lower, upper as the first 3 entries. */
3074    
3075     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3076     posix_class = 0;
3077    
3078 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3079     because we may be adding and subtracting from it, and we don't want to
3080     subtract bits that may be in the main map already. At the end we or the
3081     result into the bit map that is being built. */
3082 nigel 77
3083     posix_class *= 3;
3084 nigel 87
3085     /* Copy in the first table (always present) */
3086    
3087     memcpy(pbits, cbits + posix_class_maps[posix_class],
3088     32 * sizeof(uschar));
3089    
3090     /* If there is a second table, add or remove it as required. */
3091    
3092     taboffset = posix_class_maps[posix_class + 1];
3093     tabopt = posix_class_maps[posix_class + 2];
3094    
3095     if (taboffset >= 0)
3096 nigel 77 {
3097 nigel 87 if (tabopt >= 0)
3098     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3099 nigel 77 else
3100 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3101 nigel 77 }
3102    
3103 nigel 87 /* Now see if we need to remove any special characters. An option
3104     value of 1 removes vertical space and 2 removes underscore. */
3105    
3106     if (tabopt < 0) tabopt = -tabopt;
3107     if (tabopt == 1) pbits[1] &= ~0x3c;
3108     else if (tabopt == 2) pbits[11] &= 0x7f;
3109    
3110     /* Add the POSIX table or its complement into the main table that is
3111     being built and we are done. */
3112    
3113     if (local_negate)
3114     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3115     else
3116     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3117    
3118 nigel 77 ptr = tempptr + 1;
3119     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3120     continue; /* End of POSIX syntax handling */
3121     }
3122    
3123     /* Backslash may introduce a single character, or it may introduce one
3124 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3125     case. Inside a class (and only there) it is treated as backspace.
3126     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3127 ph10 205 to 'or' into the one we are building. We assume they have more than one
3128 nigel 77 character in them, so set class_charcount bigger than one. */
3129    
3130 ph10 391 if (c == CHAR_BACKSLASH)
3131 nigel 77 {
3132 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3133     if (*errorcodeptr != 0) goto FAILED;
3134 nigel 77
3135 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3136     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3137     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3138 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3139     {
3140 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3141 nigel 77 {
3142     ptr += 2; /* avoid empty string */
3143     }
3144     else inescq = TRUE;
3145     continue;
3146     }
3147 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3148 nigel 77
3149     if (c < 0)
3150     {
3151     register const uschar *cbits = cd->cbits;
3152     class_charcount += 2; /* Greater than 1 is what matters */
3153 nigel 93
3154     /* Save time by not doing this in the pre-compile phase. */
3155    
3156     if (lengthptr == NULL) switch (-c)
3157 nigel 77 {
3158     case ESC_d:
3159     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3160     continue;
3161    
3162     case ESC_D:
3163 ph10 286 should_flip_negation = TRUE;
3164 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3165     continue;
3166    
3167     case ESC_w:
3168     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3169     continue;
3170    
3171     case ESC_W:
3172 ph10 286 should_flip_negation = TRUE;
3173 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3174     continue;
3175    
3176     case ESC_s:
3177     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3178     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3179     continue;
3180    
3181     case ESC_S:
3182 ph10 286 should_flip_negation = TRUE;
3183 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3184     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3185     continue;
3186    
3187 nigel 93 default: /* Not recognized; fall through */
3188     break; /* Need "default" setting to stop compiler warning. */
3189     }
3190    
3191     /* In the pre-compile phase, just do the recognition. */
3192    
3193     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3194     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3195 ph10 180
3196 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3197     they use extra memory. */
3198 ph10 180
3199 ph10 178 if (-c == ESC_h)
3200     {
3201     SETBIT(classbits, 0x09); /* VT */
3202     SETBIT(classbits, 0x20); /* SPACE */
3203 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3204 ph10 178 #ifdef SUPPORT_UTF8
3205     if (utf8)
3206 ph10 180 {
3207 ph10 178 class_utf8 = TRUE;
3208     *class_utf8data++ = XCL_SINGLE;
3209 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3210 ph10 178 *class_utf8data++ = XCL_SINGLE;
3211 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3212     *class_utf8data++ = XCL_RANGE;
3213     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3214     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3215 ph10 178 *class_utf8data++ = XCL_SINGLE;
3216 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3217 ph10 178 *class_utf8data++ = XCL_SINGLE;
3218 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3219 ph10 178 *class_utf8data++ = XCL_SINGLE;
3220 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3221     }
3222     #endif
3223     continue;
3224     }
3225 nigel 93
3226 ph10 178 if (-c == ESC_H)
3227     {
3228     for (c = 0; c < 32; c++)
3229     {
3230     int x = 0xff;
3231     switch (c)
3232 ph10 180 {
3233 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3234     case 0x20/8: x ^= 1 << (0x20%8); break;
3235     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3236     default: break;
3237     }
3238     classbits[c] |= x;
3239 ph10 180 }
3240    
3241 ph10 178 #ifdef SUPPORT_UTF8
3242     if (utf8)
3243 ph10 180 {
3244 ph10 178 class_utf8 = TRUE;
3245 ph10 180 *class_utf8data++ = XCL_RANGE;
3246     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3247     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3248     *class_utf8data++ = XCL_RANGE;
3249     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3250     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3251     *class_utf8data++ = XCL_RANGE;
3252     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3253     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3254     *class_utf8data++ = XCL_RANGE;
3255     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3256     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3257     *class_utf8data++ = XCL_RANGE;
3258     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3259     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3260     *class_utf8data++ = XCL_RANGE;
3261     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3262     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3263     *class_utf8data++ = XCL_RANGE;
3264     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3265     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3266     }
3267     #endif
3268     continue;
3269     }
3270 ph10 178
3271     if (-c == ESC_v)
3272     {
3273     SETBIT(classbits, 0x0a); /* LF */
3274     SETBIT(classbits, 0x0b); /* VT */
3275 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3276     SETBIT(classbits, 0x0d); /* CR */
3277     SETBIT(classbits, 0x85); /* NEL */
3278 ph10 178 #ifdef SUPPORT_UTF8
3279     if (utf8)
3280 ph10 180 {
3281 ph10 178 class_utf8 = TRUE;
3282 ph10 180 *class_utf8data++ = XCL_RANGE;
3283     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3284     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3285     }
3286     #endif
3287     continue;
3288     }
3289 ph10 178
3290     if (-c == ESC_V)
3291     {
3292     for (c = 0; c < 32; c++)
3293     {
3294     int x = 0xff;
3295     switch (c)
3296 ph10 180 {
3297 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3298     x ^= 1 << (0x0b%8);
3299     x ^= 1 << (0x0c%8);
3300 ph10 180 x ^= 1 << (0x0d%8);
3301 ph10 178 break;
3302     case 0x85/8: x ^= 1 << (0x85%8); break;
3303     default: break;
3304     }
3305     classbits[c] |= x;
3306 ph10 180 }
3307    
3308 ph10 178 #ifdef SUPPORT_UTF8
3309     if (utf8)
3310 ph10 180 {
3311 ph10 178 class_utf8 = TRUE;
3312 ph10 180 *class_utf8data++ = XCL_RANGE;
3313     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3314     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3315     *class_utf8data++ = XCL_RANGE;
3316     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3317     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3318     }
3319     #endif
3320     continue;
3321     }
3322 ph10 178
3323 nigel 93 /* We need to deal with \P and \p in both phases. */
3324    
3325 nigel 77 #ifdef SUPPORT_UCP
3326 nigel 93 if (-c == ESC_p || -c == ESC_P)
3327     {
3328     BOOL negated;
3329     int pdata;
3330     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3331     if (ptype < 0) goto FAILED;
3332     class_utf8 = TRUE;
3333     *class_utf8data++ = ((-c == ESC_p) != negated)?
3334     XCL_PROP : XCL_NOTPROP;
3335     *class_utf8data++ = ptype;
3336     *class_utf8data++ = pdata;
3337     class_charcount -= 2; /* Not a < 256 character */
3338 nigel 77 continue;
3339 nigel 93 }
3340 nigel 77 #endif
3341 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3342     strict mode. By default, for compatibility with Perl, they are
3343     treated as literals. */
3344 nigel 77
3345 nigel 93 if ((options & PCRE_EXTRA) != 0)
3346     {
3347     *errorcodeptr = ERR7;
3348     goto FAILED;
3349     }
3350 nigel 77
3351 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3352     c = *ptr; /* Get the final character and fall through */
3353 nigel 77 }
3354    
3355     /* Fall through if we have a single character (c >= 0). This may be
3356 nigel 93 greater than 256 in UTF-8 mode. */
3357 nigel 77
3358     } /* End of backslash handling */
3359    
3360     /* A single character may be followed by '-' to form a range. However,
3361     Perl does not permit ']' to be the end of the range. A '-' character
3362 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3363     entirely. The code for handling \Q and \E is messy. */
3364 nigel 77
3365 nigel 93 CHECK_RANGE:
3366 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3367 nigel 77 {
3368 nigel 93 inescq = FALSE;
3369     ptr += 2;
3370     }
3371    
3372     oldptr = ptr;
3373 ph10 231
3374 ph10 230 /* Remember \r or \n */
3375 ph10 231
3376 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3377 ph10 231
3378 ph10 230 /* Check for range */
3379 nigel 93
3380 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3381 nigel 93 {
3382 nigel 77 int d;
3383     ptr += 2;
3384 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3385 nigel 77
3386 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3387     mode. */
3388    
3389 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3390 nigel 93 {
3391     ptr += 2;
3392 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3393 ph10 391 { ptr += 2; continue; }
3394 nigel 93 inescq = TRUE;
3395     break;
3396     }
3397    
3398 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3399 nigel 93 {
3400     ptr = oldptr;
3401     goto LONE_SINGLE_CHARACTER;
3402     }
3403    
3404 nigel 77 #ifdef SUPPORT_UTF8
3405     if (utf8)
3406     { /* Braces are required because the */
3407     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3408     }
3409     else
3410     #endif
3411     d = *ptr; /* Not UTF-8 mode */
3412    
3413     /* The second part of a range can be a single-character escape, but
3414     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3415     in such circumstances. */
3416    
3417 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3418 nigel 77 {
3419 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3420     if (*errorcodeptr != 0) goto FAILED;
3421 nigel 77
3422 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3423 nigel 93 special means the '-' was literal */
3424 nigel 77
3425     if (d < 0)
3426     {
3427 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3428     else if (d == -ESC_X) d = CHAR_X;
3429     else if (d == -ESC_R) d = CHAR_R; else
3430 nigel 77 {
3431 nigel 93 ptr = oldptr;
3432 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3433     }
3434     }
3435     }
3436    
3437 nigel 93 /* Check that the two values are in the correct order. Optimize
3438     one-character ranges */
3439 nigel 77
3440 nigel 93 if (d < c)
3441     {
3442     *errorcodeptr = ERR8;
3443     goto FAILED;
3444     }
3445    
3446 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3447    
3448 ph10 230 /* Remember \r or \n */
3449 ph10 231
3450 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3451 ph10 231
3452 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3453     matching, we have to use an XCLASS with extra data items. Caseless
3454     matching for characters > 127 is available only if UCP support is
3455     available. */
3456    
3457     #ifdef SUPPORT_UTF8
3458     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3459     {
3460     class_utf8 = TRUE;
3461    
3462     /* With UCP support, we can find the other case equivalents of
3463     the relevant characters. There may be several ranges. Optimize how
3464     they fit with the basic range. */
3465    
3466     #ifdef SUPPORT_UCP
3467     if ((options & PCRE_CASELESS) != 0)
3468     {
3469 nigel 93 unsigned int occ, ocd;
3470     unsigned int cc = c;
3471     unsigned int origd = d;
3472 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3473     {
3474 ph10 180 if (occ >= (unsigned int)c &&
3475     ocd <= (unsigned int)d)
3476 ph10 176 continue; /* Skip embedded ranges */
3477 nigel 77
3478 ph10 180 if (occ < (unsigned int)c &&
3479 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3480 nigel 77 { /* if there is overlap, */
3481     c = occ; /* noting that if occ < c */
3482     continue; /* we can't have ocd > d */
3483     } /* because a subrange is */
3484 ph10 180 if (ocd > (unsigned int)d &&
3485 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3486 nigel 77 { /* the basic range. */
3487     d = ocd;
3488     continue;
3489     }
3490    
3491     if (occ == ocd)
3492     {
3493     *class_utf8data++ = XCL_SINGLE;
3494     }
3495     else
3496     {
3497     *class_utf8data++ = XCL_RANGE;
3498     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3499     }
3500     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3501     }
3502     }
3503     #endif /* SUPPORT_UCP */
3504    
3505     /* Now record the original range, possibly modified for UCP caseless
3506     overlapping ranges. */
3507    
3508     *class_utf8data++ = XCL_RANGE;
3509     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3510     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3511    
3512     /* With UCP support, we are done. Without UCP support, there is no
3513     caseless matching for UTF-8 characters > 127; we can use the bit map
3514     for the smaller ones. */
3515    
3516     #ifdef SUPPORT_UCP
3517     continue; /* With next character in the class */
3518     #else
3519     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3520    
3521     /* Adjust upper limit and fall through to set up the map */
3522    
3523     d = 127;
3524    
3525     #endif /* SUPPORT_UCP */
3526     }
3527     #endif /* SUPPORT_UTF8 */
3528    
3529     /* We use the bit map for all cases when not in UTF-8 mode; else
3530     ranges that lie entirely within 0-127 when there is UCP support; else
3531     for partial ranges without UCP support. */
3532    
3533 nigel 93 class_charcount += d - c + 1;
3534     class_lastchar = d;
3535    
3536     /* We can save a bit of time by skipping this in the pre-compile. */
3537    
3538     if (lengthptr == NULL) for (; c <= d; c++)
3539 nigel 77 {
3540     classbits[c/8] |= (1 << (c&7));
3541     if ((options & PCRE_CASELESS) != 0)
3542     {
3543     int uc = cd->fcc[c]; /* flip case */
3544     classbits[uc/8] |= (1 << (uc&7));
3545     }
3546     }
3547    
3548     continue; /* Go get the next char in the class */
3549     }
3550    
3551     /* Handle a lone single character - we can get here for a normal
3552     non-escape char, or after \ that introduces a single character or for an
3553     apparent range that isn't. */
3554    
3555     LONE_SINGLE_CHARACTER:
3556 ph10 231
3557 nigel 77 /* Handle a character that cannot go in the bit map */
3558    
3559     #ifdef SUPPORT_UTF8
3560     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3561     {
3562     class_utf8 = TRUE;
3563     *class_utf8data++ = XCL_SINGLE;
3564     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3565    
3566     #ifdef SUPPORT_UCP
3567     if ((options & PCRE_CASELESS) != 0)
3568     {
3569 nigel 93 unsigned int othercase;
3570 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3571 nigel 77 {
3572     *class_utf8data++ = XCL_SINGLE;
3573     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3574     }
3575     }
3576     #endif /* SUPPORT_UCP */
3577    
3578     }
3579     else
3580     #endif /* SUPPORT_UTF8 */
3581    
3582     /* Handle a single-byte character */
3583     {
3584     classbits[c/8] |= (1 << (c&7));
3585     if ((options & PCRE_CASELESS) != 0)
3586     {
3587     c = cd->fcc[c]; /* flip case */
3588     classbits[c/8] |= (1 << (c&7));
3589     }
3590     class_charcount++;
3591     class_lastchar = c;
3592     }
3593     }
3594    
3595 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3596 nigel 77
3597 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3598 nigel 77
3599 nigel 93 if (c == 0) /* Missing terminating ']' */
3600     {
3601     *errorcodeptr = ERR6;
3602     goto FAILED;
3603     }
3604 ph10 231
3605    
3606 ph10 230 /* This code has been disabled because it would mean that \s counts as
3607     an explicit \r or \n reference, and that's not really what is wanted. Now
3608     we set the flag only if there is a literal "\r" or "\n" in the class. */
3609 ph10 227
3610 ph10 230 #if 0
3611 ph10 226 /* Remember whether \r or \n are in this class */
3612 ph10 227
3613 ph10 226 if (negate_class)
3614     {
3615 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3616 ph10 226 }
3617     else
3618     {
3619 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3620 ph10 227 }
3621 ph10 230 #endif
3622 ph10 227
3623 ph10 231
3624 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3625 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3626     use of \p or \P, in other words, no use of any XCLASS features, we can
3627     optimize.
3628    
3629 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3630     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3631     operate on single-bytes only. This is an historical hangover. Maybe one day
3632     we can tidy these opcodes to handle multi-byte characters.
3633 nigel 77
3634     The optimization throws away the bit map. We turn the item into a
3635     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3636     that OP_NOT does not support multibyte characters. In the positive case, it
3637     can cause firstbyte to be set. Otherwise, there can be no first char if
3638     this item is first, whatever repeat count may follow. In the case of
3639     reqbyte, save the previous value for reinstating. */
3640    
3641     #ifdef SUPPORT_UTF8
3642 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3643 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3644 nigel 77 #else
3645     if (class_charcount == 1)
3646     #endif
3647     {
3648     zeroreqbyte = reqbyte;
3649    
3650     /* The OP_NOT opcode works on one-byte characters only. */
3651    
3652     if (negate_class)
3653     {
3654     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3655     zerofirstbyte = firstbyte;
3656     *code++ = OP_NOT;
3657     *code++ = class_lastchar;
3658     break;
3659     }
3660    
3661     /* For a single, positive character, get the value into mcbuffer, and
3662     then we can handle this with the normal one-character code. */
3663    
3664     #ifdef SUPPORT_UTF8
3665     if (utf8 && class_lastchar > 127)
3666     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3667     else
3668     #endif
3669     {
3670     mcbuffer[0] = class_lastchar;
3671     mclength = 1;
3672     }
3673     goto ONE_CHAR;
3674     } /* End of 1-char optimization */
3675    
3676     /* The general case - not the one-char optimization. If this is the first
3677     thing in the branch, there can be no first char setting, whatever the
3678     repeat count. Any reqbyte setting must remain unchanged after any kind of
3679     repeat. */
3680    
3681     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3682     zerofirstbyte = firstbyte;
3683     zeroreqbyte = reqbyte;
3684    
3685     /* If there are characters with values > 255, we have to compile an
3686 ph10 286 extended class, with its own opcode, unless there was a negated special
3687     such as \S in the class, because in that case all characters > 255 are in
3688     the class, so any that were explicitly given as well can be ignored. If
3689 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3690     characters < 256, we can omit the bitmap in the actual compiled code. */
3691 nigel 77
3692     #ifdef SUPPORT_UTF8
3693 ph10 264 if (class_utf8 && !should_flip_negation)
3694 nigel 77 {
3695     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3696     *code++ = OP_XCLASS;
3697     code += LINK_SIZE;
3698     *code = negate_class? XCL_NOT : 0;
3699    
3700 nigel 93 /* If the map is required, move up the extra data to make room for it;
3701     otherwise just move the code pointer to the end of the extra data. */
3702 nigel 77
3703     if (class_charcount > 0)
3704     {
3705     *code++ |= XCL_MAP;
3706 nigel 93 memmove(code + 32, code, class_utf8data - code);
3707 nigel 77 memcpy(code, classbits, 32);
3708 nigel 93 code = class_utf8data + 32;
3709 nigel 77 }
3710 nigel 93 else code = class_utf8data;
3711 nigel 77
3712     /* Now fill in the complete length of the item */
3713    
3714     PUT(previous, 1, code - previous);
3715     break; /* End of class handling */
3716     }
3717     #endif
3718    
3719 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3720     OP_NCLASS, depending on whether the whole class was negated and whether
3721     there were negative specials such as \S in the class. Then copy the 32-byte
3722 ph10 264 map into the code vector, negating it if necessary. */
3723 ph10 286
3724 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3725 nigel 77 if (negate_class)
3726     {
3727 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3728     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3729 nigel 77 }
3730     else
3731     {
3732     memcpy(code, classbits, 32);
3733     }
3734     code += 32;
3735     break;
3736    
3737 nigel 93
3738     /* ===================================================================*/
3739 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3740     has been tested above. */
3741    
3742 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3743 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3744     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3745     if (*errorcodeptr != 0) goto FAILED;
3746     goto REPEAT;
3747    
3748 ph10 391 case CHAR_ASTERISK:
3749 nigel 77 repeat_min = 0;
3750     repeat_max = -1;
3751     goto REPEAT;
3752    
3753 ph10 391 case CHAR_PLUS:
3754 nigel 77 repeat_min = 1;
3755     repeat_max = -1;
3756     goto REPEAT;
3757    
3758 ph10 391 case CHAR_QUESTION_MARK:
3759 nigel 77 repeat_min = 0;
3760     repeat_max = 1;
3761    
3762     REPEAT:
3763     if (previous == NULL)
3764     {
3765     *errorcodeptr = ERR9;
3766     goto FAILED;
3767     }
3768    
3769     if (repeat_min == 0)
3770     {
3771     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3772     reqbyte = zeroreqbyte; /* Ditto */
3773     }
3774    
3775     /* Remember whether this is a variable length repeat */
3776    
3777     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3778    
3779     op_type = 0; /* Default single-char op codes */
3780     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3781    
3782     /* Save start of previous item, in case we have to move it up to make space
3783     for an inserted OP_ONCE for the additional '+' extension. */
3784    
3785     tempcode = previous;
3786    
3787     /* If the next character is '+', we have a possessive quantifier. This
3788     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3789     If the next character is '?' this is a minimizing repeat, by default,
3790     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3791     repeat type to the non-default. */
3792    
3793 ph10 391 if (ptr[1] == CHAR_PLUS)
3794 nigel 77 {
3795     repeat_type = 0; /* Force greedy */
3796     possessive_quantifier = TRUE;
3797     ptr++;
3798     }
3799 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3800 nigel 77 {
3801     repeat_type = greedy_non_default;
3802     ptr++;
3803     }
3804     else repeat_type = greedy_default;
3805    
3806     /* If previous was a character match, abolish the item and generate a
3807     repeat item instead. If a char item has a minimum of more than one, ensure
3808     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3809     the first thing in a branch because the x will have gone into firstbyte
3810     instead. */
3811    
3812     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3813     {
3814     /* Deal with UTF-8 characters that take up more than one byte. It's
3815     easier to write this out separately than try to macrify it. Use c to
3816     hold the length of the character in bytes, plus 0x80 to flag that it's a
3817     length rather than a small character. */
3818    
3819     #ifdef SUPPORT_UTF8
3820     if (utf8 && (code[-1] & 0x80) != 0)
3821     {
3822     uschar *lastchar = code - 1;
3823     while((*lastchar & 0xc0) == 0x80) lastchar--;
3824     c = code - lastchar; /* Length of UTF-8 character */
3825     memcpy(utf8_char, lastchar, c); /* Save the char */
3826     c |= 0x80; /* Flag c as a length */
3827     }
3828     else
3829     #endif
3830    
3831     /* Handle the case of a single byte - either with no UTF8 support, or
3832     with UTF-8 disabled, or for a UTF-8 character < 128. */
3833    
3834     {
3835     c = code[-1];
3836     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3837     }
3838    
3839 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3840     the line is something that cannot possibly match this character. If so,
3841     automatically possessifying this item gains some performance in the case
3842     where the match fails. */
3843    
3844     if (!possessive_quantifier &&
3845     repeat_max < 0 &&
3846     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3847     options, cd))
3848     {
3849     repeat_type = 0; /* Force greedy */
3850     possessive_quantifier = TRUE;
3851     }
3852    
3853 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3854     }
3855    
3856     /* If previous was a single negated character ([^a] or similar), we use
3857     one of the special opcodes, replacing it. The code is shared with single-
3858     character repeats by setting op_type to add a suitable offset into
3859 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3860     currently used only for single-byte chars. */
3861 nigel 77
3862     else if (*previous == OP_NOT)
3863     {
3864     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3865     c = previous[1];
3866 nigel 93 if (!possessive_quantifier &&
3867     repeat_max < 0 &&
3868     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3869     {
3870     repeat_type = 0; /* Force greedy */
3871     possessive_quantifier = TRUE;
3872     }
3873 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3874     }
3875    
3876     /* If previous was a character type match (\d or similar), abolish it and
3877     create a suitable repeat item. The code is shared with single-character
3878     repeats by setting op_type to add a suitable offset into repeat_type. Note
3879     that the Unicode property types will be present only when SUPPORT_UCP is
3880     defined, but we don't wrap the little bits of code here because it just
3881     makes it horribly messy. */
3882    
3883     else if (*previous < OP_EODN)
3884     {
3885     uschar *oldcode;
3886 nigel 87 int prop_type, prop_value;
3887 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3888     c = *previous;
3889    
3890 nigel 93 if (!possessive_quantifier &&
3891     repeat_max < 0 &&
3892     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3893     {
3894     repeat_type = 0; /* Force greedy */
3895     possessive_quantifier = TRUE;
3896     }
3897    
3898 nigel 77 OUTPUT_SINGLE_REPEAT:
3899 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3900     {
3901     prop_type = previous[1];
3902     prop_value = previous[2];
3903     }
3904     else prop_type = prop_value = -1;
3905 nigel 77
3906     oldcode = code;
3907     code = previous; /* Usually overwrite previous item */
3908    
3909     /* If the maximum is zero then the minimum must also be zero; Perl allows
3910     this case, so we do too - by simply omitting the item altogether. */
3911    
3912     if (repeat_max == 0) goto END_REPEAT;
3913    
3914 ph10 426 /*--------------------------------------------------------------------*/
3915     /* This code is obsolete from release 8.00; the restriction was finally
3916     removed: */
3917    
3918 nigel 77 /* All real repeats make it impossible to handle partial matching (maybe
3919     one day we will be able to remove this restriction). */
3920 ph10 426
3921     /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3922     /*--------------------------------------------------------------------*/
3923 nigel 77
3924     /* Combine the op_type with the repeat_type */
3925    
3926     repeat_type += op_type;
3927    
3928     /* A minimum of zero is handled either as the special case * or ?, or as
3929     an UPTO, with the maximum given. */
3930    
3931     if (repeat_min == 0)
3932     {
3933     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3934     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3935     else
3936     {
3937     *code++ = OP_UPTO + repeat_type;
3938     PUT2INC(code, 0, repeat_max);
3939     }
3940     }
3941    
3942     /* A repeat minimum of 1 is optimized into some special cases. If the
3943 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3944 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3945     one less than the maximum. */
3946    
3947     else if (repeat_min == 1)
3948     {
3949     if (repeat_max == -1)
3950     *code++ = OP_PLUS + repeat_type;
3951     else
3952     {
3953     code = oldcode; /* leave previous item in place */
3954     if (repeat_max == 1) goto END_REPEAT;
3955     *code++ = OP_UPTO + repeat_type;
3956     PUT2INC(code, 0, repeat_max - 1);
3957     }
3958     }
3959    
3960     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3961     handled as an EXACT followed by an UPTO. */
3962    
3963     else
3964     {
3965     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3966     PUT2INC(code, 0, repeat_min);
3967    
3968     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3969