/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 459 - (hide annotations) (download)
Sun Oct 4 09:21:39 2009 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 219936 byte(s)
Fix problems with conditional references to duplicate named subpatterns.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps. "a" is a byte array
holding 8 bits per element and "b" is the bit number to set. Both arguments are
parenthesized in the expansion so that expression arguments (for example
SETBIT(map, c + 1)) index and shift as intended. Note that "b" is evaluated
twice, so it must not have side effects. The expansion is an assignment
expression and is meant to be used as a statement: SETBIT(map, n); */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20) /* 20 is the slack left for group terminating bytes */
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096) /* well above the largest usage (218) reported in the comment above */
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 ph10 392 in UTF-8 mode. */
104 ph10 391
/* Entry k describes the character whose code is CHAR_0 + k; check_escape()
looks the value up as escapes[c - CHAR_0] after checking that c lies in the
range CHAR_0 to CHAR_z. The trailing comment on each row names the two
characters that the row covers, in ASCII order. */
105 ph10 392 static const short int escapes[] = {
106 ph10 391 0, 0, /* 0 1 */
107     0, 0, /* 2 3 */
108 ph10 392 0, 0, /* 4 5 */
109     0, 0, /* 6 7 */
110     0, 0, /* 8 9 */
111 ph10 391 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
112 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
113 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
114 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
115     -ESC_B, -ESC_C, /* B C */
116     -ESC_D, -ESC_E, /* D E */
117     0, -ESC_G, /* F G */
118     -ESC_H, 0, /* H I */
119     0, -ESC_K, /* J K */
120 ph10 391 0, 0, /* L M */
121 ph10 392 0, 0, /* N O */
122 ph10 391 -ESC_P, -ESC_Q, /* P Q */
123     -ESC_R, -ESC_S, /* R S */
124 ph10 392 0, 0, /* T U */
125     -ESC_V, -ESC_W, /* V W */
126     -ESC_X, 0, /* X Y */
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
128 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
129 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
130 ph10 391 CHAR_GRAVE_ACCENT, 7, /* ` a */
131 ph10 392 -ESC_b, 0, /* b c */
132     -ESC_d, ESC_e, /* d e */
133 ph10 391 ESC_f, 0, /* f g */
134     -ESC_h, 0, /* h i */
135 ph10 392 0, -ESC_k, /* j k */
136 ph10 391 0, 0, /* l m */
137     ESC_n, 0, /* n o */
138 ph10 392 -ESC_p, 0, /* p q */
139     ESC_r, -ESC_s, /* r s */
140 ph10 391 ESC_tee, 0, /* t u */
141 ph10 392 -ESC_v, -ESC_w, /* v w */
142     0, 0, /* x y */
143 ph10 391 -ESC_z /* z */
144 nigel 77 };
145    
146 ph10 392 #else
147 ph10 391
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
/* Entry k describes the character whose code is 0x48 + k; check_escape()
looks the value up as escapes[c - 0x48]. The hex number in each row comment is
the character code of that row's first entry. */
150 nigel 77 static const short int escapes[] = {
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 392 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 ph10 391 platforms. */
183 ph10 210
/* One entry per verb name. NOTE(review): "len" looks like the length of the
verb's name within verbnames (the values in verbs[] match the name lengths,
e.g. 6 for ACCEPT); "op" is the OP_xxx value paired with that name. */
184     typedef struct verbitem {
185     int len;
186     int op;
187 ph10 211 } verbitem;
188 ph10 210
/* All verb names joined into one string. The order here is the order of the
entries in verbs[]. NOTE(review): the STRING_xxx0 macros are defined outside
this file; presumably each one ends with a zero byte that separates it from the
next name - confirm in pcre_internal.h. */
189 ph10 240 static const char verbnames[] =
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
/* { len, op } pairs, listed in the same order as the names in verbnames. F and
FAIL both map to OP_FAIL. */
198 ph10 327 static const verbitem verbs[] = {
199 ph10 240 { 6, OP_ACCEPT }, /* ACCEPT */
200     { 6, OP_COMMIT }, /* COMMIT */
201     { 1, OP_FAIL }, /* F */
202     { 4, OP_FAIL }, /* FAIL */
203     { 5, OP_PRUNE }, /* PRUNE */
204     { 4, OP_SKIP }, /* SKIP */
205     { 4, OP_THEN } /* THEN */
206 ph10 210 };
207    
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
/* POSIX class names joined into one string, in the same order as the entries
of posix_name_lengths and the rows of posix_class_maps. NOTE(review): the
STRING_xxx0 macros come from outside this file; presumably each ends with a
zero byte separating it from the next name - confirm in pcre_internal.h. */
217 ph10 240 static const char posix_names[] =
218 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
/* Lengths of the names in posix_names, in the same order: twelve 5-letter
names (alpha ... space), then word (4) and xdigit (6). The final 0 is the
end-of-list marker. */
223     static const uschar posix_name_lengths[] = {
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
/* Three ints per class: base map offset, second map offset (or -1), and the
add/subtract-plus-tweak code described above. Rows appear in the same order as
the names in posix_names, so class number i starts at index 3*i. */
236     static const int posix_class_maps[] = {
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written; XSTRING(s) macro-expands s first and then
stringifies the result, so XSTRING(MAX_NAME_SIZE) yields the numeric value as
text rather than the text "MAX_NAME_SIZE". It is used to build the error
messages below. */
254 nigel 93 #define STRING(a) # a
255     #define XSTRING(s) STRING(s)
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
/* Message n is the n-th zero-terminated string in this literal, counting from
zero; the numeric comments mark every fifth message. The last message has no
explicit \0 because the terminator of the string literal itself ends it. */
267 ph10 240 static const char error_texts[] =
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
345     /* 65 */
346     "different names for subpatterns of the same number are not allowed";
347 nigel 77
348    
349     /* Table to identify digits and hex digits. This is used when compiling
350     patterns. Note that the tables in chartables are dependent on the locale, and
351     may mark arbitrary characters as digits - but the PCRE compiling code expects
352     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
353     a private table here. It costs 256 bytes, but it is a lot faster than doing
354     character value tests (at least in some simple cases I timed), and in some
355     applications one wants PCRE to compile efficiently as well as match
356     efficiently.
357    
358     For convenience, we use the same bit definitions as in chartables:
359    
360     0x04 decimal digit
361     0x08 hexadecimal digit
362    
363     Then we can use ctype_digit and ctype_xdigit in the code. */
364    
365 ph10 392 #ifndef EBCDIC
366 ph10 391
367 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
368 ph10 391 UTF-8 mode. */
369    
/* One byte per character code 0-255, eight codes per row. Using the bit values
listed above, 0x0c (0x04 | 0x08) marks the decimal digits, which are also hex
digits, and 0x08 marks the hex letters A-F and a-f. All other entries are 0. */
370 nigel 77 static const unsigned char digitab[] =
371     {
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
375     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
378     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
379     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
380     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
384     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
404    
405 ph10 392 #else
406 ph10 391
407     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
408    
/* One byte per character code 0-255, eight codes per row; the hex number at
the end of alternate row comments is the code of that row's first entry. Using
the bit values listed above, 0x0c (0x04 | 0x08) marks the decimal digits and
0x08 marks the hex letters a-f and A-F. All other entries are 0. */
409 nigel 77 static const unsigned char digitab[] =
410     {
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
414     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
422 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
423 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
427     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
431     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
435     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
441     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
442     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
443    
/* One flag byte per character code 0-255, eight codes per row. check_escape()
treats a character as alphanumeric when (ebcdic_chartab[c] & 0x0E) is non-zero.
NOTE(review): the individual bit meanings presumably follow the ctype bit
definitions of the main character tables (0x04 decimal digit and 0x08 hex
digit, as listed for digitab) - confirm the remaining bits against
pcre_internal.h. */
444     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
445     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
446     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
447     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
449     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
453     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
454     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
456 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
457 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
458     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
461     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
462     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
463     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
464     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
465     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
466     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
467     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
468     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
469     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
471     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
472     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
473     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
474     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
475     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
476     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
477     #endif
478    
479    
480     /* Definition to allow mutual recursion */
481    
/* Forward declaration only. The function is static, so its definition must
appear later in this file; the parameter meanings are documented there. */
482     static BOOL
483 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
484 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
485 nigel 77
486    
487    
488     /*************************************************
489 ph10 240 * Find an error text *
490     *************************************************/
491    
492 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
493     some of the text is of unknown length, we can't use a table of offsets.
494     Instead, just count through the strings. This is not a performance issue
495 ph10 240 because it happens only when there has been a compilation error.
496    
497     Argument: the error number
498     Returns: pointer to the error string
499     */
500    
501     static const char *
502     find_error_text(int n)
503     {
504     const char *s = error_texts;
505 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
506 ph10 240 return s;
507     }
508    
509    
510     /*************************************************
511 nigel 77 * Handle escapes *
512     *************************************************/
513    
514     /* This function is called when a \ has been encountered. It either returns a
515     positive value for a simple escape such as \n, or a negative value which
516 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
517     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
518     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
519     ptr is pointing at the \. On exit, it is on the final character of the escape
520     sequence.
521 nigel 77
522     Arguments:
523     ptrptr points to the pattern position pointer
524     errorcodeptr points to the errorcode variable
525     bracount number of previous extracting brackets
526     options the options bits
527     isclass TRUE if inside a character class
528    
529     Returns: zero or positive => a data character
530     negative => a special escape sequence
531 ph10 213 on error, errorcodeptr is set
532 nigel 77 */
533    
534     static int
535     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
536     int options, BOOL isclass)
537     {
538 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
539     const uschar *ptr = *ptrptr + 1;
540 nigel 77 int c, i;
541    
542 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
543     ptr--; /* Set pointer back to the last byte */
544    
545 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
546    
547     if (c == 0) *errorcodeptr = ERR1;
548    
549 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
550     in a table. A non-zero result is something that can be returned immediately.
551 nigel 77 Otherwise further processing may be required. */
552    
553 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
554     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
555     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
556 nigel 77
557 ph10 97 #else /* EBCDIC coding */
558 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
559 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
560     #endif
561    
562     /* Escapes that need further processing, or are illegal. */
563    
564     else
565     {
566     const uschar *oldptr;
567 nigel 93 BOOL braced, negated;
568    
569 nigel 77 switch (c)
570     {
571     /* A number of Perl escapes are not handled by PCRE. We give an explicit
572     error. */
573    
574 ph10 391 case CHAR_l:
575     case CHAR_L:
576     case CHAR_N:
577     case CHAR_u:
578     case CHAR_U:
579 nigel 77 *errorcodeptr = ERR37;
580     break;
581    
582 ph10 333 /* \g must be followed by one of a number of specific things:
583 ph10 345
584 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
585     backreference. If negative, it is a relative backreference. This is a Perl
586     5.10 feature.
587 ph10 345
588 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
589     is part of Perl's movement towards a unified syntax for back references. As
590     this is synonymous with \k{name}, we fudge it up by pretending it really
591     was \k.
592 ph10 345
593     (3) For Oniguruma compatibility we also support \g followed by a name or a
594     number either in angle brackets or in single quotes. However, these are
595     (possibly recursive) subroutine calls, _not_ backreferences. Just return
596 ph10 333 the -ESC_g code (cf \k). */
597 nigel 93
598 ph10 391 case CHAR_g:
599     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
600 ph10 333 {
601     c = -ESC_g;
602 ph10 345 break;
603     }
604 ph10 333
605     /* Handle the Perl-compatible cases */
606 ph10 345
607 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
608 nigel 93 {
609 ph10 171 const uschar *p;
610 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
611     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
612     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
613 ph10 171 {
614     c = -ESC_k;
615     break;
616 ph10 172 }
617 nigel 93 braced = TRUE;
618     ptr++;
619     }
620     else braced = FALSE;
621    
622 ph10 391 if (ptr[1] == CHAR_MINUS)
623 nigel 93 {
624     negated = TRUE;
625     ptr++;
626     }
627     else negated = FALSE;
628    
629     c = 0;
630     while ((digitab[ptr[1]] & ctype_digit) != 0)
631 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
632 ph10 220
633 ph10 333 if (c < 0) /* Integer overflow */
634 ph10 213 {
635     *errorcodeptr = ERR61;
636     break;
637 ph10 220 }
638 ph10 345
639 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
640 nigel 93 {
641     *errorcodeptr = ERR57;
642 ph10 213 break;
643 nigel 93 }
644 ph10 345
645 ph10 333 if (c == 0)
646     {
647     *errorcodeptr = ERR58;
648     break;
649 ph10 345 }
650 nigel 93
651     if (negated)
652     {
653     if (c > bracount)
654     {
655     *errorcodeptr = ERR15;
656 ph10 213 break;
657 nigel 93 }
658     c = bracount - (c - 1);
659     }
660    
661     c = -(ESC_REF + c);
662     break;
663    
664 nigel 77 /* The handling of escape sequences consisting of a string of digits
665     starting with one that is not zero is not straightforward. By experiment,
666     the way Perl works seems to be as follows:
667    
668     Outside a character class, the digits are read as a decimal number. If the
669     number is less than 10, or if there are that many previous extracting
670     left brackets, then it is a back reference. Otherwise, up to three octal
671     digits are read to form an escaped byte. Thus \123 is likely to be octal
672     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
673     value is greater than 377, the least significant 8 bits are taken. Inside a
674     character class, \ followed by a digit is always an octal number. */
675    
676 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
677     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
678 nigel 77
679     if (!isclass)
680     {
681     oldptr = ptr;
682 ph10 391 c -= CHAR_0;
683 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
684 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
685 ph10 333 if (c < 0) /* Integer overflow */
686 ph10 213 {
687     *errorcodeptr = ERR61;
688 ph10 220 break;
689     }
690 nigel 77 if (c < 10 || c <= bracount)
691     {
692     c = -(ESC_REF + c);
693     break;
694     }
695     ptr = oldptr; /* Put the pointer back and fall through */
696     }
697    
698     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
699     generates a binary zero byte and treats the digit as a following literal.
700     Thus we have to pull back the pointer by one. */
701    
702 ph10 391 if ((c = *ptr) >= CHAR_8)
703 nigel 77 {
704     ptr--;
705     c = 0;
706     break;
707     }
708    
709     /* \0 always starts an octal number, but we may drop through to here with a
710 nigel 91 larger first octal digit. The original code used just to take the least
711     significant 8 bits of octal numbers (I think this is what early Perls used
712     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
713     than 3 octal digits. */
714 nigel 77
715 ph10 391 case CHAR_0:
716     c -= CHAR_0;
717     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
718     c = c * 8 + *(++ptr) - CHAR_0;
719 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
720 nigel 77 break;
721    
722 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
723     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
724     treated as a data character. */
725 nigel 77
726 ph10 391 case CHAR_x:
727     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
728 nigel 77 {
729     const uschar *pt = ptr + 2;
730 nigel 87 int count = 0;
731    
732 nigel 77 c = 0;
733     while ((digitab[*pt] & ctype_xdigit) != 0)
734     {
735 nigel 87 register int cc = *pt++;
736 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
737 nigel 77 count++;
738 nigel 87
739 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
740     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
741     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
742 ph10 97 #else /* EBCDIC coding */
743 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
744     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
745 nigel 77 #endif
746     }
747 nigel 87
748 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
749 nigel 77 {
750 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
751 nigel 77 ptr = pt;
752     break;
753     }
754 nigel 87
755 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
756     recognize this construct; fall through to the normal \x handling. */
757     }
758    
759 nigel 87 /* Read just a single-byte hex-defined char */
760 nigel 77
761     c = 0;
762     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
763     {
764 ph10 391 int cc; /* Some compilers don't like */
765     cc = *(++ptr); /* ++ in initializers */
766     #ifndef EBCDIC /* ASCII/UTF-8 coding */
767     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
768     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
769 ph10 97 #else /* EBCDIC coding */
770 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
771     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
772 nigel 77 #endif
773     }
774     break;
775    
776 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
777     This coding is ASCII-specific, but then the whole concept of \cx is
778     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
779 nigel 77
780 ph10 391 case CHAR_c:
781 nigel 77 c = *(++ptr);
782     if (c == 0)
783     {
784     *errorcodeptr = ERR2;
785 ph10 213 break;
786 nigel 77 }
787    
788 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
789     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
790 nigel 77 c ^= 0x40;
791 ph10 97 #else /* EBCDIC coding */
792 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
793 nigel 77 c ^= 0xC0;
794     #endif
795     break;
796    
797     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
798 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
799     otherwise, for Perl compatibility, it is a literal. This code looks a bit
800     odd, but there used to be some cases other than the default, and there may
801     be again in future, so I haven't "optimized" it. */
802 nigel 77
803     default:
804     if ((options & PCRE_EXTRA) != 0) switch(c)
805     {
806     default:
807     *errorcodeptr = ERR3;
808     break;
809     }
810     break;
811     }
812     }
813    
814     *ptrptr = ptr;
815     return c;
816     }
817    
818    
819    
820     #ifdef SUPPORT_UCP
821     /*************************************************
822     * Handle \P and \p *
823     *************************************************/
824    
825     /* This function is called after \P or \p has been encountered, provided that
826     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
827     pointing at the P or p. On exit, it is pointing at the final character of the
828     escape sequence.
829    
830     Argument:
831     ptrptr points to the pattern position pointer
832     negptr points to a boolean that is set TRUE for negation else FALSE
833 nigel 87 dptr points to an int that is set to the detailed property value
834 nigel 77 errorcodeptr points to the error code variable
835    
836 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
837 nigel 77 */
838    
839     static int
840 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
841 nigel 77 {
842     int c, i, bot, top;
843     const uschar *ptr = *ptrptr;
844 nigel 87 char name[32];
845 nigel 77
846     c = *(++ptr);
847     if (c == 0) goto ERROR_RETURN;
848    
849     *negptr = FALSE;
850    
851 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
852     negation. */
853 nigel 77
854 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
855 nigel 77 {
856 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
857 nigel 77 {
858     *negptr = TRUE;
859     ptr++;
860     }
861 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
862 nigel 77 {
863     c = *(++ptr);
864     if (c == 0) goto ERROR_RETURN;
865 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
866 nigel 77 name[i] = c;
867     }
868 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
869 nigel 77 name[i] = 0;
870     }
871    
872     /* Otherwise there is just one following character */
873    
874     else
875     {
876     name[0] = c;
877     name[1] = 0;
878     }
879    
880     *ptrptr = ptr;
881    
882     /* Search for a recognized property name using binary chop */
883    
884     bot = 0;
885     top = _pcre_utt_size;
886    
887     while (bot < top)
888     {
889 nigel 87 i = (bot + top) >> 1;
890 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
891 nigel 87 if (c == 0)
892     {
893     *dptr = _pcre_utt[i].value;
894     return _pcre_utt[i].type;
895     }
896 nigel 77 if (c > 0) bot = i + 1; else top = i;
897     }
898    
899     *errorcodeptr = ERR47;
900     *ptrptr = ptr;
901     return -1;
902    
903     ERROR_RETURN:
904     *errorcodeptr = ERR46;
905     *ptrptr = ptr;
906     return -1;
907     }
908     #endif
909    
910    
911    
912    
913     /*************************************************
914     * Check for counted repeat *
915     *************************************************/
916    
917     /* This function is called when a '{' is encountered in a place where it might
918     start a quantifier. It looks ahead to see if it really is a quantifier or not.
919     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
920     where the ddds are digits.
921    
922     Arguments:
923     p pointer to the first char after '{'
924    
925     Returns: TRUE or FALSE
926     */
927    
928     static BOOL
929     is_counted_repeat(const uschar *p)
930     {
931     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
932     while ((digitab[*p] & ctype_digit) != 0) p++;
933 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
934 nigel 77
935 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
936     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
937 nigel 77
938     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
939     while ((digitab[*p] & ctype_digit) != 0) p++;
940    
941 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
942 nigel 77 }
943    
944    
945    
946     /*************************************************
947     * Read repeat counts *
948     *************************************************/
949    
950     /* Read an item of the form {n,m} and return the values. This is called only
951     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
952     so the syntax is guaranteed to be correct, but we need to check the values.
953    
954     Arguments:
955     p pointer to first char after '{'
956     minp pointer to int for min
957     maxp pointer to int for max
958     returned as -1 if no max
959     errorcodeptr points to error code variable
960    
961     Returns: pointer to '}' on success;
962     current ptr on error, with errorcodeptr set non-zero
963     */
964    
965     static const uschar *
966     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
967     {
968     int min = 0;
969     int max = -1;
970    
971 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
972     an integer overflow. */
973    
974 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
975 nigel 81 if (min < 0 || min > 65535)
976     {
977     *errorcodeptr = ERR5;
978     return p;
979     }
980 nigel 77
981 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
982     Also, max must not be less than min. */
983    
984 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
985 nigel 77 {
986 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
987 nigel 77 {
988     max = 0;
989 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
990 nigel 81 if (max < 0 || max > 65535)
991     {
992     *errorcodeptr = ERR5;
993     return p;
994     }
995 nigel 77 if (max < min)
996     {
997     *errorcodeptr = ERR4;
998     return p;
999     }
1000     }
1001     }
1002    
1003 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1004     '}'. */
1005 nigel 77
1006 nigel 81 *minp = min;
1007     *maxp = max;
1008 nigel 77 return p;
1009     }
1010    
1011    
1012    
1013     /*************************************************
1014 ph10 408 * Subroutine for finding forward reference *
1015 nigel 91 *************************************************/
1016    
1017 ph10 408 /* This recursive function is called only from find_parens() below. The
1018     top-level call starts at the beginning of the pattern. All other calls must
1019     start at a parenthesis. It scans along a pattern's text looking for capturing
1020 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1021     name it is given, it returns its number. Alternatively, if the name is NULL, it
1022 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1023     encountered, the name will be terminated by '>' because that is checked in the
1024 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1025 ph10 408 capturing group numbers - the (?| feature.
1026 nigel 91
1027     Arguments:
1028 ph10 408 ptrptr address of the current character pointer (updated)
1029 ph10 345 cd compile background data
1030 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1031     lorn name length, or subpattern number if name is NULL
1032     xmode TRUE if we are in /x mode
1033 ph10 411 count pointer to the current capturing subpattern number (updated)
1034 nigel 91
1035     Returns: the number of the named subpattern, or -1 if not found
1036     */
1037    
1038     static int
1039 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1040     BOOL xmode, int *count)
1041 nigel 91 {
1042 ph10 408 uschar *ptr = *ptrptr;
1043     int start_count = *count;
1044     int hwm_count = start_count;
1045     BOOL dup_parens = FALSE;
1046 nigel 93
1047 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1048 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1049    
1050     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1051     {
1052     if (ptr[1] == CHAR_QUESTION_MARK &&
1053 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1054 ph10 408 {
1055     ptr += 3;
1056 ph10 411 dup_parens = TRUE;
1057     }
1058 ph10 408
1059     /* Handle a normal, unnamed capturing parenthesis */
1060 ph10 411
1061 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1062     {
1063     *count += 1;
1064     if (name == NULL && *count == lorn) return *count;
1065 ph10 411 ptr++;
1066 ph10 408 }
1067    
1068     /* Handle a condition. If it is an assertion, just carry on so that it
1069     is processed as normal. If not, skip to the closing parenthesis of the
1070 ph10 411 condition (there can't be any nested parens. */
1071    
1072 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1073     {
1074 ph10 411 ptr += 2;
1075 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1076     {
1077     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1078 ph10 411 if (*ptr != 0) ptr++;
1079 ph10 408 }
1080 ph10 411 }
1081    
1082 ph10 408 /* We have either (? or (* and not a condition */
1083    
1084     else
1085 ph10 411 {
1086 ph10 408 ptr += 2;
1087     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1088    
1089     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1090 ph10 411
1091 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1092     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1093     {
1094     int term;
1095     const uschar *thisname;
1096     *count += 1;
1097     if (name == NULL && *count == lorn) return *count;
1098     term = *ptr++;
1099     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1100     thisname = ptr;
1101     while (*ptr != term) ptr++;
1102     if (name != NULL && lorn == ptr - thisname &&
1103     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1104     return *count;
1105 ph10 438 term++;
1106 ph10 411 }
1107 ph10 408 }
1108 ph10 411 }
1109 ph10 408
1110 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1111 ph10 408 bars. */
1112    
1113 nigel 91 for (; *ptr != 0; ptr++)
1114     {
1115 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1116    
1117 ph10 391 if (*ptr == CHAR_BACKSLASH)
1118 nigel 93 {
1119 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1120 ph10 391 if (*ptr == CHAR_Q) for (;;)
1121 nigel 93 {
1122 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1123 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1124 ph10 391 if (*(++ptr) == CHAR_E) break;
1125 nigel 93 }
1126     continue;
1127     }
1128    
1129 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1130     are handled for real. If the first character is '^', skip it. Also, if the
1131     first few characters (either before or after ^) are \Q\E or \E we skip them
1132 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1133 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1134 nigel 93
1135 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1136 nigel 93 {
1137 ph10 340 BOOL negate_class = FALSE;
1138     for (;;)
1139     {
1140 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1141 ph10 340 {
1142 ph10 438 if (ptr[2] == CHAR_E)
1143     ptr+= 2;
1144     else if (strncmp((const char *)ptr+2,
1145 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1146 ph10 438 ptr += 4;
1147 ph10 392 else
1148 ph10 391 break;
1149 ph10 340 }
1150 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1151     {
1152 ph10 340 negate_class = TRUE;
1153 ph10 438 ptr++;
1154     }
1155 ph10 340 else break;
1156     }
1157    
1158     /* If the next character is ']', it is a data character that must be
1159 ph10 341 skipped, except in JavaScript compatibility mode. */
1160 ph10 345
1161 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1162 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1163 ph10 345 ptr++;
1164    
1165 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1166 nigel 93 {
1167 ph10 220 if (*ptr == 0) return -1;
1168 ph10 391 if (*ptr == CHAR_BACKSLASH)
1169 nigel 93 {
1170 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1171 ph10 391 if (*ptr == CHAR_Q) for (;;)
1172 nigel 93 {
1173 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1174 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1175 ph10 391 if (*(++ptr) == CHAR_E) break;
1176 nigel 93 }
1177     continue;
1178     }
1179     }
1180     continue;
1181     }
1182    
1183     /* Skip comments in /x mode */
1184    
1185 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1186 nigel 93 {
1187 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1188 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1189 nigel 93 continue;
1190     }
1191    
1192 ph10 408 /* Check for the special metacharacters */
1193 ph10 411
1194 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1195 nigel 93 {
1196 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1197     if (rc > 0) return rc;
1198     if (*ptr == 0) goto FAIL_EXIT;
1199 nigel 93 }
1200 ph10 411
1201 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1202     {
1203     if (dup_parens && *count < hwm_count) *count = hwm_count;
1204     *ptrptr = ptr;
1205     return -1;
1206     }
1207 ph10 411
1208     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1209 ph10 408 {
1210     if (*count > hwm_count) hwm_count = *count;
1211     *count = start_count;
1212 ph10 411 }
1213 ph10 408 }
1214 nigel 93
1215 ph10 408 FAIL_EXIT:
1216     *ptrptr = ptr;
1217     return -1;
1218     }
1219 nigel 93
1220    
1221    
1222    
1223 ph10 408 /*************************************************
1224     * Find forward referenced subpattern *
1225     *************************************************/
1226 nigel 93
1227 ph10 408 /* This function scans along a pattern's text looking for capturing
1228     subpatterns, and counting them. If it finds a named pattern that matches the
1229     name it is given, it returns its number. Alternatively, if the name is NULL, it
1230     returns when it reaches a given numbered subpattern. This is used for forward
1231     references to subpatterns. We used to be able to start this scan from the
1232     current compiling point, using the current count value from cd->bracount, and
1233     do it all in a single loop, but the addition of the possibility of duplicate
1234     subpattern numbers means that we have to scan from the very start, in order to
1235     take account of such duplicates, and to use a recursive function to keep track
1236     of the different types of group.
1237    
1238     Arguments:
1239     cd compile background data
1240     name name to seek, or NULL if seeking a numbered subpattern
1241     lorn name length, or subpattern number if name is NULL
1242     xmode TRUE if we are in /x mode
1243    
1244     Returns: the number of the found subpattern, or -1 if not found
1245     */
1246    
1247     static int
1248     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1249     {
1250     uschar *ptr = (uschar *)cd->start_pattern;
1251     int count = 0;
1252     int rc;
1253    
1254     /* If the pattern does not start with an opening parenthesis, the first call
1255     to find_parens_sub() will scan right to the end (if necessary). However, if it
1256     does start with a parenthesis, find_parens_sub() will return when it hits the
1257     matching closing parens. That is why we have to have a loop. */
1258    
1259 ph10 411 for (;;)
1260     {
1261 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1262 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1263     }
1264    
1265 ph10 408 return rc;
1266 nigel 91 }
1267    
1268    
1269    
1270 ph10 408
1271 nigel 91 /*************************************************
1272 nigel 77 * Find first significant op code *
1273     *************************************************/
1274    
1275     /* This is called by several functions that scan a compiled expression looking
1276     for a fixed first character, or an anchoring op code etc. It skips over things
1277     that do not influence this. For some calls, a change of option is important.
1278     For some calls, it makes sense to skip negative forward and all backward
1279     assertions, and also the \b assertion; for others it does not.
1280    
1281     Arguments:
1282     code pointer to the start of the group
1283     options pointer to external options
1284     optbit the option bit whose changing is significant, or
1285     zero if none are
1286     skipassert TRUE if certain assertions are to be skipped
1287    
1288     Returns: pointer to the first significant opcode
1289     */
1290    
1291     static const uschar*
1292     first_significant_code(const uschar *code, int *options, int optbit,
1293     BOOL skipassert)
1294     {
1295     for (;;)
1296     {
1297     switch ((int)*code)
1298     {
1299     case OP_OPT:
1300     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1301     *options = (int)code[1];
1302     code += 2;
1303     break;
1304    
1305     case OP_ASSERT_NOT:
1306     case OP_ASSERTBACK:
1307     case OP_ASSERTBACK_NOT:
1308     if (!skipassert) return code;
1309     do code += GET(code, 1); while (*code == OP_ALT);
1310     code += _pcre_OP_lengths[*code];
1311     break;
1312    
1313     case OP_WORD_BOUNDARY:
1314     case OP_NOT_WORD_BOUNDARY:
1315     if (!skipassert) return code;
1316     /* Fall through */
1317    
1318     case OP_CALLOUT:
1319     case OP_CREF:
1320 ph10 459 case OP_NCREF:
1321 nigel 93 case OP_RREF:
1322 ph10 459 case OP_NRREF:
1323 nigel 93 case OP_DEF:
1324 nigel 77 code += _pcre_OP_lengths[*code];
1325     break;
1326    
1327     default:
1328     return code;
1329     }
1330     }
1331     /* Control never reaches here */
1332     }
1333    
1334    
1335    
1336    
1337     /*************************************************
1338 ph10 454 * Find the fixed length of a branch *
1339 nigel 77 *************************************************/
1340    
1341 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1342 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1343 ph10 454 In UTF8 mode, the result is in characters rather than bytes. The branch is
1344     temporarily terminated with OP_END when this function is called.
1345 nigel 77
1346 ph10 454 This function is called when a backward assertion is encountered, so that if it
1347     fails, the error message can point to the correct place in the pattern.
1348     However, we cannot do this when the assertion contains subroutine calls,
1349     because they can be forward references. We solve this by remembering this case
1350     and doing the check at the end; a flag specifies which mode we are running in.
1351    
1352 nigel 77 Arguments:
1353     code points to the start of the pattern (the bracket)
1354     options the compiling options
1355 ph10 454 atend TRUE if called when the pattern is complete
1356     cd the "compile data" structure
1357 nigel 77
1358 ph10 454 Returns: the fixed length,
1359     or -1 if there is no fixed length,
1360 nigel 77 or -2 if \C was encountered
1361 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1362 nigel 77 */
1363    
1364     static int
1365 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1366 nigel 77 {
1367     int length = -1;
1368    
1369     register int branchlength = 0;
1370     register uschar *cc = code + 1 + LINK_SIZE;
1371    
1372     /* Scan along the opcodes for this branch. If we get to the end of the
1373     branch, check the length against that of the other branches. */
1374    
1375     for (;;)
1376     {
1377     int d;
1378 ph10 454 uschar *ce, *cs;
1379 nigel 77 register int op = *cc;
1380     switch (op)
1381     {
1382 nigel 93 case OP_CBRA:
1383 nigel 77 case OP_BRA:
1384     case OP_ONCE:
1385     case OP_COND:
1386 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1387 nigel 77 if (d < 0) return d;
1388     branchlength += d;
1389     do cc += GET(cc, 1); while (*cc == OP_ALT);
1390     cc += 1 + LINK_SIZE;
1391     break;
1392    
1393     /* Reached end of a branch; if it's a ket it is the end of a nested
1394     call. If it's ALT it is an alternation in a nested call. If it is
1395     END it's the end of the outer call. All can be handled by the same code. */
1396    
1397     case OP_ALT:
1398     case OP_KET:
1399     case OP_KETRMAX:
1400     case OP_KETRMIN:
1401     case OP_END:
1402     if (length < 0) length = branchlength;
1403     else if (length != branchlength) return -1;
1404     if (*cc != OP_ALT) return length;
1405     cc += 1 + LINK_SIZE;
1406     branchlength = 0;
1407     break;
1408 ph10 454
1409     /* A true recursion implies not fixed length, but a subroutine call may
1410     be OK. If the subroutine is a forward reference, we can't deal with
1411     it until the end of the pattern, so return -3. */
1412    
1413     case OP_RECURSE:
1414     if (!atend) return -3;
1415     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1416     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1417     if (cc > cs && cc < ce) return -1; /* Recursion */
1418     d = find_fixedlength(cs + 2, options, atend, cd);
1419     if (d < 0) return d;
1420     branchlength += d;
1421     cc += 1 + LINK_SIZE;
1422     break;
1423 nigel 77
1424     /* Skip over assertive subpatterns */
1425    
1426     case OP_ASSERT:
1427     case OP_ASSERT_NOT:
1428     case OP_ASSERTBACK:
1429     case OP_ASSERTBACK_NOT:
1430     do cc += GET(cc, 1); while (*cc == OP_ALT);
1431     /* Fall through */
1432    
1433     /* Skip over things that don't match chars */
1434    
1435     case OP_REVERSE:
1436     case OP_CREF:
1437 ph10 459 case OP_NCREF:
1438 nigel 93 case OP_RREF:
1439 ph10 459 case OP_NRREF:
1440 nigel 93 case OP_DEF:
1441 nigel 77 case OP_OPT:
1442     case OP_CALLOUT:
1443     case OP_SOD:
1444     case OP_SOM:
1445     case OP_EOD:
1446     case OP_EODN:
1447     case OP_CIRC:
1448     case OP_DOLL:
1449     case OP_NOT_WORD_BOUNDARY:
1450     case OP_WORD_BOUNDARY:
1451     cc += _pcre_OP_lengths[*cc];
1452     break;
1453    
1454     /* Handle literal characters */
1455    
1456     case OP_CHAR:
1457     case OP_CHARNC:
1458 nigel 91 case OP_NOT:
1459 nigel 77 branchlength++;
1460     cc += 2;
1461     #ifdef SUPPORT_UTF8
1462 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1463     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1464 nigel 77 #endif
1465     break;
1466    
1467     /* Handle exact repetitions. The count is already in characters, but we
1468     need to skip over a multibyte character in UTF8 mode. */
1469    
1470     case OP_EXACT:
1471     branchlength += GET2(cc,1);
1472     cc += 4;
1473     #ifdef SUPPORT_UTF8
1474 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1475     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1476 nigel 77 #endif
1477     break;
1478    
1479     case OP_TYPEEXACT:
1480     branchlength += GET2(cc,1);
1481 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1482 nigel 77 cc += 4;
1483     break;
1484    
1485     /* Handle single-char matchers */
1486    
1487     case OP_PROP:
1488     case OP_NOTPROP:
1489 nigel 87 cc += 2;
1490 nigel 77 /* Fall through */
1491    
1492     case OP_NOT_DIGIT:
1493     case OP_DIGIT:
1494     case OP_NOT_WHITESPACE:
1495     case OP_WHITESPACE:
1496     case OP_NOT_WORDCHAR:
1497     case OP_WORDCHAR:
1498     case OP_ANY:
1499 ph10 342 case OP_ALLANY:
1500 nigel 77 branchlength++;
1501     cc++;
1502     break;
1503    
1504     /* The single-byte matcher isn't allowed */
1505    
1506     case OP_ANYBYTE:
1507     return -2;
1508    
1509     /* Check a class for variable quantification */
1510    
1511     #ifdef SUPPORT_UTF8
1512     case OP_XCLASS:
1513     cc += GET(cc, 1) - 33;
1514     /* Fall through */
1515     #endif
1516    
1517     case OP_CLASS:
1518     case OP_NCLASS:
1519     cc += 33;
1520    
1521     switch (*cc)
1522     {
1523     case OP_CRSTAR:
1524     case OP_CRMINSTAR:
1525     case OP_CRQUERY:
1526     case OP_CRMINQUERY:
1527     return -1;
1528    
1529     case OP_CRRANGE:
1530     case OP_CRMINRANGE:
1531     if (GET2(cc,1) != GET2(cc,3)) return -1;
1532     branchlength += GET2(cc,1);
1533     cc += 5;
1534     break;
1535    
1536     default:
1537     branchlength++;
1538     }
1539     break;
1540    
1541     /* Anything else is variable length */
1542    
1543     default:
1544     return -1;
1545     }
1546     }
1547     /* Control never gets here */
1548     }
1549    
1550    
1551    
1552    
1553     /*************************************************
1554 ph10 454 * Scan compiled regex for specific bracket *
1555 nigel 77 *************************************************/
1556    
1557     /* This little function scans through a compiled pattern until it finds a
1558 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1559 ph10 455 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1560     so that it can be called from pcre_study() when finding the minimum matching
1561     length.
1562 nigel 77
1563     Arguments:
1564     code points to start of expression
1565     utf8 TRUE in UTF-8 mode
1566 ph10 454 number the required bracket number or negative to find a lookbehind
1567 nigel 77
1568     Returns: pointer to the opcode for the bracket, or NULL if not found
1569     */
1570    
1571 ph10 455 const uschar *
1572     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1573 nigel 77 {
1574     for (;;)
1575     {
1576     register int c = *code;
1577     if (c == OP_END) return NULL;
1578 nigel 91
1579     /* XCLASS is used for classes that cannot be represented just by a bit
1580     map. This includes negated single high-valued characters. The length in
1581     the table is zero; the actual length is stored in the compiled code. */
1582    
1583     if (c == OP_XCLASS) code += GET(code, 1);
1584 ph10 454
1585     /* Handle recursion */
1586    
1587     else if (c == OP_REVERSE)
1588     {
1589     if (number < 0) return (uschar *)code;
1590     code += _pcre_OP_lengths[c];
1591     }
1592 nigel 91
1593 nigel 93 /* Handle capturing bracket */
1594 nigel 91
1595 nigel 93 else if (c == OP_CBRA)
1596 nigel 77 {
1597 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1598 nigel 77 if (n == number) return (uschar *)code;
1599 nigel 93 code += _pcre_OP_lengths[c];
1600 nigel 77 }
1601 nigel 91
1602 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1603     repeated character types, we have to test for \p and \P, which have an extra
1604 ph10 218 two bytes of parameters. */
1605 nigel 91
1606 nigel 77 else
1607     {
1608 ph10 218 switch(c)
1609     {
1610     case OP_TYPESTAR:
1611     case OP_TYPEMINSTAR:
1612     case OP_TYPEPLUS:
1613     case OP_TYPEMINPLUS:
1614     case OP_TYPEQUERY:
1615     case OP_TYPEMINQUERY:
1616     case OP_TYPEPOSSTAR:
1617     case OP_TYPEPOSPLUS:
1618     case OP_TYPEPOSQUERY:
1619     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1620 ph10 220 break;
1621 ph10 221
1622     case OP_TYPEUPTO:
1623     case OP_TYPEMINUPTO:
1624     case OP_TYPEEXACT:
1625     case OP_TYPEPOSUPTO:
1626     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1627     break;
1628 ph10 220 }
1629    
1630 ph10 218 /* Add in the fixed length from the table */
1631 ph10 220
1632 nigel 77 code += _pcre_OP_lengths[c];
1633 ph10 220
1634 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1635     a multi-byte character. The length in the table is a minimum, so we have to
1636     arrange to skip the extra bytes. */
1637 ph10 220
1638 ph10 107 #ifdef SUPPORT_UTF8
1639 nigel 77 if (utf8) switch(c)
1640     {
1641     case OP_CHAR:
1642     case OP_CHARNC:
1643     case OP_EXACT:
1644     case OP_UPTO:
1645     case OP_MINUPTO:
1646 nigel 93 case OP_POSUPTO:
1647 nigel 77 case OP_STAR:
1648     case OP_MINSTAR:
1649 nigel 93 case OP_POSSTAR:
1650 nigel 77 case OP_PLUS:
1651     case OP_MINPLUS:
1652 nigel 93 case OP_POSPLUS:
1653 nigel 77 case OP_QUERY:
1654     case OP_MINQUERY:
1655 nigel 93 case OP_POSQUERY:
1656     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1657 nigel 77 break;
1658     }
1659 ph10 369 #else
1660     (void)(utf8); /* Keep compiler happy by referencing function argument */
1661 ph10 111 #endif
1662 nigel 77 }
1663     }
1664     }
1665    
1666    
1667    
1668     /*************************************************
1669     * Scan compiled regex for recursion reference *
1670     *************************************************/
1671    
1672     /* This little function scans through a compiled pattern until it finds an
1673     instance of OP_RECURSE.
1674    
1675     Arguments:
1676     code points to start of expression
1677     utf8 TRUE in UTF-8 mode
1678    
1679     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1680     */
1681    
1682     static const uschar *
1683     find_recurse(const uschar *code, BOOL utf8)
1684     {
1685     for (;;)
1686     {
1687     register int c = *code;
1688     if (c == OP_END) return NULL;
1689 nigel 91 if (c == OP_RECURSE) return code;
1690 ph10 220
1691 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1692     map. This includes negated single high-valued characters. The length in
1693     the table is zero; the actual length is stored in the compiled code. */
1694    
1695     if (c == OP_XCLASS) code += GET(code, 1);
1696    
1697 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1698     repeated character types, we have to test for \p and \P, which have an extra
1699 ph10 218 two bytes of parameters. */
1700 nigel 91
1701 nigel 77 else
1702     {
1703 ph10 218 switch(c)
1704     {
1705     case OP_TYPESTAR:
1706     case OP_TYPEMINSTAR:
1707     case OP_TYPEPLUS:
1708     case OP_TYPEMINPLUS:
1709     case OP_TYPEQUERY:
1710     case OP_TYPEMINQUERY:
1711     case OP_TYPEPOSSTAR:
1712     case OP_TYPEPOSPLUS:
1713     case OP_TYPEPOSQUERY:
1714     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1715 ph10 220 break;
1716 ph10 221
1717     case OP_TYPEPOSUPTO:
1718     case OP_TYPEUPTO:
1719     case OP_TYPEMINUPTO:
1720     case OP_TYPEEXACT:
1721     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1722     break;
1723 ph10 220 }
1724    
1725 ph10 218 /* Add in the fixed length from the table */
1726    
1727 nigel 77 code += _pcre_OP_lengths[c];
1728 ph10 220
1729 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1730     by a multi-byte character. The length in the table is a minimum, so we have
1731     to arrange to skip the extra bytes. */
1732 ph10 220
1733 ph10 107 #ifdef SUPPORT_UTF8
1734 nigel 77 if (utf8) switch(c)
1735     {
1736     case OP_CHAR:
1737     case OP_CHARNC:
1738     case OP_EXACT:
1739     case OP_UPTO:
1740     case OP_MINUPTO:
1741 nigel 93 case OP_POSUPTO:
1742 nigel 77 case OP_STAR:
1743     case OP_MINSTAR:
1744 nigel 93 case OP_POSSTAR:
1745 nigel 77 case OP_PLUS:
1746     case OP_MINPLUS:
1747 nigel 93 case OP_POSPLUS:
1748 nigel 77 case OP_QUERY:
1749     case OP_MINQUERY:
1750 nigel 93 case OP_POSQUERY:
1751     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1752 nigel 77 break;
1753     }
1754 ph10 369 #else
1755     (void)(utf8); /* Keep compiler happy by referencing function argument */
1756 ph10 111 #endif
1757 nigel 77 }
1758     }
1759     }
1760    
1761    
1762    
1763     /*************************************************
1764     * Scan compiled branch for non-emptiness *
1765     *************************************************/
1766    
1767     /* This function scans through a branch of a compiled pattern to see whether it
1768 nigel 93 can match the empty string or not. It is called from could_be_empty()
1769     below and from compile_branch() when checking for an unlimited repeat of a
1770     group that can match nothing. Note that first_significant_code() skips over
1771 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1772     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1773     bracket whose current branch will already have been scanned.
1774 nigel 77
1775     Arguments:
1776     code points to start of search
1777     endcode points to where to stop
1778     utf8 TRUE if in UTF8 mode
1779    
1780     Returns: TRUE if what is matched could be empty
1781     */
1782    
1783     static BOOL
1784     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1785     {
1786     register int c;
1787 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1788 nigel 77 code < endcode;
1789     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1790     {
1791     const uschar *ccode;
1792    
1793     c = *code;
1794 ph10 286
1795     /* Skip over forward assertions; the other assertions are skipped by
1796 ph10 282 first_significant_code() with a TRUE final argument. */
1797 ph10 286
1798 ph10 282 if (c == OP_ASSERT)
1799 ph10 286 {
1800 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1801     c = *code;
1802     continue;
1803 ph10 286 }
1804 ph10 172
1805 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1806 nigel 77
1807 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1808 ph10 170 {
1809 ph10 172 code += _pcre_OP_lengths[c];
1810 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1811     c = *code;
1812     continue;
1813     }
1814    
1815     /* For other groups, scan the branches. */
1816 ph10 172
1817 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1818 nigel 77 {
1819     BOOL empty_branch;
1820     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1821 ph10 406
1822     /* If a conditional group has only one branch, there is a second, implied,
1823 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1824     Otherwise, scan the individual branches of the group. */
1825 ph10 406
1826 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1827 nigel 77 code += GET(code, 1);
1828 ph10 395 else
1829 ph10 406 {
1830 ph10 395 empty_branch = FALSE;
1831     do
1832     {
1833     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1834     empty_branch = TRUE;
1835     code += GET(code, 1);
1836     }
1837     while (*code == OP_ALT);
1838     if (!empty_branch) return FALSE; /* All branches are non-empty */
1839 nigel 77 }
1840 ph10 406
1841 ph10 172 c = *code;
1842 nigel 93 continue;
1843 nigel 77 }
1844    
1845 nigel 93 /* Handle the other opcodes */
1846    
1847     switch (c)
1848 nigel 77 {
1849 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1850     cannot be represented just by a bit map. This includes negated single
1851     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1852 ph10 220 actual length is stored in the compiled code, so we must update "code"
1853 ph10 216 here. */
1854 nigel 77
1855     #ifdef SUPPORT_UTF8
1856     case OP_XCLASS:
1857 ph10 216 ccode = code += GET(code, 1);
1858 nigel 77 goto CHECK_CLASS_REPEAT;
1859     #endif
1860    
1861     case OP_CLASS:
1862     case OP_NCLASS:
1863     ccode = code + 33;
1864    
1865     #ifdef SUPPORT_UTF8
1866     CHECK_CLASS_REPEAT:
1867     #endif
1868    
1869     switch (*ccode)
1870     {
1871     case OP_CRSTAR: /* These could be empty; continue */
1872     case OP_CRMINSTAR:
1873     case OP_CRQUERY:
1874     case OP_CRMINQUERY:
1875     break;
1876    
1877     default: /* Non-repeat => class must match */
1878     case OP_CRPLUS: /* These repeats aren't empty */
1879     case OP_CRMINPLUS:
1880     return FALSE;
1881    
1882     case OP_CRRANGE:
1883     case OP_CRMINRANGE:
1884     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1885     break;
1886     }
1887     break;
1888    
1889     /* Opcodes that must match a character */
1890    
1891     case OP_PROP:
1892     case OP_NOTPROP:
1893     case OP_EXTUNI:
1894     case OP_NOT_DIGIT:
1895     case OP_DIGIT:
1896     case OP_NOT_WHITESPACE:
1897     case OP_WHITESPACE:
1898     case OP_NOT_WORDCHAR:
1899     case OP_WORDCHAR:
1900     case OP_ANY:
1901 ph10 345 case OP_ALLANY:
1902 nigel 77 case OP_ANYBYTE:
1903     case OP_CHAR:
1904     case OP_CHARNC:
1905     case OP_NOT:
1906     case OP_PLUS:
1907     case OP_MINPLUS:
1908 nigel 93 case OP_POSPLUS:
1909 nigel 77 case OP_EXACT:
1910     case OP_NOTPLUS:
1911     case OP_NOTMINPLUS:
1912 nigel 93 case OP_NOTPOSPLUS:
1913 nigel 77 case OP_NOTEXACT:
1914     case OP_TYPEPLUS:
1915     case OP_TYPEMINPLUS:
1916 nigel 93 case OP_TYPEPOSPLUS:
1917 nigel 77 case OP_TYPEEXACT:
1918     return FALSE;
1919 ph10 227
1920     /* These are going to continue, as they may be empty, but we have to
1921     fudge the length for the \p and \P cases. */
1922    
1923 ph10 224 case OP_TYPESTAR:
1924     case OP_TYPEMINSTAR:
1925     case OP_TYPEPOSSTAR:
1926     case OP_TYPEQUERY:
1927     case OP_TYPEMINQUERY:
1928     case OP_TYPEPOSQUERY:
1929     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1930 ph10 227 break;
1931    
1932 ph10 224 /* Same for these */
1933 ph10 227
1934 ph10 224 case OP_TYPEUPTO:
1935     case OP_TYPEMINUPTO:
1936     case OP_TYPEPOSUPTO:
1937     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1938     break;
1939 nigel 77
1940     /* End of branch */
1941    
1942     case OP_KET:
1943     case OP_KETRMAX:
1944     case OP_KETRMIN:
1945     case OP_ALT:
1946     return TRUE;
1947    
1948 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1949     MINUPTO, and POSUPTO may be followed by a multibyte character */
1950 nigel 77
1951     #ifdef SUPPORT_UTF8
1952     case OP_STAR:
1953     case OP_MINSTAR:
1954 nigel 93 case OP_POSSTAR:
1955 nigel 77 case OP_QUERY:
1956     case OP_MINQUERY:
1957 nigel 93 case OP_POSQUERY:
1958 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1959     break;
1960    
1961 nigel 77 case OP_UPTO:
1962     case OP_MINUPTO:
1963 nigel 93 case OP_POSUPTO:
1964 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1965 nigel 77 break;
1966     #endif
1967     }
1968     }
1969    
1970     return TRUE;
1971     }
1972    
1973    
1974    
1975     /*************************************************
1976     * Scan compiled regex for non-emptiness *
1977     *************************************************/
1978    
1979     /* This function is called to check for left recursive calls. We want to check
1980     the current branch of the current pattern to see if it could match the empty
1981     string. If it could, we must look outwards for branches at other levels,
1982     stopping when we pass beyond the bracket which is the subject of the recursion.
1983    
1984     Arguments:
1985     code points to start of the recursion
1986     endcode points to where to stop (current RECURSE item)
1987     bcptr points to the chain of current (unclosed) branch starts
1988     utf8 TRUE if in UTF-8 mode
1989    
1990     Returns: TRUE if what is matched could be empty
1991     */
1992    
1993     static BOOL
1994     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1995     BOOL utf8)
1996     {
1997     while (bcptr != NULL && bcptr->current >= code)
1998     {
1999     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
2000     bcptr = bcptr->outer;
2001     }
2002     return TRUE;
2003     }
2004    
2005    
2006    
2007     /*************************************************
2008     * Check for POSIX class syntax *
2009     *************************************************/
2010    
2011     /* This function is called when the sequence "[:" or "[." or "[=" is
2012 ph10 295 encountered in a character class. It checks whether this is followed by a
2013 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2014 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2015 nigel 77
2016 ph10 298 Originally, this function only recognized a sequence of letters between the
2017     terminators, but it seems that Perl recognizes any sequence of characters,
2018     though of course unknown POSIX names are subsequently rejected. Perl gives an
2019     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2020     didn't consider this to be a POSIX class. Likewise for [:1234:].
2021 ph10 295
2022 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2023     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2024     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2025     below handles the special case of \], but does not try to do any other escape
2026     processing. This makes it different from Perl for cases such as [:l\ower:]
2027 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2028 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2029 ph10 295 I think.
2030    
2031     Arguments:
2032 nigel 77 ptr pointer to the initial [
2033     endptr where to return the end pointer
2034    
2035     Returns: TRUE or FALSE
2036     */
2037    
2038     static BOOL
2039 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2040 nigel 77 {
2041     int terminator; /* Don't combine these lines; the Solaris cc */
2042     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2043 ph10 295 for (++ptr; *ptr != 0; ptr++)
2044 nigel 77 {
2045 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2046 ph10 298 {
2047 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2048     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2049 ph10 295 {
2050     *endptr = ptr;
2051     return TRUE;
2052 ph10 298 }
2053     }
2054     }
2055 nigel 77 return FALSE;
2056     }
2057    
2058    
2059    
2060    
2061     /*************************************************
2062     * Check POSIX class name *
2063     *************************************************/
2064    
2065     /* This function is called to check the name given in a POSIX-style class entry
2066     such as [:alnum:].
2067    
2068     Arguments:
2069     ptr points to the first letter
2070     len the length of the name
2071    
2072     Returns: a value representing the name, or -1 if unknown
2073     */
2074    
2075     static int
2076     check_posix_name(const uschar *ptr, int len)
2077     {
2078 ph10 240 const char *pn = posix_names;
2079 nigel 77 register int yield = 0;
2080     while (posix_name_lengths[yield] != 0)
2081     {
2082     if (len == posix_name_lengths[yield] &&
2083 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2084 ph10 243 pn += posix_name_lengths[yield] + 1;
2085 nigel 77 yield++;
2086     }
2087     return -1;
2088     }
2089    
2090    
2091     /*************************************************
2092     * Adjust OP_RECURSE items in repeated group *
2093     *************************************************/
2094    
2095     /* OP_RECURSE items contain an offset from the start of the regex to the group
2096     that is referenced. This means that groups can be replicated for fixed
2097     repetition simply by copying (because the recursion is allowed to refer to
2098     earlier groups that are outside the current group). However, when a group is
2099 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2100     inserted before it, after it has been compiled. This means that any OP_RECURSE
2101     items within it that refer to the group itself or any contained groups have to
2102     have their offsets adjusted. That one of the jobs of this function. Before it
2103     is called, the partially compiled regex must be temporarily terminated with
2104     OP_END.
2105 nigel 77
2106 nigel 93 This function has been extended with the possibility of forward references for
2107     recursions and subroutine calls. It must also check the list of such references
2108     for the group we are dealing with. If it finds that one of the recursions in
2109     the current group is on this list, it adjusts the offset in the list, not the
2110     value in the reference (which is a group number).
2111    
2112 nigel 77 Arguments:
2113     group points to the start of the group
2114     adjust the amount by which the group is to be moved
2115     utf8 TRUE in UTF-8 mode
2116     cd contains pointers to tables etc.
2117 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2118 nigel 77
2119     Returns: nothing
2120     */
2121    
2122     static void
2123 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2124     uschar *save_hwm)
2125 nigel 77 {
2126     uschar *ptr = group;
2127 ph10 224
2128 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2129     {
2130 nigel 93 int offset;
2131     uschar *hc;
2132    
2133     /* See if this recursion is on the forward reference list. If so, adjust the
2134     reference. */
2135 ph10 345
2136 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2137     {
2138     offset = GET(hc, 0);
2139     if (cd->start_code + offset == ptr + 1)
2140     {
2141     PUT(hc, 0, offset + adjust);
2142     break;
2143     }
2144     }
2145    
2146     /* Otherwise, adjust the recursion offset if it's after the start of this
2147     group. */
2148    
2149     if (hc >= cd->hwm)
2150     {
2151     offset = GET(ptr, 1);
2152     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2153     }
2154    
2155 nigel 77 ptr += 1 + LINK_SIZE;
2156     }
2157     }
2158    
2159    
2160    
2161     /*************************************************
2162     * Insert an automatic callout point *
2163     *************************************************/
2164    
2165     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2166     callout points before each pattern item.
2167    
2168     Arguments:
2169     code current code pointer
2170     ptr current pattern pointer
2171     cd pointers to tables etc
2172    
2173     Returns: new code pointer
2174     */
2175    
2176     static uschar *
2177     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2178     {
2179     *code++ = OP_CALLOUT;
2180     *code++ = 255;
2181     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2182     PUT(code, LINK_SIZE, 0); /* Default length */
2183     return code + 2*LINK_SIZE;
2184     }
2185    
2186    
2187    
2188     /*************************************************
2189     * Complete a callout item *
2190     *************************************************/
2191    
2192     /* A callout item contains the length of the next item in the pattern, which
2193     we can't fill in till after we have reached the relevant point. This is used
2194     for both automatic and manual callouts.
2195    
2196     Arguments:
2197     previous_callout points to previous callout item
2198     ptr current pattern pointer
2199     cd pointers to tables etc
2200    
2201     Returns: nothing
2202     */
2203    
2204     static void
2205     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2206     {
2207     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2208     PUT(previous_callout, 2 + LINK_SIZE, length);
2209     }
2210    
2211    
2212    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
find the next run of characters within it whose "other case" values are
themselves consecutive. Each call yields one such run and moves the start
value on, so that repeated calls walk through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int othercase = 0;
unsigned int expected;

/* Pass over characters that are their own other case */

while (c <= d && (othercase = UCD_OTHERCASE(c)) == c) c++;

if (c > d) return FALSE;

*ocptr = othercase;
expected = othercase + 1;

/* Extend the run for as long as the other-case values stay consecutive */

c++;
while (c <= d && UCD_OTHERCASE(c) == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2258    
2259    
2260 nigel 93
2261 nigel 77 /*************************************************
2262 nigel 93 * Check if auto-possessifying is possible *
2263     *************************************************/
2264    
2265     /* This function is called for unlimited repeats of certain items, to see
2266     whether the next thing could possibly match the repeated item. If not, it makes
2267     sense to automatically possessify the repeated item.
2268    
2269     Arguments:
2270     op_code the repeated op code
2271     this data for this item, depends on the opcode
2272     utf8 TRUE in UTF-8 mode
2273     utf8_char used for utf8 character bytes, NULL if not relevant
2274     ptr next character in pattern
2275     options options bits
2276     cd contains pointers to tables etc.
2277    
2278     Returns: TRUE if possessifying is wanted
2279     */
2280    
2281     static BOOL
2282     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2283     const uschar *ptr, int options, compile_data *cd)
2284     {
2285     int next;
2286    
2287     /* Skip whitespace and comments in extended mode */
2288    
2289     if ((options & PCRE_EXTENDED) != 0)
2290     {
2291     for (;;)
2292     {
2293     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2294 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2295 nigel 93 {
2296     while (*(++ptr) != 0)
2297     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2298     }
2299     else break;
2300     }
2301     }
2302    
2303     /* If the next item is one that we can handle, get its value. A non-negative
2304     value is a character, a negative value is an escape value. */
2305    
2306 ph10 391 if (*ptr == CHAR_BACKSLASH)
2307 nigel 93 {
2308     int temperrorcode = 0;
2309     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2310     if (temperrorcode != 0) return FALSE;
2311     ptr++; /* Point after the escape sequence */
2312     }
2313    
2314     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2315     {
2316     #ifdef SUPPORT_UTF8
2317     if (utf8) { GETCHARINC(next, ptr); } else
2318     #endif
2319     next = *ptr++;
2320     }
2321    
2322     else return FALSE;
2323    
2324     /* Skip whitespace and comments in extended mode */
2325    
2326     if ((options & PCRE_EXTENDED) != 0)
2327     {
2328     for (;;)
2329     {
2330     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2331 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2332 nigel 93 {
2333     while (*(++ptr) != 0)
2334     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2335     }
2336     else break;
2337     }
2338     }
2339    
2340     /* If the next thing is itself optional, we have to give up. */
2341    
2342 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2343 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2344     return FALSE;
2345 nigel 93
2346     /* Now compare the next item with the previous opcode. If the previous is a
2347     positive single character match, "item" either contains the character or, if
2348     "item" is greater than 127 in utf8 mode, the character's bytes are in
2349     utf8_char. */
2350    
2351    
2352     /* Handle cases when the next item is a character. */
2353    
2354     if (next >= 0) switch(op_code)
2355     {
2356     case OP_CHAR:
2357     #ifdef SUPPORT_UTF8
2358     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2359 ph10 369 #else
2360     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2361 nigel 93 #endif
2362     return item != next;
2363    
2364     /* For CHARNC (caseless character) we must check the other case. If we have
2365     Unicode property support, we can use it to test the other case of
2366     high-valued characters. */
2367    
2368     case OP_CHARNC:
2369     #ifdef SUPPORT_UTF8
2370     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2371     #endif
2372     if (item == next) return FALSE;
2373     #ifdef SUPPORT_UTF8
2374     if (utf8)
2375     {
2376     unsigned int othercase;
2377     if (next < 128) othercase = cd->fcc[next]; else
2378     #ifdef SUPPORT_UCP
2379 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2380 nigel 93 #else
2381     othercase = NOTACHAR;
2382     #endif
2383     return (unsigned int)item != othercase;
2384     }
2385     else
2386     #endif /* SUPPORT_UTF8 */
2387     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2388    
2389     /* For OP_NOT, "item" must be a single-byte character. */
2390    
2391     case OP_NOT:
2392     if (item == next) return TRUE;
2393     if ((options & PCRE_CASELESS) == 0) return FALSE;
2394     #ifdef SUPPORT_UTF8
2395     if (utf8)
2396     {
2397     unsigned int othercase;
2398     if (next < 128) othercase = cd->fcc[next]; else
2399     #ifdef SUPPORT_UCP
2400 ph10 349 othercase = UCD_OTHERCASE(next);
2401 nigel 93 #else
2402     othercase = NOTACHAR;
2403     #endif
2404     return (unsigned int)item == othercase;
2405     }
2406     else
2407     #endif /* SUPPORT_UTF8 */
2408     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2409    
2410     case OP_DIGIT:
2411     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2412    
2413     case OP_NOT_DIGIT:
2414     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2415    
2416     case OP_WHITESPACE:
2417     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2418    
2419     case OP_NOT_WHITESPACE:
2420     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2421    
2422     case OP_WORDCHAR:
2423     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2424    
2425     case OP_NOT_WORDCHAR:
2426     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2427    
2428 ph10 180 case OP_HSPACE:
2429     case OP_NOT_HSPACE:
2430     switch(next)
2431     {
2432     case 0x09:
2433     case 0x20:
2434     case 0xa0:
2435     case 0x1680:
2436     case 0x180e:
2437     case 0x2000:
2438     case 0x2001:
2439     case 0x2002:
2440     case 0x2003:
2441     case 0x2004:
2442     case 0x2005:
2443     case 0x2006:
2444     case 0x2007:
2445     case 0x2008:
2446     case 0x2009:
2447     case 0x200A:
2448     case 0x202f:
2449     case 0x205f:
2450     case 0x3000:
2451     return op_code != OP_HSPACE;
2452     default:
2453     return op_code == OP_HSPACE;
2454     }
2455    
2456     case OP_VSPACE:
2457     case OP_NOT_VSPACE:
2458     switch(next)
2459     {
2460     case 0x0a:
2461     case 0x0b:
2462     case 0x0c:
2463     case 0x0d:
2464     case 0x85:
2465     case 0x2028:
2466     case 0x2029:
2467     return op_code != OP_VSPACE;
2468     default:
2469     return op_code == OP_VSPACE;
2470     }
2471    
2472 nigel 93 default:
2473     return FALSE;
2474     }
2475    
2476    
2477     /* Handle the case when the next item is \d, \s, etc. */
2478    
2479     switch(op_code)
2480     {
2481     case OP_CHAR:
2482     case OP_CHARNC:
2483     #ifdef SUPPORT_UTF8
2484     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2485     #endif
2486     switch(-next)
2487     {
2488     case ESC_d:
2489     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2490    
2491     case ESC_D:
2492     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2493    
2494     case ESC_s:
2495     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2496    
2497     case ESC_S:
2498     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2499    
2500     case ESC_w:
2501     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2502    
2503     case ESC_W:
2504     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2505 ph10 182
2506 ph10 180 case ESC_h:
2507     case ESC_H:
2508     switch(item)
2509     {
2510     case 0x09:
2511     case 0x20:
2512     case 0xa0:
2513     case 0x1680:
2514     case 0x180e:
2515     case 0x2000:
2516     case 0x2001:
2517     case 0x2002:
2518     case 0x2003:
2519     case 0x2004:
2520     case 0x2005:
2521     case 0x2006:
2522     case 0x2007:
2523     case 0x2008:
2524     case 0x2009:
2525     case 0x200A:
2526     case 0x202f:
2527     case 0x205f:
2528     case 0x3000:
2529     return -next != ESC_h;
2530     default:
2531     return -next == ESC_h;
2532 ph10 182 }
2533    
2534 ph10 180 case ESC_v:
2535     case ESC_V:
2536     switch(item)
2537     {
2538     case 0x0a:
2539     case 0x0b:
2540     case 0x0c:
2541     case 0x0d:
2542     case 0x85:
2543     case 0x2028:
2544     case 0x2029:
2545     return -next != ESC_v;
2546     default:
2547     return -next == ESC_v;
2548 ph10 182 }
2549 nigel 93
2550     default:
2551     return FALSE;
2552     }
2553    
2554     case OP_DIGIT:
2555 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2556     next == -ESC_h || next == -ESC_v;
2557 nigel 93
2558     case OP_NOT_DIGIT:
2559     return next == -ESC_d;
2560    
2561     case OP_WHITESPACE:
2562     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2563    
2564     case OP_NOT_WHITESPACE:
2565 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2566 nigel 93
2567 ph10 180 case OP_HSPACE:
2568     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2569    
2570     case OP_NOT_HSPACE:
2571     return next == -ESC_h;
2572 ph10 182
2573 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2574 ph10 182 case OP_VSPACE:
2575 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2576    
2577     case OP_NOT_VSPACE:
2578 ph10 182 return next == -ESC_v;
2579 ph10 180
2580 nigel 93 case OP_WORDCHAR:
2581 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2582 nigel 93
2583     case OP_NOT_WORDCHAR:
2584     return next == -ESC_w || next == -ESC_d;
2585 ph10 182
2586 nigel 93 default:
2587     return FALSE;
2588     }
2589    
2590     /* Control does not reach here */
2591     }
2592    
2593    
2594    
2595     /*************************************************
2596 nigel 77 * Compile one branch *
2597     *************************************************/
2598    
2599 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2600 nigel 77 changed during the branch, the pointer is used to change the external options
2601 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2602     to find out the amount of memory needed, as well as during the real compile
2603     phase. The value of lengthptr distinguishes the two phases.
2604 nigel 77
2605     Arguments:
2606     optionsptr pointer to the option bits
2607     codeptr points to the pointer to the current code point
2608     ptrptr points to the current pattern pointer
2609     errorcodeptr points to error code variable
2610     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2611     reqbyteptr set to the last literal character required, else < 0
2612     bcptr points to current branch chain
2613     cd contains pointers to tables etc.
2614 nigel 93 lengthptr NULL during the real compile phase
2615     points to length accumulator during pre-compile phase
2616 nigel 77
2617     Returns: TRUE on success
2618     FALSE, with *errorcodeptr set non-zero on error
2619     */
2620    
2621     static BOOL
2622 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2623     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2624     compile_data *cd, int *lengthptr)
2625 nigel 77 {
2626     int repeat_type, op_type;
2627     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2628     int bravalue = 0;
2629     int greedy_default, greedy_non_default;
2630     int firstbyte, reqbyte;
2631     int zeroreqbyte, zerofirstbyte;
2632     int req_caseopt, reqvary, tempreqvary;
2633     int options = *optionsptr;
2634     int after_manual_callout = 0;
2635 nigel 93 int length_prevgroup = 0;
2636 nigel 77 register int c;
2637     register uschar *code = *codeptr;
2638 nigel 93 uschar *last_code = code;
2639     uschar *orig_code = code;
2640 nigel 77 uschar *tempcode;
2641     BOOL inescq = FALSE;
2642     BOOL groupsetfirstbyte = FALSE;
2643     const uschar *ptr = *ptrptr;
2644     const uschar *tempptr;
2645     uschar *previous = NULL;
2646     uschar *previous_callout = NULL;
2647 nigel 93 uschar *save_hwm = NULL;
2648 nigel 77 uschar classbits[32];
2649    
2650     #ifdef SUPPORT_UTF8
2651     BOOL class_utf8;
2652     BOOL utf8 = (options & PCRE_UTF8) != 0;
2653     uschar *class_utf8data;
2654 ph10 300 uschar *class_utf8data_base;
2655 nigel 77 uschar utf8_char[6];
2656     #else
2657     BOOL utf8 = FALSE;
2658 nigel 93 uschar *utf8_char = NULL;
2659 nigel 77 #endif
2660    
2661 nigel 93 #ifdef DEBUG
2662     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2663     #endif
2664    
2665 nigel 77 /* Set up the default and non-default settings for greediness */
2666    
2667     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2668     greedy_non_default = greedy_default ^ 1;
2669    
2670     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2671     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2672     matches a non-fixed char first char; reqbyte just remains unset if we never
2673     find one.
2674    
2675     When we hit a repeat whose minimum is zero, we may have to adjust these values
2676     to take the zero repeat into account. This is implemented by setting them to
2677     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2678     item types that can be repeated set these backoff variables appropriately. */
2679    
2680     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2681    
2682     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2683     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2684     value > 255. It is added into the firstbyte or reqbyte variables to record the
2685     case status of the value. This is used only for ASCII characters. */
2686    
2687     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2688    
2689     /* Switch on next character until the end of the branch */
2690    
2691     for (;; ptr++)
2692     {
2693     BOOL negate_class;
2694 ph10 286 BOOL should_flip_negation;
2695 nigel 77 BOOL possessive_quantifier;
2696     BOOL is_quantifier;
2697 nigel 93 BOOL is_recurse;
2698 ph10 180 BOOL reset_bracount;
2699 nigel 77 int class_charcount;
2700     int class_lastchar;
2701     int newoptions;
2702     int recno;
2703 ph10 172 int refsign;
2704 nigel 77 int skipbytes;
2705     int subreqbyte;
2706     int subfirstbyte;
2707 nigel 93 int terminator;
2708 nigel 77 int mclength;
2709     uschar mcbuffer[8];
2710    
2711 nigel 93 /* Get next byte in the pattern */
2712 nigel 77
2713     c = *ptr;
2714 ph10 345
2715 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2716     previous cycle of this loop. */
2717    
2718     if (lengthptr != NULL)
2719     {
2720     #ifdef DEBUG
2721     if (code > cd->hwm) cd->hwm = code; /* High water info */
2722     #endif
2723     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2724     {
2725     *errorcodeptr = ERR52;
2726     goto FAILED;
2727     }
2728    
2729     /* There is at least one situation where code goes backwards: this is the
2730     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2731     the class is simply eliminated. However, it is created first, so we have to
2732     allow memory for it. Therefore, don't ever reduce the length at this point.
2733     */
2734    
2735     if (code < last_code) code = last_code;
2736 ph10 202
2737     /* Paranoid check for integer overflow */
2738    
2739     if (OFLOW_MAX - *lengthptr < code - last_code)
2740     {
2741     *errorcodeptr = ERR20;
2742     goto FAILED;
2743     }
2744    
2745 nigel 93 *lengthptr += code - last_code;
2746     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2747    
2748     /* If "previous" is set and it is not at the start of the work space, move
2749     it back to there, in order to avoid filling up the work space. Otherwise,
2750     if "previous" is NULL, reset the current code pointer to the start. */
2751    
2752     if (previous != NULL)
2753     {
2754     if (previous > orig_code)
2755     {
2756     memmove(orig_code, previous, code - previous);
2757     code -= previous - orig_code;
2758     previous = orig_code;
2759     }
2760     }
2761     else code = orig_code;
2762    
2763     /* Remember where this code item starts so we can pick up the length
2764     next time round. */
2765    
2766     last_code = code;
2767     }
2768    
2769     /* In the real compile phase, just check the workspace used by the forward
2770     reference list. */
2771    
2772     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2773     {
2774     *errorcodeptr = ERR52;
2775     goto FAILED;
2776     }
2777    
2778 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2779    
2780     if (inescq && c != 0)
2781     {
2782 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2783 nigel 77 {
2784     inescq = FALSE;
2785     ptr++;
2786     continue;
2787     }
2788     else
2789     {
2790     if (previous_callout != NULL)
2791     {
2792 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2793     complete_callout(previous_callout, ptr, cd);
2794 nigel 77 previous_callout = NULL;
2795     }
2796     if ((options & PCRE_AUTO_CALLOUT) != 0)
2797     {
2798     previous_callout = code;
2799     code = auto_callout(code, ptr, cd);
2800     }
2801     goto NORMAL_CHAR;
2802     }
2803     }
2804    
2805     /* Fill in length of a previous callout, except when the next thing is
2806     a quantifier. */
2807    
2808 ph10 392 is_quantifier =
2809 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2810     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2811 nigel 77
2812     if (!is_quantifier && previous_callout != NULL &&
2813     after_manual_callout-- <= 0)
2814     {
2815 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2816     complete_callout(previous_callout, ptr, cd);
2817 nigel 77 previous_callout = NULL;
2818     }
2819    
2820     /* In extended mode, skip white space and comments */
2821    
2822     if ((options & PCRE_EXTENDED) != 0)
2823     {
2824     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2825 ph10 391 if (c == CHAR_NUMBER_SIGN)
2826 nigel 77 {
2827 nigel 93 while (*(++ptr) != 0)
2828 nigel 91 {
2829 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2830 nigel 91 }
2831 nigel 93 if (*ptr != 0) continue;
2832    
2833 nigel 91 /* Else fall through to handle end of string */
2834     c = 0;
2835 nigel 77 }
2836     }
2837    
2838     /* No auto callout for quantifiers. */
2839    
2840     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2841     {
2842     previous_callout = code;
2843     code = auto_callout(code, ptr, cd);
2844     }
2845    
2846     switch(c)
2847     {
2848 nigel 93 /* ===================================================================*/
2849     case 0: /* The branch terminates at string end */
2850 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2851     case CHAR_RIGHT_PARENTHESIS:
2852 nigel 77 *firstbyteptr = firstbyte;
2853     *reqbyteptr = reqbyte;
2854     *codeptr = code;
2855     *ptrptr = ptr;
2856 nigel 93 if (lengthptr != NULL)
2857     {
2858 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2859     {
2860     *errorcodeptr = ERR20;
2861     goto FAILED;
2862     }
2863 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2864     DPRINTF((">> end branch\n"));
2865     }
2866 nigel 77 return TRUE;
2867    
2868 nigel 93
2869     /* ===================================================================*/
2870 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2871     the setting of any following char as a first character. */
2872    
2873 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2874 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2875     {
2876     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2877     }
2878     previous = NULL;
2879     *code++ = OP_CIRC;
2880     break;
2881    
2882 ph10 391 case CHAR_DOLLAR_SIGN:
2883 nigel 77 previous = NULL;
2884     *code++ = OP_DOLL;
2885     break;
2886    
2887     /* There can never be a first char if '.' is first, whatever happens about
2888     repeats. The value of reqbyte doesn't change either. */
2889    
2890 ph10 391 case CHAR_DOT:
2891 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2892     zerofirstbyte = firstbyte;
2893     zeroreqbyte = reqbyte;
2894     previous = code;
2895 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2896 nigel 77 break;
2897    
2898 nigel 93
2899     /* ===================================================================*/
2900 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2901     32-byte bitmap of the permitted characters, except in the special case
2902     where there is only one such character. For negated classes, we build the
2903     map as usual, then invert it at the end. However, we use a different opcode
2904     so that data characters > 255 can be handled correctly.
2905 nigel 77
2906     If the class contains characters outside the 0-255 range, a different
2907     opcode is compiled. It may optionally have a bit map for characters < 256,
2908     but those above are explicitly listed afterwards. A flag byte tells
2909     whether the bitmap is present, and whether this is a negated class or not.
2910 ph10 345
2911 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2912     default (Perl) mode, it is treated as a data character. */
2913 ph10 345
2914 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2915 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2916     {
2917     *errorcodeptr = ERR64;
2918 ph10 345 goto FAILED;
2919 ph10 336 }
2920 ph10 345 goto NORMAL_CHAR;
2921 nigel 77
2922 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2923 nigel 77 previous = code;
2924    
2925     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2926     they are encountered at the top level, so we'll do that too. */
2927    
2928 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2929 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2930 ph10 295 check_posix_syntax(ptr, &tempptr))
2931 nigel 77 {
2932 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2933 nigel 77 goto FAILED;
2934     }
2935    
2936 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2937 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2938 ph10 205 skip them too. This makes for compatibility with Perl. */
2939 ph10 208
2940 ph10 205 negate_class = FALSE;
2941     for (;;)
2942 nigel 77 {
2943     c = *(++ptr);
2944 ph10 391 if (c == CHAR_BACKSLASH)
2945 ph10 205 {
2946 ph10 392 if (ptr[1] == CHAR_E)
2947 ph10 391 ptr++;
2948 ph10 392 else if (strncmp((const char *)ptr+1,
2949     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2950 ph10 391 ptr += 3;
2951 ph10 392 else
2952 ph10 391 break;
2953 ph10 205 }
2954 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2955 ph10 205 negate_class = TRUE;
2956     else break;
2957 ph10 208 }
2958 ph10 345
2959     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2960     an initial ']' is taken as a data character -- the code below handles
2961 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2962     [^] must match any character, so generate OP_ALLANY. */
2963 ph10 345
2964 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2965 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2966 ph10 341 {
2967     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2968     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2969     zerofirstbyte = firstbyte;
2970     break;
2971 ph10 345 }
2972 nigel 77
2973 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2974     negation flag at the end, so that support for characters > 255 works
2975 ph10 264 correctly (they are all included in the class). */
2976    
2977     should_flip_negation = FALSE;
2978    
2979 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2980 nigel 93 of just a single character (as long as it's < 256). However, for higher
2981     valued UTF-8 characters, we don't yet do any optimization. */
2982 nigel 77
2983     class_charcount = 0;
2984     class_lastchar = -1;
2985    
2986 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2987     temporary bit of memory, in case the class contains only 1 character (less
2988     than 256), because in that case the compiled code doesn't use the bit map.
2989     */
2990    
2991     memset(classbits, 0, 32 * sizeof(uschar));
2992    
2993 nigel 77 #ifdef SUPPORT_UTF8
2994     class_utf8 = FALSE; /* No chars >= 256 */
2995 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2996 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2997 nigel 77 #endif
2998    
2999     /* Process characters until ] is reached. By writing this as a "do" it
3000 nigel 93 means that an initial ] is taken as a data character. At the start of the
3001     loop, c contains the first byte of the character. */
3002 nigel 77
3003 nigel 93 if (c != 0) do
3004 nigel 77 {
3005 nigel 93 const uschar *oldptr;
3006    
3007 nigel 77 #ifdef SUPPORT_UTF8
3008     if (utf8 && c > 127)
3009     { /* Braces are required because the */
3010     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3011     }
3012 ph10 309
3013 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3014 ph10 309 data and reset the pointer. This is so that very large classes that
3015 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3016 ph10 309 (which is on the stack). */
3017    
3018 ph10 300 if (lengthptr != NULL)
3019     {
3020     *lengthptr += class_utf8data - class_utf8data_base;
3021 ph10 309 class_utf8data = class_utf8data_base;
3022     }
3023    
3024 nigel 77 #endif
3025    
3026     /* Inside \Q...\E everything is literal except \E */
3027    
3028     if (inescq)
3029     {
3030 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3031 nigel 77 {
3032 nigel 93 inescq = FALSE; /* Reset literal state */
3033     ptr++; /* Skip the 'E' */
3034     continue; /* Carry on with next */
3035 nigel 77 }
3036 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3037 nigel 77 }
3038    
3039     /* Handle POSIX class names. Perl allows a negation extension of the
3040     form [:^name:]. A square bracket that doesn't match the syntax is
3041     treated as a literal. We also recognize the POSIX constructions
3042     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3043     5.6 and 5.8 do. */
3044    
3045 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3046 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3047 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3048 nigel 77 {
3049     BOOL local_negate = FALSE;
3050 nigel 87 int posix_class, taboffset, tabopt;
3051 nigel 77 register const uschar *cbits = cd->cbits;
3052 nigel 87 uschar pbits[32];
3053 nigel 77
3054 ph10 391 if (ptr[1] != CHAR_COLON)
3055 nigel 77 {
3056     *errorcodeptr = ERR31;
3057     goto FAILED;
3058     }
3059    
3060     ptr += 2;
3061 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3062 nigel 77 {
3063     local_negate = TRUE;
3064 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3065 nigel 77 ptr++;
3066     }
3067    
3068     posix_class = check_posix_name(ptr, tempptr - ptr);
3069     if (posix_class < 0)
3070     {
3071     *errorcodeptr = ERR30;
3072     goto FAILED;
3073     }
3074    
3075     /* If matching is caseless, upper and lower are converted to
3076     alpha. This relies on the fact that the class table starts with
3077     alpha, lower, upper as the first 3 entries. */
3078    
3079     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3080     posix_class = 0;
3081    
3082 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3083     because we may be adding and subtracting from it, and we don't want to
3084     subtract bits that may be in the main map already. At the end we or the
3085     result into the bit map that is being built. */
3086 nigel 77
3087     posix_class *= 3;
3088 nigel 87
3089     /* Copy in the first table (always present) */
3090    
3091     memcpy(pbits, cbits + posix_class_maps[posix_class],
3092     32 * sizeof(uschar));
3093    
3094     /* If there is a second table, add or remove it as required. */
3095    
3096     taboffset = posix_class_maps[posix_class + 1];
3097     tabopt = posix_class_maps[posix_class + 2];
3098    
3099     if (taboffset >= 0)
3100 nigel 77 {
3101 nigel 87 if (tabopt >= 0)
3102     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3103 nigel 77 else
3104 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3105 nigel 77 }
3106    
3107 nigel 87 /* Now see if we need to remove any special characters. An option
3108     value of 1 removes vertical space and 2 removes underscore. */
3109    
3110     if (tabopt < 0) tabopt = -tabopt;
3111     if (tabopt == 1) pbits[1] &= ~0x3c;
3112     else if (tabopt == 2) pbits[11] &= 0x7f;
3113    
3114     /* Add the POSIX table or its complement into the main table that is
3115     being built and we are done. */
3116    
3117     if (local_negate)
3118     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3119     else
3120     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3121    
3122 nigel 77 ptr = tempptr + 1;
3123     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3124     continue; /* End of POSIX syntax handling */
3125     }
3126    
3127     /* Backslash may introduce a single character, or it may introduce one
3128 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3129     case. Inside a class (and only there) it is treated as backspace.
3130     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3131 ph10 205 to 'or' into the one we are building. We assume they have more than one
3132 nigel 77 character in them, so set class_charcount bigger than one. */
3133    
3134 ph10 391 if (c == CHAR_BACKSLASH)
3135 nigel 77 {
3136 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3137     if (*errorcodeptr != 0) goto FAILED;
3138 nigel 77
3139 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3140     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3141     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3142 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3143     {
3144 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3145 nigel 77 {
3146     ptr += 2; /* avoid empty string */
3147     }
3148     else inescq = TRUE;
3149     continue;
3150     }
3151 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3152 nigel 77
3153     if (c < 0)
3154     {
3155     register const uschar *cbits = cd->cbits;
3156     class_charcount += 2; /* Greater than 1 is what matters */
3157 nigel 93
3158     /* Save time by not doing this in the pre-compile phase. */
3159    
3160     if (lengthptr == NULL) switch (-c)
3161 nigel 77 {
3162     case ESC_d:
3163     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3164     continue;
3165    
3166     case ESC_D:
3167 ph10 286 should_flip_negation = TRUE;
3168 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3169     continue;
3170    
3171     case ESC_w:
3172     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3173     continue;
3174    
3175     case ESC_W:
3176 ph10 286 should_flip_negation = TRUE;
3177 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3178     continue;
3179    
3180     case ESC_s:
3181     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3182     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3183     continue;
3184    
3185     case ESC_S:
3186 ph10 286 should_flip_negation = TRUE;
3187 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3188     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3189     continue;
3190    
3191 nigel 93 default: /* Not recognized; fall through */
3192     break; /* Need "default" setting to stop compiler warning. */
3193     }
3194    
3195     /* In the pre-compile phase, just do the recognition. */
3196    
3197     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3198     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3199 ph10 180
3200 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3201     they use extra memory. */
3202 ph10 180
3203 ph10 178 if (-c == ESC_h)
3204     {
3205     SETBIT(classbits, 0x09); /* VT */
3206     SETBIT(classbits, 0x20); /* SPACE */
3207 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3208 ph10 178 #ifdef SUPPORT_UTF8
3209     if (utf8)
3210 ph10 180 {
3211 ph10 178 class_utf8 = TRUE;
3212     *class_utf8data++ = XCL_SINGLE;
3213 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3214 ph10 178 *class_utf8data++ = XCL_SINGLE;
3215 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3216     *class_utf8data++ = XCL_RANGE;
3217     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3218     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3219 ph10 178 *class_utf8data++ = XCL_SINGLE;
3220 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3221 ph10 178 *class_utf8data++ = XCL_SINGLE;
3222 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3223 ph10 178 *class_utf8data++ = XCL_SINGLE;
3224 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3225     }
3226     #endif
3227     continue;
3228     }
3229 nigel 93
3230 ph10 178 if (-c == ESC_H)
3231     {
3232     for (c = 0; c < 32; c++)
3233     {
3234     int x = 0xff;
3235     switch (c)
3236 ph10 180 {
3237 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3238     case 0x20/8: x ^= 1 << (0x20%8); break;
3239     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3240     default: break;
3241     }
3242     classbits[c] |= x;
3243 ph10 180 }
3244    
3245 ph10 178 #ifdef SUPPORT_UTF8
3246     if (utf8)
3247 ph10 180 {
3248 ph10 178 class_utf8 = TRUE;
3249 ph10 180 *class_utf8data++ = XCL_RANGE;
3250     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3251     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3252     *class_utf8data++ = XCL_RANGE;
3253     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3254     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3255     *class_utf8data++ = XCL_RANGE;
3256     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3257     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3258     *class_utf8data++ = XCL_RANGE;
3259     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3260     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3261     *class_utf8data++ = XCL_RANGE;
3262     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3263     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3264     *class_utf8data++ = XCL_RANGE;
3265     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3266     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3267     *class_utf8data++ = XCL_RANGE;
3268     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3269     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3270     }
3271     #endif
3272     continue;
3273     }
3274 ph10 178
3275     if (-c == ESC_v)
3276     {
3277     SETBIT(classbits, 0x0a); /* LF */
3278     SETBIT(classbits, 0x0b); /* VT */
3279 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3280     SETBIT(classbits, 0x0d); /* CR */
3281     SETBIT(classbits, 0x85); /* NEL */
3282 ph10 178 #ifdef SUPPORT_UTF8
3283     if (utf8)
3284 ph10 180 {
3285 ph10 178 class_utf8 = TRUE;
3286 ph10 180 *class_utf8data++ = XCL_RANGE;
3287     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3288     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3289     }
3290     #endif
3291     continue;
3292     }
3293 ph10 178
3294     if (-c == ESC_V)
3295     {
3296     for (c = 0; c < 32; c++)
3297     {
3298     int x = 0xff;
3299     switch (c)
3300 ph10 180 {
3301 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3302     x ^= 1 << (0x0b%8);
3303     x ^= 1 << (0x0c%8);
3304 ph10 180 x ^= 1 << (0x0d%8);
3305 ph10 178 break;
3306     case 0x85/8: x ^= 1 << (0x85%8); break;
3307     default: break;
3308     }
3309     classbits[c] |= x;
3310 ph10 180 }
3311    
3312 ph10 178 #ifdef SUPPORT_UTF8
3313     if (utf8)
3314 ph10 180 {
3315 ph10 178 class_utf8 = TRUE;
3316 ph10 180 *class_utf8data++ = XCL_RANGE;
3317     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3318     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3319     *class_utf8data++ = XCL_RANGE;
3320     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3321     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3322     }
3323     #endif
3324     continue;
3325     }
3326 ph10 178
3327 nigel 93 /* We need to deal with \P and \p in both phases. */
3328    
3329 nigel 77 #ifdef SUPPORT_UCP
3330 nigel 93 if (-c == ESC_p || -c == ESC_P)
3331     {
3332     BOOL negated;
3333     int pdata;
3334     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3335     if (ptype < 0) goto FAILED;
3336     class_utf8 = TRUE;
3337     *class_utf8data++ = ((-c == ESC_p) != negated)?
3338     XCL_PROP : XCL_NOTPROP;
3339     *class_utf8data++ = ptype;
3340     *class_utf8data++ = pdata;
3341     class_charcount -= 2; /* Not a < 256 character */
3342 nigel 77 continue;
3343 nigel 93 }
3344 nigel 77 #endif
3345 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3346     strict mode. By default, for compatibility with Perl, they are
3347     treated as literals. */
3348 nigel 77
3349 nigel 93 if ((options & PCRE_EXTRA) != 0)
3350     {
3351     *errorcodeptr = ERR7;
3352     goto FAILED;
3353     }
3354 nigel 77
3355 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3356     c = *ptr; /* Get the final character and fall through */
3357 nigel 77 }
3358    
3359     /* Fall through if we have a single character (c >= 0). This may be
3360 nigel 93 greater than 256 in UTF-8 mode. */
3361 nigel 77
3362     } /* End of backslash handling */
3363    
3364     /* A single character may be followed by '-' to form a range. However,
3365     Perl does not permit ']' to be the end of the range. A '-' character
3366 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3367     entirely. The code for handling \Q and \E is messy. */
3368 nigel 77
3369 nigel 93 CHECK_RANGE:
3370 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3371 nigel 77 {
3372 nigel 93 inescq = FALSE;
3373     ptr += 2;
3374     }
3375    
3376     oldptr = ptr;
3377 ph10 231
3378 ph10 230 /* Remember \r or \n */
3379 ph10 231
3380 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3381 ph10 231
3382 ph10 230 /* Check for range */
3383 nigel 93
3384 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3385 nigel 93 {
3386 nigel 77 int d;
3387     ptr += 2;
3388 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3389 nigel 77
3390 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3391     mode. */
3392    
3393 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3394 nigel 93 {
3395     ptr += 2;
3396 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3397 ph10 391 { ptr += 2; continue; }
3398 nigel 93 inescq = TRUE;
3399     break;
3400     }
3401    
3402 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3403 nigel 93 {
3404     ptr = oldptr;
3405     goto LONE_SINGLE_CHARACTER;
3406     }
3407    
3408 nigel 77 #ifdef SUPPORT_UTF8
3409     if (utf8)
3410     { /* Braces are required because the */
3411     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3412     }
3413     else
3414     #endif
3415     d = *ptr; /* Not UTF-8 mode */
3416    
3417     /* The second part of a range can be a single-character escape, but
3418     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3419     in such circumstances. */
3420    
3421 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3422 nigel 77 {
3423 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3424     if (*errorcodeptr != 0) goto FAILED;
3425 nigel 77
3426 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3427 nigel 93 special means the '-' was literal */
3428 nigel 77
3429     if (d < 0)
3430     {
3431 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3432     else if (d == -ESC_X) d = CHAR_X;
3433     else if (d == -ESC_R) d = CHAR_R; else
3434 nigel 77 {
3435 nigel 93 ptr = oldptr;
3436 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3437     }
3438     }
3439     }
3440    
3441 nigel 93 /* Check that the two values are in the correct order. Optimize
3442     one-character ranges */
3443 nigel 77
3444 nigel 93 if (d < c)
3445     {
3446     *errorcodeptr = ERR8;
3447     goto FAILED;
3448     }
3449    
3450 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3451    
3452 ph10 230 /* Remember \r or \n */
3453 ph10 231
3454 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3455 ph10 231
3456 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3457     matching, we have to use an XCLASS with extra data items. Caseless
3458     matching for characters > 127 is available only if UCP support is
3459     available. */
3460    
3461     #ifdef SUPPORT_UTF8
3462     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3463     {
3464     class_utf8 = TRUE;
3465    
3466     /* With UCP support, we can find the other case equivalents of
3467     the relevant characters. There may be several ranges. Optimize how
3468     they fit with the basic range. */
3469    
3470     #ifdef SUPPORT_UCP
3471     if ((options & PCRE_CASELESS) != 0)
3472     {
3473 nigel 93 unsigned int occ, ocd;
3474     unsigned int cc = c;
3475     unsigned int origd = d;
3476 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3477     {
3478 ph10 180 if (occ >= (unsigned int)c &&
3479     ocd <= (unsigned int)d)
3480 ph10 176 continue; /* Skip embedded ranges */
3481 nigel 77
3482 ph10 180 if (occ < (unsigned int)c &&
3483 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3484 nigel 77 { /* if there is overlap, */
3485     c = occ; /* noting that if occ < c */
3486     continue; /* we can't have ocd > d */
3487     } /* because a subrange is */
3488 ph10 180 if (ocd > (unsigned int)d &&
3489 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3490 nigel 77 { /* the basic range. */
3491     d = ocd;
3492     continue;
3493     }
3494    
3495     if (occ == ocd)
3496     {
3497     *class_utf8data++ = XCL_SINGLE;
3498     }
3499     else
3500     {
3501     *class_utf8data++ = XCL_RANGE;
3502     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3503     }
3504     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3505     }
3506     }
3507     #endif /* SUPPORT_UCP */
3508    
3509     /* Now record the original range, possibly modified for UCP caseless
3510     overlapping ranges. */
3511    
3512     *class_utf8data++ = XCL_RANGE;
3513     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3514     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3515    
3516     /* With UCP support, we are done. Without UCP support, there is no
3517     caseless matching for UTF-8 characters > 127; we can use the bit map
3518     for the smaller ones. */
3519    
3520     #ifdef SUPPORT_UCP
3521     continue; /* With next character in the class */
3522     #else
3523     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3524    
3525     /* Adjust upper limit and fall through to set up the map */
3526    
3527     d = 127;
3528    
3529     #endif /* SUPPORT_UCP */
3530     }
3531     #endif /* SUPPORT_UTF8 */
3532    
3533     /* We use the bit map for all cases when not in UTF-8 mode; else
3534     ranges that lie entirely within 0-127 when there is UCP support; else
3535     for partial ranges without UCP support. */
3536    
3537 nigel 93 class_charcount += d - c + 1;
3538     class_lastchar = d;
3539    
3540     /* We can save a bit of time by skipping this in the pre-compile. */
3541    
3542     if (lengthptr == NULL) for (; c <= d; c++)
3543 nigel 77 {
3544     classbits[c/8] |= (1 << (c&7));
3545     if ((options & PCRE_CASELESS) != 0)
3546     {
3547     int uc = cd->fcc[c]; /* flip case */
3548     classbits[uc/8] |= (1 << (uc&7));
3549     }
3550     }
3551    
3552     continue; /* Go get the next char in the class */
3553     }
3554    
3555     /* Handle a lone single character - we can get here for a normal
3556     non-escape char, or after \ that introduces a single character or for an
3557     apparent range that isn't. */
3558    
3559     LONE_SINGLE_CHARACTER:
3560 ph10 231
3561 nigel 77 /* Handle a character that cannot go in the bit map */
3562    
3563     #ifdef SUPPORT_UTF8
3564     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3565     {
3566     class_utf8 = TRUE;
3567     *class_utf8data++ = XCL_SINGLE;
3568     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3569    
3570     #ifdef SUPPORT_UCP
3571     if ((options & PCRE_CASELESS) != 0)
3572     {
3573 nigel 93 unsigned int othercase;
3574 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3575 nigel 77 {
3576     *class_utf8data++ = XCL_SINGLE;
3577     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3578     }
3579     }
3580     #endif /* SUPPORT_UCP */
3581    
3582     }
3583     else
3584     #endif /* SUPPORT_UTF8 */
3585    
3586     /* Handle a single-byte character */
3587     {
3588     classbits[c/8] |= (1 << (c&7));
3589     if ((options & PCRE_CASELESS) != 0)
3590     {
3591     c = cd->fcc[c]; /* flip case */
3592     classbits[c/8] |= (1 << (c&7));
3593     }
3594     class_charcount++;
3595     class_lastchar = c;
3596     }
3597     }
3598    
3599 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3600 nigel 77
3601 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3602 nigel 77
3603 nigel 93 if (c == 0) /* Missing terminating ']' */
3604     {
3605     *errorcodeptr = ERR6;
3606     goto FAILED;
3607     }
3608 ph10 231
3609    
3610 ph10 230 /* This code has been disabled because it would mean that \s counts as
3611     an explicit \r or \n reference, and that's not really what is wanted. Now
3612     we set the flag only if there is a literal "\r" or "\n" in the class. */
3613 ph10 227
3614 ph10 230 #if 0
3615 ph10 226 /* Remember whether \r or \n are in this class */
3616 ph10 227
3617 ph10 226 if (negate_class)
3618     {
3619 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3620 ph10 226 }
3621     else
3622     {
3623 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3624 ph10 227 }
3625 ph10 230 #endif
3626 ph10 227
3627 ph10 231
3628 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3629 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3630     use of \p or \P, in other words, no use of any XCLASS features, we can
3631     optimize.
3632    
3633 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3634     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3635     operate on single-bytes only. This is an historical hangover. Maybe one day
3636     we can tidy these opcodes to handle multi-byte characters.
3637 nigel 77
3638     The optimization throws away the bit map. We turn the item into a
3639     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3640     that OP_NOT does not support multibyte characters. In the positive case, it
3641     can cause firstbyte to be set. Otherwise, there can be no first char if
3642     this item is first, whatever repeat count may follow. In the case of
3643     reqbyte, save the previous value for reinstating. */
3644    
3645     #ifdef SUPPORT_UTF8
3646 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3647 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3648 nigel 77 #else
3649     if (class_charcount == 1)
3650     #endif
3651     {
3652     zeroreqbyte = reqbyte;
3653    
3654     /* The OP_NOT opcode works on one-byte characters only. */
3655    
3656     if (negate_class)
3657     {
3658     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3659     zerofirstbyte = firstbyte;
3660     *code++ = OP_NOT;
3661     *code++ = class_lastchar;
3662     break;
3663     }
3664    
3665     /* For a single, positive character, get the value into mcbuffer, and
3666     then we can handle this with the normal one-character code. */
3667    
3668     #ifdef SUPPORT_UTF8
3669     if (utf8 && class_lastchar > 127)
3670     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3671     else
3672     #endif
3673     {
3674     mcbuffer[0] = class_lastchar;
3675     mclength = 1;
3676     }
3677     goto ONE_CHAR;
3678     } /* End of 1-char optimization */
3679    
3680     /* The general case - not the one-char optimization. If this is the first
3681     thing in the branch, there can be no first char setting, whatever the
3682     repeat count. Any reqbyte setting must remain unchanged after any kind of
3683     repeat. */
3684    
3685     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3686     zerofirstbyte = firstbyte;
3687     zeroreqbyte = reqbyte;
3688    
3689     /* If there are characters with values > 255, we have to compile an
3690 ph10 286 extended class, with its own opcode, unless there was a negated special
3691     such as \S in the class, because in that case all characters > 255 are in
3692     the class, so any that were explicitly given as well can be ignored. If
3693 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3694     characters < 256, we can omit the bitmap in the actual compiled code. */
3695 nigel 77
3696     #ifdef SUPPORT_UTF8
3697 ph10 264 if (class_utf8 && !should_flip_negation)
3698 nigel 77 {
3699     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3700     *code++ = OP_XCLASS;
3701     code += LINK_SIZE;
3702     *code = negate_class? XCL_NOT : 0;
3703    
3704 nigel 93 /* If the map is required, move up the extra data to make room for it;
3705     otherwise just move the code pointer to the end of the extra data. */
3706 nigel 77
3707     if (class_charcount > 0)
3708     {
3709     *code++ |= XCL_MAP;
3710 nigel 93 memmove(code + 32, code, class_utf8data - code);
3711 nigel 77 memcpy(code, classbits, 32);
3712 nigel 93 code = class_utf8data + 32;
3713 nigel 77 }
3714 nigel 93 else code = class_utf8data;
3715 nigel 77
3716     /* Now fill in the complete length of the item */
3717    
3718     PUT(previous, 1, code - previous);
3719     break; /* End of class handling */
3720     }
3721     #endif
3722    
3723 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3724     OP_NCLASS, depending on whether the whole class was negated and whether
3725     there were negative specials such as \S in the class. Then copy the 32-byte
3726 ph10 264 map into the code vector, negating it if necessary. */
3727 ph10 286
3728 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3729 nigel 77 if (negate_class)
3730     {
3731 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3732     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3733 nigel 77 }
3734     else
3735     {
3736     memcpy(code, classbits, 32);
3737     }
3738     code += 32;
3739     break;
3740    
3741 nigel 93
3742     /* ===================================================================*/
3743 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3744     has been tested above. */
3745    
3746 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3747 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3748     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3749     if (*errorcodeptr != 0) goto FAILED;
3750     goto REPEAT;
3751    
3752 ph10 391 case CHAR_ASTERISK:
3753 nigel 77 repeat_min = 0;
3754     repeat_max = -1;
3755     goto REPEAT;
3756    
3757 ph10 391 case CHAR_PLUS:
3758 nigel 77 repeat_min = 1;
3759     repeat_max = -1;
3760     goto REPEAT;
3761    
3762 ph10 391 case CHAR_QUESTION_MARK:
3763 nigel 77 repeat_min = 0;
3764     repeat_max = 1;
3765    
3766     REPEAT:
3767     if (previous == NULL)
3768     {
3769     *errorcodeptr = ERR9;
3770     goto FAILED;
3771     }
3772    
3773     if (repeat_min == 0)
3774     {
3775     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3776     reqbyte = zeroreqbyte; /* Ditto */
3777     }
3778    
3779     /* Remember whether this is a variable length repeat */
3780    
3781     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3782    
3783     op_type = 0; /* Default single-char op codes */
3784     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3785    
3786     /* Save start of previous item, in case we have to move it up to make space
3787     for an inserted OP_ONCE for the additional '+' extension. */
3788    
3789     tempcode = previous;
3790    
3791     /* If the next character is '+', we have a possessive quantifier. This
3792     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3793     If the next character is '?' this is a minimizing repeat, by default,
3794     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3795     repeat type to the non-default. */
3796    
3797 ph10 391 if (ptr[1] == CHAR_PLUS)
3798 nigel 77 {
3799     repeat_type = 0; /* Force greedy */
3800     possessive_quantifier = TRUE;
3801     ptr++;
3802     }
3803 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3804 nigel 77 {
3805     repeat_type = greedy_non_default;
3806     ptr++;
3807     }
3808     else repeat_type = greedy_default;
3809    
3810     /* If previous was a character match, abolish the item and generate a
3811     repeat item instead. If a char item has a minimum of more than one, ensure
3812     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3813     the first thing in a branch because the x will have gone into firstbyte
3814     instead. */
3815    
3816     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3817     {
3818     /* Deal with UTF-8 characters that take up more than one byte. It's
3819     easier to write this out separately than try to macrify it. Use c to
3820     hold the length of the character in bytes, plus 0x80 to flag that it's a
3821     length rather than a small character. */
3822    
3823     #ifdef SUPPORT_UTF8
3824     if (utf8 && (code[-1] & 0x80) != 0)
3825     {
3826     uschar *lastchar = code - 1;
3827     while((*lastchar & 0xc0) == 0x80) lastchar--;
3828     c = code - lastchar; /* Length of UTF-8 character */
3829     memcpy(utf8_char, lastchar, c); /* Save the char */
3830     c |= 0x80; /* Flag c as a length */
3831     }
3832     else
3833     #endif
3834    
3835     /* Handle the case of a single byte - either with no UTF8 support, or
3836     with UTF-8 disabled, or for a UTF-8 character < 128. */
3837    
3838     {
3839     c = code[-1];
3840     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3841     }
3842    
3843 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3844     the line is something that cannot possibly match this character. If so,
3845     automatically possessifying this item gains some performance in the case
3846     where the match fails. */
3847    
3848     if (!possessive_quantifier &&
3849     repeat_max < 0 &&
3850     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3851     options, cd))
3852     {
3853     repeat_type = 0; /* Force greedy */
3854     possessive_quantifier = TRUE;
3855     }
3856    
3857 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3858     }
3859    
3860     /* If previous was a single negated character ([^a] or similar), we use
3861     one of the special opcodes, replacing it. The code is shared with single-
3862     character repeats by setting op_type to add a suitable offset into
3863 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3864     currently used only for single-byte chars. */
3865 nigel 77
3866     else if (*previous == OP_NOT)
3867     {
3868     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3869     c = previous[1];
3870 nigel 93 if (!possessive_quantifier &&
3871     repeat_max < 0 &&
3872     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3873     {
3874     repeat_type = 0; /* Force greedy */
3875     possessive_quantifier = TRUE;
3876     }
3877 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3878     }
3879    
3880     /* If previous was a character type match (\d or similar), abolish it and
3881     create a suitable repeat item. The code is shared with single-character
3882     repeats by setting op_type to add a suitable offset into repeat_type. Note
3883     that the Unicode property types will be present only when SUPPORT_UCP is
3884     defined, but we don't wrap the little bits of code here because it just
3885     makes it horribly messy. */
3886    
3887     else if (*previous < OP_EODN)
3888     {
3889     uschar *oldcode;
3890 nigel 87 int prop_type, prop_value;
3891 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3892     c = *previous;
3893    
3894 nigel 93 if (!possessive_quantifier &&
3895     repeat_max < 0 &&
3896     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3897     {
3898     repeat_type = 0; /* Force greedy */
3899     possessive_quantifier = TRUE;
3900     }
3901    
3902 nigel 77 OUTPUT_SINGLE_REPEAT:
3903 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3904     {
3905     prop_type = previous[1];
3906     prop_value = previous[2];
3907     }
3908     else prop_type = prop_value = -1;
3909 nigel 77
3910     oldcode = code;
3911     code = previous; /* Usually overwrite previous item */
3912    
3913     /* If the maximum is zero then the minimum must also be zero; Perl allows
3914     this case, so we do too - by simply omitting the item altogether. */
3915    
3916     if (repeat_max == 0) goto END_REPEAT;
3917    
3918 ph10 426 /*--------------------------------------------------------------------*/
3919     /* This code is obsolete from release 8.00; the restriction was finally
3920     removed: */
3921    
3922 nigel 77 /* All real repeats make it impossible to handle partial matching (maybe
3923     one day we will be able to remove this restriction). */
3924 ph10 426
3925     /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3926     /*--------------------------------------------------------------------*/
3927 nigel 77
3928     /* Combine the op_type with the repeat_type */
3929    
3930     repeat_type += op_type;
3931    
3932     /* A minimum of zero is handled either as the special case * or ?, or as
3933     an UPTO, with the maximum given. */
3934    
3935     if (repeat_min == 0)
3936     {
3937     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3938     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3939     else
3940     {
3941     *code++ = OP_UPTO + repeat_type;
3942     PUT2INC(code, 0, repeat_max);
3943     }
3944     }
3945    
3946     /* A repeat minimum of 1 is optimized into some special cases. If the
3947 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3948 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3949     one less than the maximum. */
3950    
3951     else if (repeat_min == 1)
3952     {
3953     if (repeat_max == -1)
3954     *code++ = OP_PLUS + repeat_type;
3955     else
3956     {
3957     code = oldcode; /* leave previous item in place */
3958     if (repeat_max == 1) goto END_REPEAT;
3959     *code++ = OP_UPTO + repeat_type;
3960     PUT2INC(code, 0, repeat_max - 1);
3961     }
3962     }
3963    
3964     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3965     handled as an EXACT followed by an UPTO. */
3966    
3967     else
3968     {
3969     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3970     PUT2INC(code, 0, repeat_min);
3971    
3972     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3973     we have to insert the character for the previous code. For a repeated
3974 nigel 87 Unicode property match, there are two extra bytes that define the
3975 nigel