/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 395 - (hide annotations) (download)
Fri Mar 20 11:22:42 2009 UTC (4 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 208090 byte(s)
Fix looping bug by recognizing that a conditional with only one branch may 
match an empty string.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8). Both arguments and the whole
expansion are parenthesized so that expression arguments such as "c + 1" and
use inside larger expressions behave as expected. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
/* Upper limit used when checking that the integer holding the compiled
pattern length has not overflowed. It is deliberately set a little below
INT_MAX, leaving headroom for the bytes that terminate groups, so those small
additions do not each need their own overflow test. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
/* Size of the stack workspace used by the first (pre-compile) phase, whose
job is to work out how much memory the compiled pattern needs. The pattern is
partly compiled into this space, but compiled parts are thrown away as soon as
possible, so an overrun should hopefully never occur; the code nevertheless
checks for one. The largest usage observed so far is 218, so this figure is
very generous.

The second (real) compile phase reuses the same workspace to remember forward
references to groups so that they can be filled in at the end. Each entry in
that list takes LINK_SIZE bytes, so there is plenty of room even when LINK_SIZE
is 4. */

#define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 ph10 392 in UTF-8 mode. */
104 ph10 391
105 ph10 392 static const short int escapes[] = {
106 ph10 391 0, 0,
107     0, 0,
108 ph10 392 0, 0,
109     0, 0,
110     0, 0,
111 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
112 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
113 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
114 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
115     -ESC_B, -ESC_C,
116     -ESC_D, -ESC_E,
117     0, -ESC_G,
118     -ESC_H, 0,
119     0, -ESC_K,
120 ph10 391 0, 0,
121 ph10 392 0, 0,
122 ph10 391 -ESC_P, -ESC_Q,
123     -ESC_R, -ESC_S,
124 ph10 392 0, 0,
125     -ESC_V, -ESC_W,
126     -ESC_X, 0,
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
128 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
129 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
130 ph10 391 CHAR_GRAVE_ACCENT, 7,
131 ph10 392 -ESC_b, 0,
132     -ESC_d, ESC_e,
133 ph10 391 ESC_f, 0,
134     -ESC_h, 0,
135 ph10 392 0, -ESC_k,
136 ph10 391 0, 0,
137     ESC_n, 0,
138 ph10 392 -ESC_p, 0,
139     ESC_r, -ESC_s,
140 ph10 391 ESC_tee, 0,
141 ph10 392 -ESC_v, -ESC_w,
142     0, 0,
143 ph10 391 -ESC_z
144 nigel 77 };
145    
146 ph10 392 #else
147 ph10 391
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150 nigel 77 static const short int escapes[] = {
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 392 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 ph10 391 platforms. */
183 ph10 210
/* One entry of the (*VERB) table: the length of the verb's name and the
opcode that the verb compiles to. */

typedef struct verbitem {
  int len;    /* number of characters in the verb name */
  int op;     /* opcode for the verb */
} verbitem;
188 ph10 210
189 ph10 240 static const char verbnames[] =
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
198 ph10 327 static const verbitem verbs[] = {
199 ph10 240 { 6, OP_ACCEPT },
200     { 6, OP_COMMIT },
201     { 1, OP_FAIL },
202     { 4, OP_FAIL },
203     { 5, OP_PRUNE },
204     { 4, OP_SKIP },
205     { 4, OP_THEN }
206 ph10 210 };
207    
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
217 ph10 240 static const char posix_names[] =
218 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
223     static const uschar posix_name_lengths[] = {
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
236     static const int posix_class_maps[] = {
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
/* STRING(a) turns its argument into a string literal exactly as written;
XSTRING(s) macro-expands its argument first and then stringifies the result. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
267 ph10 240 static const char error_texts[] =
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 345 "] is an invalid data character in JavaScript compatibility mode";
345 nigel 77
346    
347     /* Table to identify digits and hex digits. This is used when compiling
348     patterns. Note that the tables in chartables are dependent on the locale, and
349     may mark arbitrary characters as digits - but the PCRE compiling code expects
350     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351     a private table here. It costs 256 bytes, but it is a lot faster than doing
352     character value tests (at least in some simple cases I timed), and in some
353     applications one wants PCRE to compile efficiently as well as match
354     efficiently.
355    
356     For convenience, we use the same bit definitions as in chartables:
357    
358     0x04 decimal digit
359     0x08 hexadecimal digit
360    
361     Then we can use ctype_digit and ctype_xdigit in the code. */
362    
#ifndef EBCDIC

/* The "normal" case: ASCII systems, and EBCDIC systems running in UTF-8 mode.
Each row holds eight entries; the comment gives the code of the row's first
entry in hexadecimal. 0x04 marks a decimal digit and 0x08 a hexadecimal digit,
so the decimal digits carry 0x0c. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 28 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 30: 0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 38: 8 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 40: @ - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 48: H - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 50: P - W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 58: X - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 60: ` - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 68: h - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 70: p - w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 78: x - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 88 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 98 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* A8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* B8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* C8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* D8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* E8 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* F0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* F8 */
  };

#else

/* The "abnormal" case: EBCDIC systems not running in UTF-8 mode. Same layout
and bit meanings as above, but indexed by EBCDIC code. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 00 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 38 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 48: 72 - | */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 50: & - 87 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 60: - - 103 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 68: 104 - ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 78: 120 - " */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 80: 128 - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 88: h - 143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 90: 144 - p */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 98: q - 159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* A0: 160 - x */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* A8: y - 175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* B0: ^ - 183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* B8 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* C0: { - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* C8: H - 207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* D0: } - P */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* D8: Q - 223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* E0: \ - X */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* E8: Y - 239 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* F0: 0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* F8: 8 - 255 */
  };

/* Partial duplicate of the character table, indexed by EBCDIC code. */

static const unsigned char ebcdic_chartab[] =
  {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 00 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /* 08 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 18 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 28 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 38 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 40 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /* 48: 72 - | */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 50: & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /* 58 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 60: - - 103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 68: 104 - ? */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 78: 120 - " */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 80: 128 - g */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 88: h - 143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 90: 144 - p */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 98: q - 159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* A0: 160 - x */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* A8: y - 175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* B0: ^ - 183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* B8 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* C0: { - G */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* C8: H - 207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* D0: } - P */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* D8: Q - 223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* E0: \ - X */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* E8: Y - 239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /* F0: 0 - 7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /* F8: 8 - 255 */
  };
#endif
476    
477    
478     /* Definition to allow mutual recursion */
479    
480     static BOOL
481 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
483 nigel 77
484    
485    
486     /*************************************************
487 ph10 240 * Find an error text *
488     *************************************************/
489    
490 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
491     some of the text is of unknown length, we can't use a table of offsets.
492     Instead, just count through the strings. This is not a performance issue
493 ph10 240 because it happens only when there has been a compilation error.
494    
495     Argument: the error number
496     Returns: pointer to the error string
497     */
498    
499     static const char *
500     find_error_text(int n)
501     {
502     const char *s = error_texts;
503 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
504 ph10 240 return s;
505     }
506    
507    
508     /*************************************************
509 nigel 77 * Handle escapes *
510     *************************************************/
511    
512     /* This function is called when a \ has been encountered. It either returns a
513     positive value for a simple escape such as \n, or a negative value which
514 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
515     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517     ptr is pointing at the \. On exit, it is on the final character of the escape
518     sequence.
519 nigel 77
520     Arguments:
521     ptrptr points to the pattern position pointer
522     errorcodeptr points to the errorcode variable
523     bracount number of previous extracting brackets
524     options the options bits
525     isclass TRUE if inside a character class
526    
527     Returns: zero or positive => a data character
528     negative => a special escape sequence
529 ph10 213 on error, errorcodeptr is set
530 nigel 77 */
531    
532     static int
533     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534     int options, BOOL isclass)
535     {
536 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
537     const uschar *ptr = *ptrptr + 1;
538 nigel 77 int c, i;
539    
540 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541     ptr--; /* Set pointer back to the last byte */
542    
543 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
544    
545     if (c == 0) *errorcodeptr = ERR1;
546    
547 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548     in a table. A non-zero result is something that can be returned immediately.
549 nigel 77 Otherwise further processing may be required. */
550    
551 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554 nigel 77
555 ph10 97 #else /* EBCDIC coding */
556 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
558     #endif
559    
560     /* Escapes that need further processing, or are illegal. */
561    
562     else
563     {
564     const uschar *oldptr;
565 nigel 93 BOOL braced, negated;
566    
567 nigel 77 switch (c)
568     {
569     /* A number of Perl escapes are not handled by PCRE. We give an explicit
570     error. */
571    
572 ph10 391 case CHAR_l:
573     case CHAR_L:
574     case CHAR_N:
575     case CHAR_u:
576     case CHAR_U:
577 nigel 77 *errorcodeptr = ERR37;
578     break;
579    
580 ph10 333 /* \g must be followed by one of a number of specific things:
581 ph10 345
582 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
583     backreference. If negative, it is a relative backreference. This is a Perl
584     5.10 feature.
585 ph10 345
586 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587     is part of Perl's movement towards a unified syntax for back references. As
588     this is synonymous with \k{name}, we fudge it up by pretending it really
589     was \k.
590 ph10 345
591     (3) For Oniguruma compatibility we also support \g followed by a name or a
592     number either in angle brackets or in single quotes. However, these are
593     (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 ph10 333 the -ESC_g code (cf \k). */
595 nigel 93
596 ph10 391 case CHAR_g:
597     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 ph10 333 {
599     c = -ESC_g;
600 ph10 345 break;
601     }
602 ph10 333
603     /* Handle the Perl-compatible cases */
604 ph10 345
605 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 nigel 93 {
607 ph10 171 const uschar *p;
608 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 ph10 171 {
612     c = -ESC_k;
613     break;
614 ph10 172 }
615 nigel 93 braced = TRUE;
616     ptr++;
617     }
618     else braced = FALSE;
619    
620 ph10 391 if (ptr[1] == CHAR_MINUS)
621 nigel 93 {
622     negated = TRUE;
623     ptr++;
624     }
625     else negated = FALSE;
626    
627     c = 0;
628     while ((digitab[ptr[1]] & ctype_digit) != 0)
629 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
630 ph10 220
631 ph10 333 if (c < 0) /* Integer overflow */
632 ph10 213 {
633     *errorcodeptr = ERR61;
634     break;
635 ph10 220 }
636 ph10 345
637 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 nigel 93 {
639     *errorcodeptr = ERR57;
640 ph10 213 break;
641 nigel 93 }
642 ph10 345
643 ph10 333 if (c == 0)
644     {
645     *errorcodeptr = ERR58;
646     break;
647 ph10 345 }
648 nigel 93
649     if (negated)
650     {
651     if (c > bracount)
652     {
653     *errorcodeptr = ERR15;
654 ph10 213 break;
655 nigel 93 }
656     c = bracount - (c - 1);
657     }
658    
659     c = -(ESC_REF + c);
660     break;
661    
662 nigel 77 /* The handling of escape sequences consisting of a string of digits
663     starting with one that is not zero is not straightforward. By experiment,
664     the way Perl works seems to be as follows:
665    
666     Outside a character class, the digits are read as a decimal number. If the
667     number is less than 10, or if there are that many previous extracting
668     left brackets, then it is a back reference. Otherwise, up to three octal
669     digits are read to form an escaped byte. Thus \123 is likely to be octal
670     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671     value is greater than 377, the least significant 8 bits are taken. Inside a
672     character class, \ followed by a digit is always an octal number. */
673    
674 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676 nigel 77
677     if (!isclass)
678     {
679     oldptr = ptr;
680 ph10 391 c -= CHAR_0;
681 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
683 ph10 333 if (c < 0) /* Integer overflow */
684 ph10 213 {
685     *errorcodeptr = ERR61;
686 ph10 220 break;
687     }
688 nigel 77 if (c < 10 || c <= bracount)
689     {
690     c = -(ESC_REF + c);
691     break;
692     }
693     ptr = oldptr; /* Put the pointer back and fall through */
694     }
695    
696     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697     generates a binary zero byte and treats the digit as a following literal.
698     Thus we have to pull back the pointer by one. */
699    
700 ph10 391 if ((c = *ptr) >= CHAR_8)
701 nigel 77 {
702     ptr--;
703     c = 0;
704     break;
705     }
706    
707     /* \0 always starts an octal number, but we may drop through to here with a
708 nigel 91 larger first octal digit. The original code used just to take the least
709     significant 8 bits of octal numbers (I think this is what early Perls used
710     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711     than 3 octal digits. */
712 nigel 77
713 ph10 391 case CHAR_0:
714     c -= CHAR_0;
715     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716     c = c * 8 + *(++ptr) - CHAR_0;
717 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 nigel 77 break;
719    
720 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
721     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722     treated as a data character. */
723 nigel 77
724 ph10 391 case CHAR_x:
725     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 nigel 77 {
727     const uschar *pt = ptr + 2;
728 nigel 87 int count = 0;
729    
730 nigel 77 c = 0;
731     while ((digitab[*pt] & ctype_xdigit) != 0)
732     {
733 nigel 87 register int cc = *pt++;
734 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 nigel 77 count++;
736 nigel 87
737 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 ph10 97 #else /* EBCDIC coding */
741 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 nigel 77 #endif
744     }
745 nigel 87
746 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 nigel 77 {
748 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 nigel 77 ptr = pt;
750     break;
751     }
752 nigel 87
753 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
754     recognize this construct; fall through to the normal \x handling. */
755     }
756    
757 nigel 87 /* Read just a single-byte hex-defined char */
758 nigel 77
759     c = 0;
760     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761     {
762 ph10 391 int cc; /* Some compilers don't like */
763     cc = *(++ptr); /* ++ in initializers */
764     #ifndef EBCDIC /* ASCII/UTF-8 coding */
765     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 ph10 97 #else /* EBCDIC coding */
768 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 nigel 77 #endif
771     }
772     break;
773    
774 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775     This coding is ASCII-specific, but then the whole concept of \cx is
776     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777 nigel 77
778 ph10 391 case CHAR_c:
779 nigel 77 c = *(++ptr);
780     if (c == 0)
781     {
782     *errorcodeptr = ERR2;
783 ph10 213 break;
784 nigel 77 }
785    
786 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 nigel 77 c ^= 0x40;
789 ph10 97 #else /* EBCDIC coding */
790 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 nigel 77 c ^= 0xC0;
792     #endif
793     break;
794    
795     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797     otherwise, for Perl compatibility, it is a literal. This code looks a bit
798     odd, but there used to be some cases other than the default, and there may
799     be again in future, so I haven't "optimized" it. */
800 nigel 77
801     default:
802     if ((options & PCRE_EXTRA) != 0) switch(c)
803     {
804     default:
805     *errorcodeptr = ERR3;
806     break;
807     }
808     break;
809     }
810     }
811    
812     *ptrptr = ptr;
813     return c;
814     }
815    
816    
817    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Decode the property name that follows \P or \p and look it up in the
property table. The name is either a single character or a sequence enclosed
in {}, where an immediately following ^ requests negation. On entry *ptrptr
addresses the P or p; on return it addresses the last character consumed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that receives the table entry's value
  errorcodeptr   points to the error code variable

Returns:         the type field of the matching table entry, or -1 after
                 setting *errorcodeptr (ERR46 for a malformed sequence,
                 ERR47 for a name that is not in the table)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, n, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  /* The short form: exactly one character names the property. */

  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* The braced form, with an optional leading circumflex. */

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    ptr++;
    *negptr = TRUE;
    }

  /* Collect characters up to the closing brace. Hitting the end of the
  pattern, or filling the buffer before a brace is seen, is an error. */

  n = 0;
  for (;;)
    {
    if (n >= (int)sizeof(name) - 1) goto MALFORMED;
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[n++] = ch;
    }
  name[n] = 0;
  }

*ptrptr = ptr;

/* Binary chop over the half-open range [lo, hi) of table entries. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* The name was well formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

MALFORMED:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
907    
908    
909    
910    
911     /*************************************************
912     * Check for counted repeat *
913     *************************************************/
914    
915     /* This function is called when a '{' is encountered in a place where it might
916     start a quantifier. It looks ahead to see if it really is a quantifier or not.
917     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918     where the ddds are digits.
919    
920     Arguments:
921     p pointer to the first char after '{'
922    
923     Returns: TRUE or FALSE
924     */
925    
926     static BOOL
927     is_counted_repeat(const uschar *p)
928     {
929     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930     while ((digitab[*p] & ctype_digit) != 0) p++;
931 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932 nigel 77
933 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
934     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 nigel 77
936     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937     while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 nigel 77 }
941    
942    
943    
944     /*************************************************
945     * Read repeat counts *
946     *************************************************/
947    
948     /* Read an item of the form {n,m} and return the values. This is called only
949     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950     so the syntax is guaranteed to be correct, but we need to check the values.
951    
952     Arguments:
953     p pointer to first char after '{'
954     minp pointer to int for min
955     maxp pointer to int for max
956     returned as -1 if no max
957     errorcodeptr points to error code variable
958    
959     Returns: pointer to '}' on success;
960     current ptr on error, with errorcodeptr set non-zero
961     */
962    
963     static const uschar *
964     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965     {
966     int min = 0;
967     int max = -1;
968    
969 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
970     an integer overflow. */
971    
972 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 nigel 81 if (min < 0 || min > 65535)
974     {
975     *errorcodeptr = ERR5;
976     return p;
977     }
978 nigel 77
979 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
980     Also, max must not be less than min. */
981    
982 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 nigel 77 {
984 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 nigel 77 {
986     max = 0;
987 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 nigel 81 if (max < 0 || max > 65535)
989     {
990     *errorcodeptr = ERR5;
991     return p;
992     }
993 nigel 77 if (max < min)
994     {
995     *errorcodeptr = ERR4;
996     return p;
997     }
998     }
999     }
1000    
1001 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1002     '}'. */
1003 nigel 77
1004 nigel 81 *minp = min;
1005     *maxp = max;
1006 nigel 77 return p;
1007     }
1008    
1009    
1010    
1011     /*************************************************
1012 nigel 93 * Find forward referenced subpattern *
1013 nigel 91 *************************************************/
1014    
1015 nigel 93 /* This function scans along a pattern's text looking for capturing
1016     subpatterns, and counting them. If it finds a named pattern that matches the
1017     name it is given, it returns its number. Alternatively, if the name is NULL, it
1018     returns when it reaches a given numbered subpattern. This is used for forward
1019     references to subpatterns. We know that if (?P< is encountered, the name will
1020     be terminated by '>' because that is checked in the first pass.
1021 nigel 91
1022     Arguments:
1023 nigel 93 ptr current position in the pattern
1024 ph10 345 cd compile background data
1025 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1026     lorn name length, or subpattern number if name is NULL
1027     xmode TRUE if we are in /x mode
1028 nigel 91
1029     Returns: the number of the named subpattern, or -1 if not found
1030     */
1031    
1032     static int
1033 ph10 341 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
1034 nigel 93 BOOL xmode)
1035 nigel 91 {
1036     const uschar *thisname;
1037 ph10 341 int count = cd->bracount;
1038 nigel 93
1039 nigel 91 for (; *ptr != 0; ptr++)
1040     {
1041 nigel 93 int term;
1042    
1043     /* Skip over backslashed characters and also entire \Q...\E */
1044    
1045 ph10 391 if (*ptr == CHAR_BACKSLASH)
1046 nigel 93 {
1047     if (*(++ptr) == 0) return -1;
1048 ph10 391 if (*ptr == CHAR_Q) for (;;)
1049 nigel 93 {
1050 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1051 nigel 93 if (*ptr == 0) return -1;
1052 ph10 391 if (*(++ptr) == CHAR_E) break;
1053 nigel 93 }
1054     continue;
1055     }
1056    
1057 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1058     are handled for real. If the first character is '^', skip it. Also, if the
1059     first few characters (either before or after ^) are \Q\E or \E we skip them
1060 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1061 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1062 nigel 93
1063 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1064 nigel 93 {
1065 ph10 340 BOOL negate_class = FALSE;
1066     for (;;)
1067     {
1068     int c = *(++ptr);
1069 ph10 391 if (c == CHAR_BACKSLASH)
1070 ph10 340 {
1071 ph10 392 if (ptr[1] == CHAR_E)
1072 ph10 391 ptr++;
1073 ph10 392 else if (strncmp((const char *)ptr+1,
1074     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1075 ph10 391 ptr += 3;
1076 ph10 392 else
1077 ph10 391 break;
1078 ph10 340 }
1079 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1080 ph10 340 negate_class = TRUE;
1081     else break;
1082     }
1083    
1084     /* If the next character is ']', it is a data character that must be
1085 ph10 341 skipped, except in JavaScript compatibility mode. */
1086 ph10 345
1087 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1088 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1089 ph10 345 ptr++;
1090    
1091 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1092 nigel 93 {
1093 ph10 220 if (*ptr == 0) return -1;
1094 ph10 391 if (*ptr == CHAR_BACKSLASH)
1095 nigel 93 {
1096     if (*(++ptr) == 0) return -1;
1097 ph10 391 if (*ptr == CHAR_Q) for (;;)
1098 nigel 93 {
1099 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1100 nigel 93 if (*ptr == 0) return -1;
1101 ph10 391 if (*(++ptr) == CHAR_E) break;
1102 nigel 93 }
1103     continue;
1104     }
1105     }
1106     continue;
1107     }
1108    
1109     /* Skip comments in /x mode */
1110    
1111 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1112 nigel 93 {
1113 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1114 nigel 93 if (*ptr == 0) return -1;
1115     continue;
1116     }
1117    
1118     /* An opening parens must now be a real metacharacter */
1119    
1120 ph10 391 if (*ptr != CHAR_LEFT_PARENTHESIS) continue;
1121     if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1122 nigel 93 {
1123     count++;
1124     if (name == NULL && count == lorn) return count;
1125     continue;
1126     }
1127    
1128     ptr += 2;
1129 ph10 391 if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1130 nigel 93
1131     /* We have to disambiguate (?<! and (?<= from (?<name> */
1132    
1133 ph10 392 if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK ||
1134 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE)
1135 nigel 93 continue;
1136    
1137 nigel 91 count++;
1138 nigel 93
1139     if (name == NULL && count == lorn) return count;
1140     term = *ptr++;
1141 ph10 391 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1142 nigel 91 thisname = ptr;
1143 nigel 93 while (*ptr != term) ptr++;
1144     if (name != NULL && lorn == ptr - thisname &&
1145     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1146 nigel 91 return count;
1147     }
1148 nigel 93
1149 nigel 91 return -1;
1150     }
1151    
1152    
1153    
1154     /*************************************************
1155 nigel 77 * Find first significant op code *
1156     *************************************************/
1157    
1158     /* This is called by several functions that scan a compiled expression looking
1159     for a fixed first character, or an anchoring op code etc. It skips over things
1160     that do not influence this. For some calls, a change of option is important.
1161     For some calls, it makes sense to skip negative forward and all backward
1162     assertions, and also the \b assertion; for others it does not.
1163    
1164     Arguments:
1165     code pointer to the start of the group
1166     options pointer to external options
1167     optbit the option bit whose changing is significant, or
1168     zero if none are
1169     skipassert TRUE if certain assertions are to be skipped
1170    
1171     Returns: pointer to the first significant opcode
1172     */
1173    
1174     static const uschar*
1175     first_significant_code(const uschar *code, int *options, int optbit,
1176     BOOL skipassert)
1177     {
1178     for (;;)
1179     {
1180     switch ((int)*code)
1181     {
1182     case OP_OPT:
1183     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1184     *options = (int)code[1];
1185     code += 2;
1186     break;
1187    
1188     case OP_ASSERT_NOT:
1189     case OP_ASSERTBACK:
1190     case OP_ASSERTBACK_NOT:
1191     if (!skipassert) return code;
1192     do code += GET(code, 1); while (*code == OP_ALT);
1193     code += _pcre_OP_lengths[*code];
1194     break;
1195    
1196     case OP_WORD_BOUNDARY:
1197     case OP_NOT_WORD_BOUNDARY:
1198     if (!skipassert) return code;
1199     /* Fall through */
1200    
1201     case OP_CALLOUT:
1202     case OP_CREF:
1203 nigel 93 case OP_RREF:
1204     case OP_DEF:
1205 nigel 77 code += _pcre_OP_lengths[*code];
1206     break;
1207    
1208     default:
1209     return code;
1210     }
1211     }
1212     /* Control never reaches here */
1213     }
1214    
1215    
1216    
1217    
1218     /*************************************************
1219     * Find the fixed length of a pattern *
1220     *************************************************/
1221    
1222     /* Scan a pattern and compute the fixed length of subject that will match it,
1223     if the length is fixed. This is needed for dealing with backward assertions.
1224     In UTF8 mode, the result is in characters rather than bytes.
1225    
1226     Arguments:
1227     code points to the start of the pattern (the bracket)
1228     options the compiling options
1229    
1230     Returns: the fixed length, or -1 if there is no fixed length,
1231     or -2 if \C was encountered
1232     */
1233    
1234     static int
1235     find_fixedlength(uschar *code, int options)
1236     {
1237     int length = -1;
1238    
1239     register int branchlength = 0;
1240     register uschar *cc = code + 1 + LINK_SIZE;
1241    
1242     /* Scan along the opcodes for this branch. If we get to the end of the
1243     branch, check the length against that of the other branches. */
1244    
1245     for (;;)
1246     {
1247     int d;
1248     register int op = *cc;
1249     switch (op)
1250     {
1251 nigel 93 case OP_CBRA:
1252 nigel 77 case OP_BRA:
1253     case OP_ONCE:
1254     case OP_COND:
1255 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1256 nigel 77 if (d < 0) return d;
1257     branchlength += d;
1258     do cc += GET(cc, 1); while (*cc == OP_ALT);
1259     cc += 1 + LINK_SIZE;
1260     break;
1261    
1262     /* Reached end of a branch; if it's a ket it is the end of a nested
1263     call. If it's ALT it is an alternation in a nested call. If it is
1264     END it's the end of the outer call. All can be handled by the same code. */
1265    
1266     case OP_ALT:
1267     case OP_KET:
1268     case OP_KETRMAX:
1269     case OP_KETRMIN:
1270     case OP_END:
1271     if (length < 0) length = branchlength;
1272     else if (length != branchlength) return -1;
1273     if (*cc != OP_ALT) return length;
1274     cc += 1 + LINK_SIZE;
1275     branchlength = 0;
1276     break;
1277    
1278     /* Skip over assertive subpatterns */
1279    
1280     case OP_ASSERT:
1281     case OP_ASSERT_NOT:
1282     case OP_ASSERTBACK:
1283     case OP_ASSERTBACK_NOT:
1284     do cc += GET(cc, 1); while (*cc == OP_ALT);
1285     /* Fall through */
1286    
1287     /* Skip over things that don't match chars */
1288    
1289     case OP_REVERSE:
1290     case OP_CREF:
1291 nigel 93 case OP_RREF:
1292     case OP_DEF:
1293 nigel 77 case OP_OPT:
1294     case OP_CALLOUT:
1295     case OP_SOD:
1296     case OP_SOM:
1297     case OP_EOD:
1298     case OP_EODN:
1299     case OP_CIRC:
1300     case OP_DOLL:
1301     case OP_NOT_WORD_BOUNDARY:
1302     case OP_WORD_BOUNDARY:
1303     cc += _pcre_OP_lengths[*cc];
1304     break;
1305    
1306     /* Handle literal characters */
1307    
1308     case OP_CHAR:
1309     case OP_CHARNC:
1310 nigel 91 case OP_NOT:
1311 nigel 77 branchlength++;
1312     cc += 2;
1313     #ifdef SUPPORT_UTF8
1314     if ((options & PCRE_UTF8) != 0)
1315     {
1316     while ((*cc & 0xc0) == 0x80) cc++;
1317     }
1318     #endif
1319     break;
1320    
1321     /* Handle exact repetitions. The count is already in characters, but we
1322     need to skip over a multibyte character in UTF8 mode. */
1323    
1324     case OP_EXACT:
1325     branchlength += GET2(cc,1);
1326     cc += 4;
1327     #ifdef SUPPORT_UTF8
1328     if ((options & PCRE_UTF8) != 0)
1329     {
1330     while((*cc & 0x80) == 0x80) cc++;
1331     }
1332     #endif
1333     break;
1334    
1335     case OP_TYPEEXACT:
1336     branchlength += GET2(cc,1);
1337 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1338 nigel 77 cc += 4;
1339     break;
1340    
1341     /* Handle single-char matchers */
1342    
1343     case OP_PROP:
1344     case OP_NOTPROP:
1345 nigel 87 cc += 2;
1346 nigel 77 /* Fall through */
1347    
1348     case OP_NOT_DIGIT:
1349     case OP_DIGIT:
1350     case OP_NOT_WHITESPACE:
1351     case OP_WHITESPACE:
1352     case OP_NOT_WORDCHAR:
1353     case OP_WORDCHAR:
1354     case OP_ANY:
1355 ph10 342 case OP_ALLANY:
1356 nigel 77 branchlength++;
1357     cc++;
1358     break;
1359    
1360     /* The single-byte matcher isn't allowed */
1361    
1362     case OP_ANYBYTE:
1363     return -2;
1364    
1365     /* Check a class for variable quantification */
1366    
1367     #ifdef SUPPORT_UTF8
1368     case OP_XCLASS:
1369     cc += GET(cc, 1) - 33;
1370     /* Fall through */
1371     #endif
1372    
1373     case OP_CLASS:
1374     case OP_NCLASS:
1375     cc += 33;
1376    
1377     switch (*cc)
1378     {
1379     case OP_CRSTAR:
1380     case OP_CRMINSTAR:
1381     case OP_CRQUERY:
1382     case OP_CRMINQUERY:
1383     return -1;
1384    
1385     case OP_CRRANGE:
1386     case OP_CRMINRANGE:
1387     if (GET2(cc,1) != GET2(cc,3)) return -1;
1388     branchlength += GET2(cc,1);
1389     cc += 5;
1390     break;
1391    
1392     default:
1393     branchlength++;
1394     }
1395     break;
1396    
1397     /* Anything else is variable length */
1398    
1399     default:
1400     return -1;
1401     }
1402     }
1403     /* Control never gets here */
1404     }
1405    
1406    
1407    
1408    
1409     /*************************************************
1410     * Scan compiled regex for numbered bracket *
1411     *************************************************/
1412    
1413     /* This little function scans through a compiled pattern until it finds a
1414     capturing bracket with the given number.
1415    
1416     Arguments:
1417     code points to start of expression
1418     utf8 TRUE in UTF-8 mode
1419     number the required bracket number
1420    
1421     Returns: pointer to the opcode for the bracket, or NULL if not found
1422     */
1423    
1424     static const uschar *
1425     find_bracket(const uschar *code, BOOL utf8, int number)
1426     {
1427     for (;;)
1428     {
1429     register int c = *code;
1430     if (c == OP_END) return NULL;
1431 nigel 91
1432     /* XCLASS is used for classes that cannot be represented just by a bit
1433     map. This includes negated single high-valued characters. The length in
1434     the table is zero; the actual length is stored in the compiled code. */
1435    
1436     if (c == OP_XCLASS) code += GET(code, 1);
1437    
1438 nigel 93 /* Handle capturing bracket */
1439 nigel 91
1440 nigel 93 else if (c == OP_CBRA)
1441 nigel 77 {
1442 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1443 nigel 77 if (n == number) return (uschar *)code;
1444 nigel 93 code += _pcre_OP_lengths[c];
1445 nigel 77 }
1446 nigel 91
1447 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1448     repeated character types, we have to test for \p and \P, which have an extra
1449 ph10 218 two bytes of parameters. */
1450 nigel 91
1451 nigel 77 else
1452     {
1453 ph10 218 switch(c)
1454     {
1455     case OP_TYPESTAR:
1456     case OP_TYPEMINSTAR:
1457     case OP_TYPEPLUS:
1458     case OP_TYPEMINPLUS:
1459     case OP_TYPEQUERY:
1460     case OP_TYPEMINQUERY:
1461     case OP_TYPEPOSSTAR:
1462     case OP_TYPEPOSPLUS:
1463     case OP_TYPEPOSQUERY:
1464     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1465 ph10 220 break;
1466 ph10 221
1467     case OP_TYPEUPTO:
1468     case OP_TYPEMINUPTO:
1469     case OP_TYPEEXACT:
1470     case OP_TYPEPOSUPTO:
1471     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1472     break;
1473 ph10 220 }
1474    
1475 ph10 218 /* Add in the fixed length from the table */
1476 ph10 220
1477 nigel 77 code += _pcre_OP_lengths[c];
1478 ph10 220
1479 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1480     a multi-byte character. The length in the table is a minimum, so we have to
1481     arrange to skip the extra bytes. */
1482 ph10 220
1483 ph10 107 #ifdef SUPPORT_UTF8
1484 nigel 77 if (utf8) switch(c)
1485     {
1486     case OP_CHAR:
1487     case OP_CHARNC:
1488     case OP_EXACT:
1489     case OP_UPTO:
1490     case OP_MINUPTO:
1491 nigel 93 case OP_POSUPTO:
1492 nigel 77 case OP_STAR:
1493     case OP_MINSTAR:
1494 nigel 93 case OP_POSSTAR:
1495 nigel 77 case OP_PLUS:
1496     case OP_MINPLUS:
1497 nigel 93 case OP_POSPLUS:
1498 nigel 77 case OP_QUERY:
1499     case OP_MINQUERY:
1500 nigel 93 case OP_POSQUERY:
1501     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1502 nigel 77 break;
1503     }
1504 ph10 369 #else
1505     (void)(utf8); /* Keep compiler happy by referencing function argument */
1506 ph10 111 #endif
1507 nigel 77 }
1508     }
1509     }
1510    
1511    
1512    
1513     /*************************************************
1514     * Scan compiled regex for recursion reference *
1515     *************************************************/
1516    
1517     /* This little function scans through a compiled pattern until it finds an
1518     instance of OP_RECURSE.
1519    
1520     Arguments:
1521     code points to start of expression
1522     utf8 TRUE in UTF-8 mode
1523    
1524     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1525     */
1526    
1527     static const uschar *
1528     find_recurse(const uschar *code, BOOL utf8)
1529     {
1530     for (;;)
1531     {
1532     register int c = *code;
1533     if (c == OP_END) return NULL;
1534 nigel 91 if (c == OP_RECURSE) return code;
1535 ph10 220
1536 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1537     map. This includes negated single high-valued characters. The length in
1538     the table is zero; the actual length is stored in the compiled code. */
1539    
1540     if (c == OP_XCLASS) code += GET(code, 1);
1541    
1542 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1543     repeated character types, we have to test for \p and \P, which have an extra
1544 ph10 218 two bytes of parameters. */
1545 nigel 91
1546 nigel 77 else
1547     {
1548 ph10 218 switch(c)
1549     {
1550     case OP_TYPESTAR:
1551     case OP_TYPEMINSTAR:
1552     case OP_TYPEPLUS:
1553     case OP_TYPEMINPLUS:
1554     case OP_TYPEQUERY:
1555     case OP_TYPEMINQUERY:
1556     case OP_TYPEPOSSTAR:
1557     case OP_TYPEPOSPLUS:
1558     case OP_TYPEPOSQUERY:
1559     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1560 ph10 220 break;
1561 ph10 221
1562     case OP_TYPEPOSUPTO:
1563     case OP_TYPEUPTO:
1564     case OP_TYPEMINUPTO:
1565     case OP_TYPEEXACT:
1566     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1567     break;
1568 ph10 220 }
1569    
1570 ph10 218 /* Add in the fixed length from the table */
1571    
1572 nigel 77 code += _pcre_OP_lengths[c];
1573 ph10 220
1574 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1575     by a multi-byte character. The length in the table is a minimum, so we have
1576     to arrange to skip the extra bytes. */
1577 ph10 220
1578 ph10 107 #ifdef SUPPORT_UTF8
1579 nigel 77 if (utf8) switch(c)
1580     {
1581     case OP_CHAR:
1582     case OP_CHARNC:
1583     case OP_EXACT:
1584     case OP_UPTO:
1585     case OP_MINUPTO:
1586 nigel 93 case OP_POSUPTO:
1587 nigel 77 case OP_STAR:
1588     case OP_MINSTAR:
1589 nigel 93 case OP_POSSTAR:
1590 nigel 77 case OP_PLUS:
1591     case OP_MINPLUS:
1592 nigel 93 case OP_POSPLUS:
1593 nigel 77 case OP_QUERY:
1594     case OP_MINQUERY:
1595 nigel 93 case OP_POSQUERY:
1596     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1597 nigel 77 break;
1598     }
1599 ph10 369 #else
1600     (void)(utf8); /* Keep compiler happy by referencing function argument */
1601 ph10 111 #endif
1602 nigel 77 }
1603     }
1604     }
1605    
1606    
1607    
1608     /*************************************************
1609     * Scan compiled branch for non-emptiness *
1610     *************************************************/
1611    
1612     /* This function scans through a branch of a compiled pattern to see whether it
1613 nigel 93 can match the empty string or not. It is called from could_be_empty()
1614     below and from compile_branch() when checking for an unlimited repeat of a
1615     group that can match nothing. Note that first_significant_code() skips over
1616 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1617     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1618     bracket whose current branch will already have been scanned.
1619 nigel 77
1620     Arguments:
1621     code points to start of search
1622     endcode points to where to stop
1623     utf8 TRUE if in UTF8 mode
1624    
1625     Returns: TRUE if what is matched could be empty
1626     */
1627    
1628     static BOOL
1629     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1630     {
1631     register int c;
1632 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1633 nigel 77 code < endcode;
1634     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1635     {
1636     const uschar *ccode;
1637    
1638     c = *code;
1639 ph10 286
1640     /* Skip over forward assertions; the other assertions are skipped by
1641 ph10 282 first_significant_code() with a TRUE final argument. */
1642 ph10 286
1643 ph10 282 if (c == OP_ASSERT)
1644 ph10 286 {
1645 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1646     c = *code;
1647     continue;
1648 ph10 286 }
1649 ph10 172
1650 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1651 nigel 77
1652 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1653 ph10 170 {
1654 ph10 172 code += _pcre_OP_lengths[c];
1655 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1656     c = *code;
1657     continue;
1658     }
1659    
1660     /* For other groups, scan the branches. */
1661 ph10 172
1662 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1663 nigel 77 {
1664     BOOL empty_branch;
1665     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1666 ph10 395
1667     /* If a conditional group has only one branch, there is a second, implied,
1668     empty branch, so just skip over the conditional, because it could be empty.
1669     Otherwise, scan the individual branches of the group. */
1670    
1671     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1672 nigel 77 code += GET(code, 1);
1673 ph10 395 else
1674     {
1675     empty_branch = FALSE;
1676     do
1677     {
1678     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1679     empty_branch = TRUE;
1680     code += GET(code, 1);
1681     }
1682     while (*code == OP_ALT);
1683     if (!empty_branch) return FALSE; /* All branches are non-empty */
1684 nigel 77 }
1685 ph10 395
1686 ph10 172 c = *code;
1687 nigel 93 continue;
1688 nigel 77 }
1689    
1690 nigel 93 /* Handle the other opcodes */
1691    
1692     switch (c)
1693 nigel 77 {
1694 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1695     cannot be represented just by a bit map. This includes negated single
1696     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1697 ph10 220 actual length is stored in the compiled code, so we must update "code"
1698 ph10 216 here. */
1699 nigel 77
1700     #ifdef SUPPORT_UTF8
1701     case OP_XCLASS:
1702 ph10 216 ccode = code += GET(code, 1);
1703 nigel 77 goto CHECK_CLASS_REPEAT;
1704     #endif
1705    
1706     case OP_CLASS:
1707     case OP_NCLASS:
1708     ccode = code + 33;
1709    
1710     #ifdef SUPPORT_UTF8
1711     CHECK_CLASS_REPEAT:
1712     #endif
1713    
1714     switch (*ccode)
1715     {
1716     case OP_CRSTAR: /* These could be empty; continue */
1717     case OP_CRMINSTAR:
1718     case OP_CRQUERY:
1719     case OP_CRMINQUERY:
1720     break;
1721    
1722     default: /* Non-repeat => class must match */
1723     case OP_CRPLUS: /* These repeats aren't empty */
1724     case OP_CRMINPLUS:
1725     return FALSE;
1726    
1727     case OP_CRRANGE:
1728     case OP_CRMINRANGE:
1729     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1730     break;
1731     }
1732     break;
1733    
1734     /* Opcodes that must match a character */
1735    
1736     case OP_PROP:
1737     case OP_NOTPROP:
1738     case OP_EXTUNI:
1739     case OP_NOT_DIGIT:
1740     case OP_DIGIT:
1741     case OP_NOT_WHITESPACE:
1742     case OP_WHITESPACE:
1743     case OP_NOT_WORDCHAR:
1744     case OP_WORDCHAR:
1745     case OP_ANY:
1746 ph10 345 case OP_ALLANY:
1747 nigel 77 case OP_ANYBYTE:
1748     case OP_CHAR:
1749     case OP_CHARNC:
1750     case OP_NOT:
1751     case OP_PLUS:
1752     case OP_MINPLUS:
1753 nigel 93 case OP_POSPLUS:
1754 nigel 77 case OP_EXACT:
1755     case OP_NOTPLUS:
1756     case OP_NOTMINPLUS:
1757 nigel 93 case OP_NOTPOSPLUS:
1758 nigel 77 case OP_NOTEXACT:
1759     case OP_TYPEPLUS:
1760     case OP_TYPEMINPLUS:
1761 nigel 93 case OP_TYPEPOSPLUS:
1762 nigel 77 case OP_TYPEEXACT:
1763     return FALSE;
1764 ph10 227
1765     /* These are going to continue, as they may be empty, but we have to
1766     fudge the length for the \p and \P cases. */
1767    
1768 ph10 224 case OP_TYPESTAR:
1769     case OP_TYPEMINSTAR:
1770     case OP_TYPEPOSSTAR:
1771     case OP_TYPEQUERY:
1772     case OP_TYPEMINQUERY:
1773     case OP_TYPEPOSQUERY:
1774     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1775 ph10 227 break;
1776    
1777 ph10 224 /* Same for these */
1778 ph10 227
1779 ph10 224 case OP_TYPEUPTO:
1780     case OP_TYPEMINUPTO:
1781     case OP_TYPEPOSUPTO:
1782     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1783     break;
1784 nigel 77
1785     /* End of branch */
1786    
1787     case OP_KET:
1788     case OP_KETRMAX:
1789     case OP_KETRMIN:
1790     case OP_ALT:
1791     return TRUE;
1792    
1793 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1794     MINUPTO, and POSUPTO may be followed by a multibyte character */
1795 nigel 77
1796     #ifdef SUPPORT_UTF8
1797     case OP_STAR:
1798     case OP_MINSTAR:
1799 nigel 93 case OP_POSSTAR:
1800 nigel 77 case OP_QUERY:
1801     case OP_MINQUERY:
1802 nigel 93 case OP_POSQUERY:
1803 nigel 77 case OP_UPTO:
1804     case OP_MINUPTO:
1805 nigel 93 case OP_POSUPTO:
1806 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1807     break;
1808     #endif
1809     }
1810     }
1811    
1812     return TRUE;
1813     }
1814    
1815    
1816    
1817     /*************************************************
1818     * Scan compiled regex for non-emptiness *
1819     *************************************************/
1820    
1821     /* This function is called to check for left recursive calls. We want to check
1822     the current branch of the current pattern to see if it could match the empty
1823     string. If it could, we must look outwards for branches at other levels,
1824     stopping when we pass beyond the bracket which is the subject of the recursion.
1825    
1826     Arguments:
1827     code points to start of the recursion
1828     endcode points to where to stop (current RECURSE item)
1829     bcptr points to the chain of current (unclosed) branch starts
1830     utf8 TRUE if in UTF-8 mode
1831    
1832     Returns: TRUE if what is matched could be empty
1833     */
1834    
1835     static BOOL
1836     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1837     BOOL utf8)
1838     {
1839     while (bcptr != NULL && bcptr->current >= code)
1840     {
1841     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1842     bcptr = bcptr->outer;
1843     }
1844     return TRUE;
1845     }
1846    
1847    
1848    
1849     /*************************************************
1850     * Check for POSIX class syntax *
1851     *************************************************/
1852    
1853     /* This function is called when the sequence "[:" or "[." or "[=" is
1854 ph10 295 encountered in a character class. It checks whether this is followed by a
1855 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1856 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1857 nigel 77
1858 ph10 298 Originally, this function only recognized a sequence of letters between the
1859     terminators, but it seems that Perl recognizes any sequence of characters,
1860     though of course unknown POSIX names are subsequently rejected. Perl gives an
1861     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1862     didn't consider this to be a POSIX class. Likewise for [:1234:].
1863 ph10 295
1864 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1865     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1866     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1867     below handles the special case of \], but does not try to do any other escape
1868     processing. This makes it different from Perl for cases such as [:l\ower:]
1869 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1870 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1871 ph10 295 I think.
1872    
1873     Arguments:
1874 nigel 77 ptr pointer to the initial [
1875     endptr where to return the end pointer
1876    
1877     Returns: TRUE or FALSE
1878     */
1879    
1880     static BOOL
1881 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1882 nigel 77 {
1883     int terminator; /* Don't combine these lines; the Solaris cc */
1884     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1885 ph10 295 for (++ptr; *ptr != 0; ptr++)
1886 nigel 77 {
1887 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
1888 ph10 298 {
1889 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
1890     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
1891 ph10 295 {
1892     *endptr = ptr;
1893     return TRUE;
1894 ph10 298 }
1895     }
1896     }
1897 nigel 77 return FALSE;
1898     }
1899    
1900    
1901    
1902    
1903     /*************************************************
1904     * Check POSIX class name *
1905     *************************************************/
1906    
1907     /* This function is called to check the name given in a POSIX-style class entry
1908     such as [:alnum:].
1909    
1910     Arguments:
1911     ptr points to the first letter
1912     len the length of the name
1913    
1914     Returns: a value representing the name, or -1 if unknown
1915     */
1916    
1917     static int
1918     check_posix_name(const uschar *ptr, int len)
1919     {
1920 ph10 240 const char *pn = posix_names;
1921 nigel 77 register int yield = 0;
1922     while (posix_name_lengths[yield] != 0)
1923     {
1924     if (len == posix_name_lengths[yield] &&
1925 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1926 ph10 243 pn += posix_name_lengths[yield] + 1;
1927 nigel 77 yield++;
1928     }
1929     return -1;
1930     }
1931    
1932    
1933     /*************************************************
1934     * Adjust OP_RECURSE items in repeated group *
1935     *************************************************/
1936    
1937     /* OP_RECURSE items contain an offset from the start of the regex to the group
1938     that is referenced. This means that groups can be replicated for fixed
1939     repetition simply by copying (because the recursion is allowed to refer to
1940     earlier groups that are outside the current group). However, when a group is
1941 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
1942     inserted before it, after it has been compiled. This means that any OP_RECURSE
1943     items within it that refer to the group itself or any contained groups have to
1944     have their offsets adjusted. That one of the jobs of this function. Before it
1945     is called, the partially compiled regex must be temporarily terminated with
1946     OP_END.
1947 nigel 77
1948 nigel 93 This function has been extended with the possibility of forward references for
1949     recursions and subroutine calls. It must also check the list of such references
1950     for the group we are dealing with. If it finds that one of the recursions in
1951     the current group is on this list, it adjusts the offset in the list, not the
1952     value in the reference (which is a group number).
1953    
1954 nigel 77 Arguments:
1955     group points to the start of the group
1956     adjust the amount by which the group is to be moved
1957     utf8 TRUE in UTF-8 mode
1958     cd contains pointers to tables etc.
1959 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1960 nigel 77
1961     Returns: nothing
1962     */
1963    
1964     static void
1965 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1966     uschar *save_hwm)
1967 nigel 77 {
1968     uschar *ptr = group;
1969 ph10 224
1970 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1971     {
1972 nigel 93 int offset;
1973     uschar *hc;
1974    
1975     /* See if this recursion is on the forward reference list. If so, adjust the
1976     reference. */
1977 ph10 345
1978 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1979     {
1980     offset = GET(hc, 0);
1981     if (cd->start_code + offset == ptr + 1)
1982     {
1983     PUT(hc, 0, offset + adjust);
1984     break;
1985     }
1986     }
1987    
1988     /* Otherwise, adjust the recursion offset if it's after the start of this
1989     group. */
1990    
1991     if (hc >= cd->hwm)
1992     {
1993     offset = GET(ptr, 1);
1994     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1995     }
1996    
1997 nigel 77 ptr += 1 + LINK_SIZE;
1998     }
1999     }
2000    
2001    
2002    
2003     /*************************************************
2004     * Insert an automatic callout point *
2005     *************************************************/
2006    
2007     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2008     callout points before each pattern item.
2009    
2010     Arguments:
2011     code current code pointer
2012     ptr current pattern pointer
2013     cd pointers to tables etc
2014    
2015     Returns: new code pointer
2016     */
2017    
2018     static uschar *
2019     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2020     {
2021     *code++ = OP_CALLOUT;
2022     *code++ = 255;
2023     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2024     PUT(code, LINK_SIZE, 0); /* Default length */
2025     return code + 2*LINK_SIZE;
2026     }
2027    
2028    
2029    
2030     /*************************************************
2031     * Complete a callout item *
2032     *************************************************/
2033    
2034     /* A callout item contains the length of the next item in the pattern, which
2035     we can't fill in till after we have reached the relevant point. This is used
2036     for both automatic and manual callouts.
2037    
2038     Arguments:
2039     previous_callout points to previous callout item
2040     ptr current pattern pointer
2041     cd pointers to tables etc
2042    
2043     Returns: nothing
2044     */
2045    
2046     static void
2047     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2048     {
2049     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2050     PUT(previous_callout, 2 + LINK_SIZE, length);
2051     }
2052    
2053    
2054    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next such range, and
updates the start value so that a following call continues from where this one
stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first_other = 0;
unsigned int expected;

/* Find the first character in the range whose other case differs from the
character itself. */

while (c <= d)
  {
  first_other = UCD_OTHERCASE(c);
  if (first_other != c) break;
  c++;
  }

if (c > d) return FALSE;

/* Extend the range for as long as the other-case values of the following
characters are consecutive. */

*ocptr = first_other;
expected = first_other + 1;
c++;

while (c <= d && UCD_OTHERCASE(c) == expected)
  {
  c++;
  expected++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2100    
2101    
2102 nigel 93
2103 nigel 77 /*************************************************
2104 nigel 93 * Check if auto-possessifying is possible *
2105     *************************************************/
2106    
2107     /* This function is called for unlimited repeats of certain items, to see
2108     whether the next thing could possibly match the repeated item. If not, it makes
2109     sense to automatically possessify the repeated item.
2110    
2111     Arguments:
2112     op_code the repeated op code
2113     this data for this item, depends on the opcode
2114     utf8 TRUE in UTF-8 mode
2115     utf8_char used for utf8 character bytes, NULL if not relevant
2116     ptr next character in pattern
2117     options options bits
2118     cd contains pointers to tables etc.
2119    
2120     Returns: TRUE if possessifying is wanted
2121     */
2122    
2123     static BOOL
2124     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2125     const uschar *ptr, int options, compile_data *cd)
2126     {
2127     int next;
2128    
2129     /* Skip whitespace and comments in extended mode */
2130    
2131     if ((options & PCRE_EXTENDED) != 0)
2132     {
2133     for (;;)
2134     {
2135     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2136 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2137 nigel 93 {
2138     while (*(++ptr) != 0)
2139     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2140     }
2141     else break;
2142     }
2143     }
2144    
2145     /* If the next item is one that we can handle, get its value. A non-negative
2146     value is a character, a negative value is an escape value. */
2147    
2148 ph10 391 if (*ptr == CHAR_BACKSLASH)
2149 nigel 93 {
2150     int temperrorcode = 0;
2151     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2152     if (temperrorcode != 0) return FALSE;
2153     ptr++; /* Point after the escape sequence */
2154     }
2155    
2156     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2157     {
2158     #ifdef SUPPORT_UTF8
2159     if (utf8) { GETCHARINC(next, ptr); } else
2160     #endif
2161     next = *ptr++;
2162     }
2163    
2164     else return FALSE;
2165    
2166     /* Skip whitespace and comments in extended mode */
2167    
2168     if ((options & PCRE_EXTENDED) != 0)
2169     {
2170     for (;;)
2171     {
2172     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2173 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2174 nigel 93 {
2175     while (*(++ptr) != 0)
2176     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2177     }
2178     else break;
2179     }
2180     }
2181    
2182     /* If the next thing is itself optional, we have to give up. */
2183    
2184 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2185 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2186     return FALSE;
2187 nigel 93
2188     /* Now compare the next item with the previous opcode. If the previous is a
2189     positive single character match, "item" either contains the character or, if
2190     "item" is greater than 127 in utf8 mode, the character's bytes are in
2191     utf8_char. */
2192    
2193    
2194     /* Handle cases when the next item is a character. */
2195    
2196     if (next >= 0) switch(op_code)
2197     {
2198     case OP_CHAR:
2199     #ifdef SUPPORT_UTF8
2200     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2201 ph10 369 #else
2202     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2203 nigel 93 #endif
2204     return item != next;
2205    
2206     /* For CHARNC (caseless character) we must check the other case. If we have
2207     Unicode property support, we can use it to test the other case of
2208     high-valued characters. */
2209    
2210     case OP_CHARNC:
2211     #ifdef SUPPORT_UTF8
2212     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2213     #endif
2214     if (item == next) return FALSE;
2215     #ifdef SUPPORT_UTF8
2216     if (utf8)
2217     {
2218     unsigned int othercase;
2219     if (next < 128) othercase = cd->fcc[next]; else
2220     #ifdef SUPPORT_UCP
2221 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2222 nigel 93 #else
2223     othercase = NOTACHAR;
2224     #endif
2225     return (unsigned int)item != othercase;
2226     }
2227     else
2228     #endif /* SUPPORT_UTF8 */
2229     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2230    
2231     /* For OP_NOT, "item" must be a single-byte character. */
2232    
2233     case OP_NOT:
2234     if (item == next) return TRUE;
2235     if ((options & PCRE_CASELESS) == 0) return FALSE;
2236     #ifdef SUPPORT_UTF8
2237     if (utf8)
2238     {
2239     unsigned int othercase;
2240     if (next < 128) othercase = cd->fcc[next]; else
2241     #ifdef SUPPORT_UCP
2242 ph10 349 othercase = UCD_OTHERCASE(next);
2243 nigel 93 #else
2244     othercase = NOTACHAR;
2245     #endif
2246     return (unsigned int)item == othercase;
2247     }
2248     else
2249     #endif /* SUPPORT_UTF8 */
2250     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2251    
2252     case OP_DIGIT:
2253     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2254    
2255     case OP_NOT_DIGIT:
2256     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2257    
2258     case OP_WHITESPACE:
2259     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2260    
2261     case OP_NOT_WHITESPACE:
2262     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2263    
2264     case OP_WORDCHAR:
2265     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2266    
2267     case OP_NOT_WORDCHAR:
2268     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2269    
2270 ph10 180 case OP_HSPACE:
2271     case OP_NOT_HSPACE:
2272     switch(next)
2273     {
2274     case 0x09:
2275     case 0x20:
2276     case 0xa0:
2277     case 0x1680:
2278     case 0x180e:
2279     case 0x2000:
2280     case 0x2001:
2281     case 0x2002:
2282     case 0x2003:
2283     case 0x2004:
2284     case 0x2005:
2285     case 0x2006:
2286     case 0x2007:
2287     case 0x2008:
2288     case 0x2009:
2289     case 0x200A:
2290     case 0x202f:
2291     case 0x205f:
2292     case 0x3000:
2293     return op_code != OP_HSPACE;
2294     default:
2295     return op_code == OP_HSPACE;
2296     }
2297    
2298     case OP_VSPACE:
2299     case OP_NOT_VSPACE:
2300     switch(next)
2301     {
2302     case 0x0a:
2303     case 0x0b:
2304     case 0x0c:
2305     case 0x0d:
2306     case 0x85:
2307     case 0x2028:
2308     case 0x2029:
2309     return op_code != OP_VSPACE;
2310     default:
2311     return op_code == OP_VSPACE;
2312     }
2313    
2314 nigel 93 default:
2315     return FALSE;
2316     }
2317    
2318    
2319     /* Handle the case when the next item is \d, \s, etc. */
2320    
2321     switch(op_code)
2322     {
2323     case OP_CHAR:
2324     case OP_CHARNC:
2325     #ifdef SUPPORT_UTF8
2326     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2327     #endif
2328     switch(-next)
2329     {
2330     case ESC_d:
2331     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2332    
2333     case ESC_D:
2334     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2335    
2336     case ESC_s:
2337     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2338    
2339     case ESC_S:
2340     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2341    
2342     case ESC_w:
2343     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2344    
2345     case ESC_W:
2346     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2347 ph10 182
2348 ph10 180 case ESC_h:
2349     case ESC_H:
2350     switch(item)
2351     {
2352     case 0x09:
2353     case 0x20:
2354     case 0xa0:
2355     case 0x1680:
2356     case 0x180e:
2357     case 0x2000:
2358     case 0x2001:
2359     case 0x2002:
2360     case 0x2003:
2361     case 0x2004:
2362     case 0x2005:
2363     case 0x2006:
2364     case 0x2007:
2365     case 0x2008:
2366     case 0x2009:
2367     case 0x200A:
2368     case 0x202f:
2369     case 0x205f:
2370     case 0x3000:
2371     return -next != ESC_h;
2372     default:
2373     return -next == ESC_h;
2374 ph10 182 }
2375    
2376 ph10 180 case ESC_v:
2377     case ESC_V:
2378     switch(item)
2379     {
2380     case 0x0a:
2381     case 0x0b:
2382     case 0x0c:
2383     case 0x0d:
2384     case 0x85:
2385     case 0x2028:
2386     case 0x2029:
2387     return -next != ESC_v;
2388     default:
2389     return -next == ESC_v;
2390 ph10 182 }
2391 nigel 93
2392     default:
2393     return FALSE;
2394     }
2395    
2396     case OP_DIGIT:
2397 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2398     next == -ESC_h || next == -ESC_v;
2399 nigel 93
2400     case OP_NOT_DIGIT:
2401     return next == -ESC_d;
2402    
2403     case OP_WHITESPACE:
2404     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2405    
2406     case OP_NOT_WHITESPACE:
2407 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2408 nigel 93
2409 ph10 180 case OP_HSPACE:
2410     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2411    
2412     case OP_NOT_HSPACE:
2413     return next == -ESC_h;
2414 ph10 182
2415 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2416 ph10 182 case OP_VSPACE:
2417 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2418    
2419     case OP_NOT_VSPACE:
2420 ph10 182 return next == -ESC_v;
2421 ph10 180
2422 nigel 93 case OP_WORDCHAR:
2423 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2424 nigel 93
2425     case OP_NOT_WORDCHAR:
2426     return next == -ESC_w || next == -ESC_d;
2427 ph10 182
2428 nigel 93 default:
2429     return FALSE;
2430     }
2431    
2432     /* Control does not reach here */
2433     }
2434    
2435    
2436    
2437     /*************************************************
2438 nigel 77 * Compile one branch *
2439     *************************************************/
2440    
2441 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2442 nigel 77 changed during the branch, the pointer is used to change the external options
2443 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2444     to find out the amount of memory needed, as well as during the real compile
2445     phase. The value of lengthptr distinguishes the two phases.
2446 nigel 77
2447     Arguments:
2448     optionsptr pointer to the option bits
2449     codeptr points to the pointer to the current code point
2450     ptrptr points to the current pattern pointer
2451     errorcodeptr points to error code variable
2452     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2453     reqbyteptr set to the last literal character required, else < 0
2454     bcptr points to current branch chain
2455     cd contains pointers to tables etc.
2456 nigel 93 lengthptr NULL during the real compile phase
2457     points to length accumulator during pre-compile phase
2458 nigel 77
2459     Returns: TRUE on success
2460     FALSE, with *errorcodeptr set non-zero on error
2461     */
2462    
2463     static BOOL
2464 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2465     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2466     compile_data *cd, int *lengthptr)
2467 nigel 77 {
2468     int repeat_type, op_type;
2469     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2470     int bravalue = 0;
2471     int greedy_default, greedy_non_default;
2472     int firstbyte, reqbyte;
2473     int zeroreqbyte, zerofirstbyte;
2474     int req_caseopt, reqvary, tempreqvary;
2475     int options = *optionsptr;
2476     int after_manual_callout = 0;
2477 nigel 93 int length_prevgroup = 0;
2478 nigel 77 register int c;
2479     register uschar *code = *codeptr;
2480 nigel 93 uschar *last_code = code;
2481     uschar *orig_code = code;
2482 nigel 77 uschar *tempcode;
2483     BOOL inescq = FALSE;
2484     BOOL groupsetfirstbyte = FALSE;
2485     const uschar *ptr = *ptrptr;
2486     const uschar *tempptr;
2487     uschar *previous = NULL;
2488     uschar *previous_callout = NULL;
2489 nigel 93 uschar *save_hwm = NULL;
2490 nigel 77 uschar classbits[32];
2491    
2492     #ifdef SUPPORT_UTF8
2493     BOOL class_utf8;
2494     BOOL utf8 = (options & PCRE_UTF8) != 0;
2495     uschar *class_utf8data;
2496 ph10 300 uschar *class_utf8data_base;
2497 nigel 77 uschar utf8_char[6];
2498     #else
2499     BOOL utf8 = FALSE;
2500 nigel 93 uschar *utf8_char = NULL;
2501 nigel 77 #endif
2502    
2503 nigel 93 #ifdef DEBUG
2504     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2505     #endif
2506    
2507 nigel 77 /* Set up the default and non-default settings for greediness */
2508    
2509     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2510     greedy_non_default = greedy_default ^ 1;
2511    
2512     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2513     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2514     matches a non-fixed char first char; reqbyte just remains unset if we never
2515     find one.
2516    
2517     When we hit a repeat whose minimum is zero, we may have to adjust these values
2518     to take the zero repeat into account. This is implemented by setting them to
2519     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2520     item types that can be repeated set these backoff variables appropriately. */
2521    
2522     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2523    
2524     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2525     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2526     value > 255. It is added into the firstbyte or reqbyte variables to record the
2527     case status of the value. This is used only for ASCII characters. */
2528    
2529     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2530    
2531     /* Switch on next character until the end of the branch */
2532    
2533     for (;; ptr++)
2534     {
2535     BOOL negate_class;
2536 ph10 286 BOOL should_flip_negation;
2537 nigel 77 BOOL possessive_quantifier;
2538     BOOL is_quantifier;
2539 nigel 93 BOOL is_recurse;
2540 ph10 180 BOOL reset_bracount;
2541 nigel 77 int class_charcount;
2542     int class_lastchar;
2543     int newoptions;
2544     int recno;
2545 ph10 172 int refsign;
2546 nigel 77 int skipbytes;
2547     int subreqbyte;
2548     int subfirstbyte;
2549 nigel 93 int terminator;
2550 nigel 77 int mclength;
2551     uschar mcbuffer[8];
2552    
2553 nigel 93 /* Get next byte in the pattern */
2554 nigel 77
2555     c = *ptr;
2556 ph10 345
2557 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2558     previous cycle of this loop. */
2559    
2560     if (lengthptr != NULL)
2561     {
2562     #ifdef DEBUG
2563     if (code > cd->hwm) cd->hwm = code; /* High water info */
2564     #endif
2565     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2566     {
2567     *errorcodeptr = ERR52;
2568     goto FAILED;
2569     }
2570    
2571     /* There is at least one situation where code goes backwards: this is the
2572     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2573     the class is simply eliminated. However, it is created first, so we have to
2574     allow memory for it. Therefore, don't ever reduce the length at this point.
2575     */
2576    
2577     if (code < last_code) code = last_code;
2578 ph10 202
2579     /* Paranoid check for integer overflow */
2580    
2581     if (OFLOW_MAX - *lengthptr < code - last_code)
2582     {
2583     *errorcodeptr = ERR20;
2584     goto FAILED;
2585     }
2586    
2587 nigel 93 *lengthptr += code - last_code;
2588     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2589    
2590     /* If "previous" is set and it is not at the start of the work space, move
2591     it back to there, in order to avoid filling up the work space. Otherwise,
2592     if "previous" is NULL, reset the current code pointer to the start. */
2593    
2594     if (previous != NULL)
2595     {
2596     if (previous > orig_code)
2597     {
2598     memmove(orig_code, previous, code - previous);
2599     code -= previous - orig_code;
2600     previous = orig_code;
2601     }
2602     }
2603     else code = orig_code;
2604    
2605     /* Remember where this code item starts so we can pick up the length
2606     next time round. */
2607    
2608     last_code = code;
2609     }
2610    
2611     /* In the real compile phase, just check the workspace used by the forward
2612     reference list. */
2613    
2614     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2615     {
2616     *errorcodeptr = ERR52;
2617     goto FAILED;
2618     }
2619    
2620 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2621    
2622     if (inescq && c != 0)
2623     {
2624 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2625 nigel 77 {
2626     inescq = FALSE;
2627     ptr++;
2628     continue;
2629     }
2630     else
2631     {
2632     if (previous_callout != NULL)
2633     {
2634 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2635     complete_callout(previous_callout, ptr, cd);
2636 nigel 77 previous_callout = NULL;
2637     }
2638     if ((options & PCRE_AUTO_CALLOUT) != 0)
2639     {
2640     previous_callout = code;
2641     code = auto_callout(code, ptr, cd);
2642     }
2643     goto NORMAL_CHAR;
2644     }
2645     }
2646    
2647     /* Fill in length of a previous callout, except when the next thing is
2648     a quantifier. */
2649    
2650 ph10 392 is_quantifier =
2651 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2652     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2653 nigel 77
2654     if (!is_quantifier && previous_callout != NULL &&
2655     after_manual_callout-- <= 0)
2656     {
2657 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2658     complete_callout(previous_callout, ptr, cd);
2659 nigel 77 previous_callout = NULL;
2660     }
2661    
2662     /* In extended mode, skip white space and comments */
2663    
2664     if ((options & PCRE_EXTENDED) != 0)
2665     {
2666     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2667 ph10 391 if (c == CHAR_NUMBER_SIGN)
2668 nigel 77 {
2669 nigel 93 while (*(++ptr) != 0)
2670 nigel 91 {
2671 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2672 nigel 91 }
2673 nigel 93 if (*ptr != 0) continue;
2674    
2675 nigel 91 /* Else fall through to handle end of string */
2676     c = 0;
2677 nigel 77 }
2678     }
2679    
2680     /* No auto callout for quantifiers. */
2681    
2682     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2683     {
2684     previous_callout = code;
2685     code = auto_callout(code, ptr, cd);
2686     }
2687    
2688     switch(c)
2689     {
2690 nigel 93 /* ===================================================================*/
2691     case 0: /* The branch terminates at string end */
2692 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2693     case CHAR_RIGHT_PARENTHESIS:
2694 nigel 77 *firstbyteptr = firstbyte;
2695     *reqbyteptr = reqbyte;
2696     *codeptr = code;
2697     *ptrptr = ptr;
2698 nigel 93 if (lengthptr != NULL)
2699     {
2700 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2701     {
2702     *errorcodeptr = ERR20;
2703     goto FAILED;
2704     }
2705 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2706     DPRINTF((">> end branch\n"));
2707     }
2708 nigel 77 return TRUE;
2709    
2710 nigel 93
2711     /* ===================================================================*/
2712 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2713     the setting of any following char as a first character. */
2714    
2715 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2716 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2717     {
2718     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2719     }
2720     previous = NULL;
2721     *code++ = OP_CIRC;
2722     break;
2723    
2724 ph10 391 case CHAR_DOLLAR_SIGN:
2725 nigel 77 previous = NULL;
2726     *code++ = OP_DOLL;
2727     break;
2728    
2729     /* There can never be a first char if '.' is first, whatever happens about
2730     repeats. The value of reqbyte doesn't change either. */
2731    
2732 ph10 391 case CHAR_DOT:
2733 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2734     zerofirstbyte = firstbyte;
2735     zeroreqbyte = reqbyte;
2736     previous = code;
2737 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2738 nigel 77 break;
2739    
2740 nigel 93
2741     /* ===================================================================*/
2742 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2743     32-byte bitmap of the permitted characters, except in the special case
2744     where there is only one such character. For negated classes, we build the
2745     map as usual, then invert it at the end. However, we use a different opcode
2746     so that data characters > 255 can be handled correctly.
2747 nigel 77
2748     If the class contains characters outside the 0-255 range, a different
2749     opcode is compiled. It may optionally have a bit map for characters < 256,
2750     but those above are explicitly listed afterwards. A flag byte tells
2751     whether the bitmap is present, and whether this is a negated class or not.
2752 ph10 345
2753 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2754     default (Perl) mode, it is treated as a data character. */
2755 ph10 345
2756 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2757 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2758     {
2759     *errorcodeptr = ERR64;
2760 ph10 345 goto FAILED;
2761 ph10 336 }
2762 ph10 345 goto NORMAL_CHAR;
2763 nigel 77
2764 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2765 nigel 77 previous = code;
2766    
2767     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2768     they are encountered at the top level, so we'll do that too. */
2769    
2770 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2771 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2772 ph10 295 check_posix_syntax(ptr, &tempptr))
2773 nigel 77 {
2774 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2775 nigel 77 goto FAILED;
2776     }
2777    
2778 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2779 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2780 ph10 205 skip them too. This makes for compatibility with Perl. */
2781 ph10 208
2782 ph10 205 negate_class = FALSE;
2783     for (;;)
2784 nigel 77 {
2785     c = *(++ptr);
2786 ph10 391 if (c == CHAR_BACKSLASH)
2787 ph10 205 {
2788 ph10 392 if (ptr[1] == CHAR_E)
2789 ph10 391 ptr++;
2790 ph10 392 else if (strncmp((const char *)ptr+1,
2791     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2792 ph10 391 ptr += 3;
2793 ph10 392 else
2794 ph10 391 break;
2795 ph10 205 }
2796 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2797 ph10 205 negate_class = TRUE;
2798     else break;
2799 ph10 208 }
2800 ph10 345
2801     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2802     an initial ']' is taken as a data character -- the code below handles
2803 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2804     [^] must match any character, so generate OP_ALLANY. */
2805 ph10 345
2806 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2807 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2808 ph10 341 {
2809     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2810     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2811     zerofirstbyte = firstbyte;
2812     break;
2813 ph10 345 }
2814 nigel 77
2815 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2816     negation flag at the end, so that support for characters > 255 works
2817 ph10 264 correctly (they are all included in the class). */
2818    
2819     should_flip_negation = FALSE;
2820    
2821 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2822 nigel 93 of just a single character (as long as it's < 256). However, for higher
2823     valued UTF-8 characters, we don't yet do any optimization. */
2824 nigel 77
2825     class_charcount = 0;
2826     class_lastchar = -1;
2827    
2828 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2829     temporary bit of memory, in case the class contains only 1 character (less
2830     than 256), because in that case the compiled code doesn't use the bit map.
2831     */
2832    
2833     memset(classbits, 0, 32 * sizeof(uschar));
2834    
2835 nigel 77 #ifdef SUPPORT_UTF8
2836     class_utf8 = FALSE; /* No chars >= 256 */
2837 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2838 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2839 nigel 77 #endif
2840    
2841     /* Process characters until ] is reached. By writing this as a "do" it
2842 nigel 93 means that an initial ] is taken as a data character. At the start of the
2843     loop, c contains the first byte of the character. */
2844 nigel 77
2845 nigel 93 if (c != 0) do
2846 nigel 77 {
2847 nigel 93 const uschar *oldptr;
2848    
2849 nigel 77 #ifdef SUPPORT_UTF8
2850     if (utf8 && c > 127)
2851     { /* Braces are required because the */
2852     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2853     }
2854 ph10 309
2855 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2856 ph10 309 data and reset the pointer. This is so that very large classes that
2857 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2858 ph10 309 (which is on the stack). */
2859    
2860 ph10 300 if (lengthptr != NULL)
2861     {
2862     *lengthptr += class_utf8data - class_utf8data_base;
2863 ph10 309 class_utf8data = class_utf8data_base;
2864     }
2865    
2866 nigel 77 #endif
2867    
2868     /* Inside \Q...\E everything is literal except \E */
2869    
2870     if (inescq)
2871     {
2872 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2873 nigel 77 {
2874 nigel 93 inescq = FALSE; /* Reset literal state */
2875     ptr++; /* Skip the 'E' */
2876     continue; /* Carry on with next */
2877 nigel 77 }
2878 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2879 nigel 77 }
2880    
2881     /* Handle POSIX class names. Perl allows a negation extension of the
2882     form [:^name:]. A square bracket that doesn't match the syntax is
2883     treated as a literal. We also recognize the POSIX constructions
2884     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2885     5.6 and 5.8 do. */
2886    
2887 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
2888 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2889 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
2890 nigel 77 {
2891     BOOL local_negate = FALSE;
2892 nigel 87 int posix_class, taboffset, tabopt;
2893 nigel 77 register const uschar *cbits = cd->cbits;
2894 nigel 87 uschar pbits[32];
2895 nigel 77
2896 ph10 391 if (ptr[1] != CHAR_COLON)
2897 nigel 77 {
2898     *errorcodeptr = ERR31;
2899     goto FAILED;
2900     }
2901    
2902     ptr += 2;
2903 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
2904 nigel 77 {
2905     local_negate = TRUE;
2906 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2907 nigel 77 ptr++;
2908     }
2909    
2910     posix_class = check_posix_name(ptr, tempptr - ptr);
2911     if (posix_class < 0)
2912     {
2913     *errorcodeptr = ERR30;
2914     goto FAILED;
2915     }
2916    
2917     /* If matching is caseless, upper and lower are converted to
2918     alpha. This relies on the fact that the class table starts with
2919     alpha, lower, upper as the first 3 entries. */
2920    
2921     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2922     posix_class = 0;
2923    
2924 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2925     because we may be adding and subtracting from it, and we don't want to
2926     subtract bits that may be in the main map already. At the end we or the
2927     result into the bit map that is being built. */
2928 nigel 77
2929     posix_class *= 3;
2930 nigel 87
2931     /* Copy in the first table (always present) */
2932    
2933     memcpy(pbits, cbits + posix_class_maps[posix_class],
2934     32 * sizeof(uschar));
2935    
2936     /* If there is a second table, add or remove it as required. */
2937    
2938     taboffset = posix_class_maps[posix_class + 1];
2939     tabopt = posix_class_maps[posix_class + 2];
2940    
2941     if (taboffset >= 0)
2942 nigel 77 {
2943 nigel 87 if (tabopt >= 0)
2944     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2945 nigel 77 else
2946 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2947 nigel 77 }
2948    
2949 nigel 87 /* Now see if we need to remove any special characters. An option
2950     value of 1 removes vertical space and 2 removes underscore. */
2951    
2952     if (tabopt < 0) tabopt = -tabopt;
2953     if (tabopt == 1) pbits[1] &= ~0x3c;
2954     else if (tabopt == 2) pbits[11] &= 0x7f;
2955    
2956     /* Add the POSIX table or its complement into the main table that is
2957     being built and we are done. */
2958    
2959     if (local_negate)
2960     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2961     else
2962     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2963    
2964 nigel 77 ptr = tempptr + 1;
2965     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2966     continue; /* End of POSIX syntax handling */
2967     }
2968    
2969     /* Backslash may introduce a single character, or it may introduce one
2970 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2971     case. Inside a class (and only there) it is treated as backspace.
2972     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2973 ph10 205 to 'or' into the one we are building. We assume they have more than one
2974 nigel 77 character in them, so set class_charcount bigger than one. */
2975    
2976 ph10 391 if (c == CHAR_BACKSLASH)
2977 nigel 77 {
2978 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2979     if (*errorcodeptr != 0) goto FAILED;
2980 nigel 77
2981 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
2982     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
2983     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
2984 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2985     {
2986 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
2987 nigel 77 {
2988     ptr += 2; /* avoid empty string */
2989     }
2990     else inescq = TRUE;
2991     continue;
2992     }
2993 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2994 nigel 77
2995     if (c < 0)
2996     {
2997     register const uschar *cbits = cd->cbits;
2998     class_charcount += 2; /* Greater than 1 is what matters */
2999 nigel 93
3000     /* Save time by not doing this in the pre-compile phase. */
3001    
3002     if (lengthptr == NULL) switch (-c)
3003 nigel 77 {
3004     case ESC_d:
3005     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3006     continue;
3007    
3008     case ESC_D:
3009 ph10 286 should_flip_negation = TRUE;
3010 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3011     continue;
3012    
3013     case ESC_w:
3014     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3015     continue;
3016    
3017     case ESC_W:
3018 ph10 286 should_flip_negation = TRUE;
3019 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3020     continue;
3021    
3022     case ESC_s:
3023     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3024     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3025     continue;
3026    
3027     case ESC_S:
3028 ph10 286 should_flip_negation = TRUE;
3029 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3030     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3031     continue;
3032    
3033 nigel 93 default: /* Not recognized; fall through */
3034     break; /* Need "default" setting to stop compiler warning. */
3035     }
3036    
3037     /* In the pre-compile phase, just do the recognition. */
3038    
3039     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3040     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3041 ph10 180
3042 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3043     they use extra memory. */
3044 ph10 180
3045 ph10 178 if (-c == ESC_h)
3046     {
3047     SETBIT(classbits, 0x09); /* HT */
3048     SETBIT(classbits, 0x20); /* SPACE */
3049 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3050 ph10 178 #ifdef SUPPORT_UTF8
3051     if (utf8)
3052 ph10 180 {
3053 ph10 178 class_utf8 = TRUE;
3054     *class_utf8data++ = XCL_SINGLE;
3055 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3056 ph10 178 *class_utf8data++ = XCL_SINGLE;
3057 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3058     *class_utf8data++ = XCL_RANGE;
3059     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3060     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3061 ph10 178 *class_utf8data++ = XCL_SINGLE;
3062 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3063 ph10 178 *class_utf8data++ = XCL_SINGLE;
3064 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3065 ph10 178 *class_utf8data++ = XCL_SINGLE;
3066 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3067     }
3068     #endif
3069     continue;
3070     }
3071 nigel 93
3072 ph10 178 if (-c == ESC_H)
3073     {
3074     for (c = 0; c < 32; c++)
3075     {
3076     int x = 0xff;
3077     switch (c)
3078 ph10 180 {
3079 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3080     case 0x20/8: x ^= 1 << (0x20%8); break;
3081     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3082     default: break;
3083     }
3084     classbits[c] |= x;
3085 ph10 180 }
3086    
3087 ph10 178 #ifdef SUPPORT_UTF8
3088     if (utf8)
3089 ph10 180 {
3090 ph10 178 class_utf8 = TRUE;
3091 ph10 180 *class_utf8data++ = XCL_RANGE;
3092     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3093     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3094     *class_utf8data++ = XCL_RANGE;
3095     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3096     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3097     *class_utf8data++ = XCL_RANGE;
3098     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3099     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3100     *class_utf8data++ = XCL_RANGE;
3101     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3102     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3103     *class_utf8data++ = XCL_RANGE;
3104     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3105     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3106     *class_utf8data++ = XCL_RANGE;
3107     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3108     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3109     *class_utf8data++ = XCL_RANGE;
3110     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3111     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3112     }
3113     #endif
3114     continue;
3115     }
3116 ph10 178
3117     if (-c == ESC_v)
3118     {
3119     SETBIT(classbits, 0x0a); /* LF */
3120     SETBIT(classbits, 0x0b); /* VT */
3121 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3122     SETBIT(classbits, 0x0d); /* CR */
3123     SETBIT(classbits, 0x85); /* NEL */
3124 ph10 178 #ifdef SUPPORT_UTF8
3125     if (utf8)
3126 ph10 180 {
3127 ph10 178 class_utf8 = TRUE;
3128 ph10 180 *class_utf8data++ = XCL_RANGE;
3129     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3130     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3131     }
3132     #endif
3133     continue;
3134     }
3135 ph10 178
3136     if (-c == ESC_V)
3137     {
3138     for (c = 0; c < 32; c++)
3139     {
3140     int x = 0xff;
3141     switch (c)
3142 ph10 180 {
3143 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3144     x ^= 1 << (0x0b%8);
3145     x ^= 1 << (0x0c%8);
3146 ph10 180 x ^= 1 << (0x0d%8);
3147 ph10 178 break;
3148     case 0x85/8: x ^= 1 << (0x85%8); break;
3149     default: break;
3150     }
3151     classbits[c] |= x;
3152 ph10 180 }
3153    
3154 ph10 178 #ifdef SUPPORT_UTF8
3155     if (utf8)
3156 ph10 180 {
3157 ph10 178 class_utf8 = TRUE;
3158 ph10 180 *class_utf8data++ = XCL_RANGE;
3159     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3160     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3161     *class_utf8data++ = XCL_RANGE;
3162     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3163     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3164     }
3165     #endif
3166     continue;
3167     }
3168 ph10 178
3169 nigel 93 /* We need to deal with \P and \p in both phases. */
3170    
3171 nigel 77 #ifdef SUPPORT_UCP
3172 nigel 93 if (-c == ESC_p || -c == ESC_P)
3173     {
3174     BOOL negated;
3175     int pdata;
3176     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3177     if (ptype < 0) goto FAILED;
3178     class_utf8 = TRUE;
3179     *class_utf8data++ = ((-c == ESC_p) != negated)?
3180     XCL_PROP : XCL_NOTPROP;
3181     *class_utf8data++ = ptype;
3182     *class_utf8data++ = pdata;
3183     class_charcount -= 2; /* Not a < 256 character */
3184 nigel 77 continue;
3185 nigel 93 }
3186 nigel 77 #endif
3187 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3188     strict mode. By default, for compatibility with Perl, they are
3189     treated as literals. */
3190 nigel 77
3191 nigel 93 if ((options & PCRE_EXTRA) != 0)
3192     {
3193     *errorcodeptr = ERR7;
3194     goto FAILED;
3195     }
3196 nigel 77
3197 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3198     c = *ptr; /* Get the final character and fall through */
3199 nigel 77 }
3200    
3201     /* Fall through if we have a single character (c >= 0). This may be
3202 nigel 93 greater than 256 in UTF-8 mode. */
3203 nigel 77
3204     } /* End of backslash handling */
3205    
3206     /* A single character may be followed by '-' to form a range. However,
3207     Perl does not permit ']' to be the end of the range. A '-' character
3208 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3209     entirely. The code for handling \Q and \E is messy. */
3210 nigel 77
3211 nigel 93 CHECK_RANGE:
3212 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3213 nigel 77 {
3214 nigel 93 inescq = FALSE;
3215     ptr += 2;
3216     }
3217    
3218     oldptr = ptr;
3219 ph10 231
3220 ph10 230 /* Remember \r or \n */
3221 ph10 231
3222 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3223 ph10 231
3224 ph10 230 /* Check for range */
3225 nigel 93
3226 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3227 nigel 93 {
3228 nigel 77 int d;
3229     ptr += 2;
3230 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3231 nigel 77
3232 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3233     mode. */
3234    
3235 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3236 nigel 93 {
3237     ptr += 2;
3238 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3239 ph10 391 { ptr += 2; continue; }
3240 nigel 93 inescq = TRUE;
3241     break;
3242     }
3243    
3244 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3245 nigel 93 {
3246     ptr = oldptr;
3247     goto LONE_SINGLE_CHARACTER;
3248     }
3249    
3250 nigel 77 #ifdef SUPPORT_UTF8
3251     if (utf8)
3252     { /* Braces are required because the */
3253     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3254     }
3255     else
3256     #endif
3257     d = *ptr; /* Not UTF-8 mode */
3258    
3259     /* The second part of a range can be a single-character escape, but
3260     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3261     in such circumstances. */
3262    
3263 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3264 nigel 77 {
3265 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3266     if (*errorcodeptr != 0) goto FAILED;
3267 nigel 77
3268 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3269 nigel 93 special means the '-' was literal */
3270 nigel 77
3271     if (d < 0)
3272     {
3273 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3274     else if (d == -ESC_X) d = CHAR_X;
3275     else if (d == -ESC_R) d = CHAR_R; else
3276 nigel 77 {
3277 nigel 93 ptr = oldptr;
3278 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3279     }
3280     }
3281     }
3282    
3283 nigel 93 /* Check that the two values are in the correct order. Optimize
3284     one-character ranges */
3285 nigel 77
3286 nigel 93 if (d < c)
3287     {
3288     *errorcodeptr = ERR8;
3289     goto FAILED;
3290     }
3291    
3292 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3293    
3294 ph10 230 /* Remember \r or \n */
3295 ph10 231
3296 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3297 ph10 231
3298 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3299     matching, we have to use an XCLASS with extra data items. Caseless
3300     matching for characters > 127 is available only if UCP support is
3301     available. */
3302    
3303     #ifdef SUPPORT_UTF8
3304     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3305     {
3306     class_utf8 = TRUE;
3307    
3308     /* With UCP support, we can find the other case equivalents of
3309     the relevant characters. There may be several ranges. Optimize how
3310     they fit with the basic range. */
3311    
3312     #ifdef SUPPORT_UCP
3313     if ((options & PCRE_CASELESS) != 0)
3314     {
3315 nigel 93 unsigned int occ, ocd;
3316     unsigned int cc = c;
3317     unsigned int origd = d;
3318 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3319     {
3320 ph10 180 if (occ >= (unsigned int)c &&
3321     ocd <= (unsigned int)d)
3322 ph10 176 continue; /* Skip embedded ranges */
3323 nigel 77
3324 ph10 180 if (occ < (unsigned int)c &&
3325 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3326 nigel 77 { /* if there is overlap, */
3327     c = occ; /* noting that if occ < c */
3328     continue; /* we can't have ocd > d */
3329     } /* because a subrange is */
3330 ph10 180 if (ocd > (unsigned int)d &&
3331 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3332 nigel 77 { /* the basic range. */
3333     d = ocd;
3334     continue;
3335     }
3336    
3337     if (occ == ocd)
3338     {
3339     *class_utf8data++ = XCL_SINGLE;
3340     }
3341     else
3342     {
3343     *class_utf8data++ = XCL_RANGE;
3344     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3345     }
3346     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3347     }
3348     }
3349     #endif /* SUPPORT_UCP */
3350    
3351     /* Now record the original range, possibly modified for UCP caseless
3352     overlapping ranges. */
3353    
3354     *class_utf8data++ = XCL_RANGE;
3355     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3356     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3357    
3358     /* With UCP support, we are done. Without UCP support, there is no
3359     caseless matching for UTF-8 characters > 127; we can use the bit map
3360     for the smaller ones. */
3361    
3362     #ifdef SUPPORT_UCP
3363     continue; /* With next character in the class */
3364     #else
3365     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3366    
3367     /* Adjust upper limit and fall through to set up the map */
3368    
3369     d = 127;
3370    
3371     #endif /* SUPPORT_UCP */
3372     }
3373     #endif /* SUPPORT_UTF8 */
3374    
3375     /* We use the bit map for all cases when not in UTF-8 mode; else
3376     ranges that lie entirely within 0-127 when there is UCP support; else
3377     for partial ranges without UCP support. */
3378    
3379 nigel 93 class_charcount += d - c + 1;
3380     class_lastchar = d;
3381    
3382     /* We can save a bit of time by skipping this in the pre-compile. */
3383    
3384     if (lengthptr == NULL) for (; c <= d; c++)
3385 nigel 77 {
3386     classbits[c/8] |= (1 << (c&7));
3387     if ((options & PCRE_CASELESS) != 0)
3388     {
3389     int uc = cd->fcc[c]; /* flip case */
3390     classbits[uc/8] |= (1 << (uc&7));
3391     }
3392     }
3393    
3394     continue; /* Go get the next char in the class */
3395     }
3396    
3397     /* Handle a lone single character - we can get here for a normal
3398     non-escape char, or after \ that introduces a single character or for an
3399     apparent range that isn't. */
3400    
3401     LONE_SINGLE_CHARACTER:
3402 ph10 231
3403 nigel 77 /* Handle a character that cannot go in the bit map */
3404    
3405     #ifdef SUPPORT_UTF8
3406     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3407     {
3408     class_utf8 = TRUE;
3409     *class_utf8data++ = XCL_SINGLE;
3410     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3411    
3412     #ifdef SUPPORT_UCP
3413     if ((options & PCRE_CASELESS) != 0)
3414     {
3415 nigel 93 unsigned int othercase;
3416 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3417 nigel 77 {
3418     *class_utf8data++ = XCL_SINGLE;
3419     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3420     }
3421     }
3422     #endif /* SUPPORT_UCP */
3423    
3424     }
3425     else
3426     #endif /* SUPPORT_UTF8 */
3427    
3428     /* Handle a single-byte character */
3429     {
3430     classbits[c/8] |= (1 << (c&7));
3431     if ((options & PCRE_CASELESS) != 0)
3432     {
3433     c = cd->fcc[c]; /* flip case */
3434     classbits[c/8] |= (1 << (c&7));
3435     }
3436     class_charcount++;
3437     class_lastchar = c;
3438     }
3439     }
3440    
3441 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3442 nigel 77
3443 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3444 nigel 77
3445 nigel 93 if (c == 0) /* Missing terminating ']' */
3446     {
3447     *errorcodeptr = ERR6;
3448     goto FAILED;
3449     }
3450 ph10 231
3451    
3452 ph10 230 /* This code has been disabled because it would mean that \s counts as
3453     an explicit \r or \n reference, and that's not really what is wanted. Now
3454     we set the flag only if there is a literal "\r" or "\n" in the class. */
3455 ph10 227
3456 ph10 230 #if 0
3457 ph10 226 /* Remember whether \r or \n are in this class */
3458 ph10 227
3459 ph10 226 if (negate_class)
3460     {
3461 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3462 ph10 226 }
3463     else
3464     {
3465 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3466 ph10 227 }
3467 ph10 230 #endif
3468 ph10 227
3469 ph10 231
3470 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3471 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3472     use of \p or \P, in other words, no use of any XCLASS features, we can
3473     optimize.
3474    
3475 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3476     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3477     operate on single-bytes only. This is an historical hangover. Maybe one day
3478     we can tidy these opcodes to handle multi-byte characters.
3479 nigel 77
3480     The optimization throws away the bit map. We turn the item into a
3481     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3482     that OP_NOT does not support multibyte characters. In the positive case, it
3483     can cause firstbyte to be set. Otherwise, there can be no first char if
3484     this item is first, whatever repeat count may follow. In the case of
3485     reqbyte, save the previous value for reinstating. */
3486    
3487     #ifdef SUPPORT_UTF8
3488 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3489 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3490 nigel 77 #else
3491     if (class_charcount == 1)
3492     #endif
3493     {
3494     zeroreqbyte = reqbyte;
3495    
3496     /* The OP_NOT opcode works on one-byte characters only. */
3497    
3498     if (negate_class)
3499     {
3500     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3501     zerofirstbyte = firstbyte;
3502     *code++ = OP_NOT;
3503     *code++ = class_lastchar;
3504     break;
3505     }
3506    
3507     /* For a single, positive character, get the value into mcbuffer, and
3508     then we can handle this with the normal one-character code. */
3509    
3510     #ifdef SUPPORT_UTF8
3511     if (utf8 && class_lastchar > 127)
3512     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3513     else
3514     #endif
3515     {
3516     mcbuffer[0] = class_lastchar;
3517     mclength = 1;
3518     }
3519     goto ONE_CHAR;
3520     } /* End of 1-char optimization */
3521    
3522     /* The general case - not the one-char optimization. If this is the first
3523     thing in the branch, there can be no first char setting, whatever the
3524     repeat count. Any reqbyte setting must remain unchanged after any kind of
3525     repeat. */
3526    
3527     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3528     zerofirstbyte = firstbyte;
3529     zeroreqbyte = reqbyte;
3530    
3531     /* If there are characters with values > 255, we have to compile an
3532 ph10 286 extended class, with its own opcode, unless there was a negated special
3533     such as \S in the class, because in that case all characters > 255 are in
3534     the class, so any that were explicitly given as well can be ignored. If
3535 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3536     characters < 256, we can omit the bitmap in the actual compiled code. */
3537 nigel 77
3538     #ifdef SUPPORT_UTF8
3539 ph10 264 if (class_utf8 && !should_flip_negation)
3540 nigel 77 {
3541     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3542     *code++ = OP_XCLASS;
3543     code += LINK_SIZE;
3544     *code = negate_class? XCL_NOT : 0;
3545    
3546 nigel 93 /* If the map is required, move up the extra data to make room for it;
3547     otherwise just move the code pointer to the end of the extra data. */
3548 nigel 77
3549     if (class_charcount > 0)
3550     {
3551     *code++ |= XCL_MAP;
3552 nigel 93 memmove(code + 32, code, class_utf8data - code);
3553 nigel 77 memcpy(code, classbits, 32);
3554 nigel 93 code = class_utf8data + 32;
3555 nigel 77 }
3556 nigel 93 else code = class_utf8data;
3557 nigel 77
3558     /* Now fill in the complete length of the item */
3559    
3560     PUT(previous, 1, code - previous);
3561     break; /* End of class handling */
3562     }
3563     #endif
3564    
3565 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3566     OP_NCLASS, depending on whether the whole class was negated and whether
3567     there were negative specials such as \S in the class. Then copy the 32-byte
3568 ph10 264 map into the code vector, negating it if necessary. */
3569 ph10 286
3570 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3571 nigel 77 if (negate_class)
3572     {
3573 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3574     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3575 nigel 77 }
3576     else
3577     {
3578     memcpy(code, classbits, 32);
3579     }
3580     code += 32;
3581     break;
3582    
3583 nigel 93
3584     /* ===================================================================*/
3585 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3586     has been tested above. */
3587    
3588 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3589 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3590     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3591     if (*errorcodeptr != 0) goto FAILED;
3592     goto REPEAT;
3593    
3594 ph10 391 case CHAR_ASTERISK:
3595 nigel 77 repeat_min = 0;
3596     repeat_max = -1;
3597     goto REPEAT;
3598    
3599 ph10 391 case CHAR_PLUS:
3600 nigel 77 repeat_min = 1;
3601     repeat_max = -1;
3602     goto REPEAT;
3603    
3604 ph10 391 case CHAR_QUESTION_MARK:
3605 nigel 77 repeat_min = 0;
3606     repeat_max = 1;
3607    
3608     REPEAT:
3609     if (previous == NULL)
3610     {
3611     *errorcodeptr = ERR9;
3612     goto FAILED;
3613     }
3614    
3615     if (repeat_min == 0)
3616     {
3617     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3618     reqbyte = zeroreqbyte; /* Ditto */
3619     }
3620    
3621     /* Remember whether this is a variable length repeat */
3622    
3623     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3624    
3625     op_type = 0; /* Default single-char op codes */
3626     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3627    
3628     /* Save start of previous item, in case we have to move it up to make space
3629     for an inserted OP_ONCE for the additional '+' extension. */
3630    
3631     tempcode = previous;
3632    
3633     /* If the next character is '+', we have a possessive quantifier. This
3634     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3635     If the next character is '?' this is a minimizing repeat, by default,
3636     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3637     repeat type to the non-default. */
3638    
3639 ph10 391 if (ptr[1] == CHAR_PLUS)
3640 nigel 77 {
3641     repeat_type = 0; /* Force greedy */
3642     possessive_quantifier = TRUE;
3643     ptr++;
3644     }
3645 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3646 nigel 77 {
3647     repeat_type = greedy_non_default;
3648     ptr++;
3649     }
3650     else repeat_type = greedy_default;
3651    
3652     /* If previous was a character match, abolish the item and generate a
3653     repeat item instead. If a char item has a minimum of more than one, ensure
3654     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3655     the first thing in a branch because the x will have gone into firstbyte
3656     instead. */
3657    
3658     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3659     {
3660     /* Deal with UTF-8 characters that take up more than one byte. It's
3661     easier to write this out separately than try to macrify it. Use c to
3662     hold the length of the character in bytes, plus 0x80 to flag that it's a
3663     length rather than a small character. */
3664    
3665     #ifdef SUPPORT_UTF8
3666     if (utf8 && (code[-1] & 0x80) != 0)
3667     {
3668     uschar *lastchar = code - 1;
3669     while((*lastchar & 0xc0) == 0x80) lastchar--;
3670     c = code - lastchar; /* Length of UTF-8 character */
3671     memcpy(utf8_char, lastchar, c); /* Save the char */
3672     c |= 0x80; /* Flag c as a length */
3673     }
3674     else
3675     #endif
3676    
3677     /* Handle the case of a single byte - either with no UTF8 support, or
3678     with UTF-8 disabled, or for a UTF-8 character < 128. */
3679    
3680     {
3681     c = code[-1];
3682     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3683     }
3684    
3685 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3686     the line is something that cannot possibly match this character. If so,
3687     automatically possessifying this item gains some performance in the case
3688     where the match fails. */
3689    
3690     if (!possessive_quantifier &&
3691     repeat_max < 0 &&
3692     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3693     options, cd))
3694     {
3695     repeat_type = 0; /* Force greedy */
3696     possessive_quantifier = TRUE;
3697     }
3698    
3699 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3700     }
3701    
3702     /* If previous was a single negated character ([^a] or similar), we use
3703     one of the special opcodes, replacing it. The code is shared with single-
3704     character repeats by setting op_type to add a suitable offset into
3705 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3706     currently used only for single-byte chars. */
3707 nigel 77
3708     else if (*previous == OP_NOT)
3709     {
3710     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3711     c = previous[1];
3712 nigel 93 if (!possessive_quantifier &&
3713     repeat_max < 0 &&
3714     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3715     {
3716     repeat_type = 0; /* Force greedy */
3717     possessive_quantifier = TRUE;
3718     }
3719 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3720     }
3721    
3722     /* If previous was a character type match (\d or similar), abolish it and
3723     create a suitable repeat item. The code is shared with single-character
3724     repeats by setting op_type to add a suitable offset into repeat_type. Note
3725     that the Unicode property types will be present only when SUPPORT_UCP is
3726     defined, but we don't wrap the little bits of code here because it just
3727     makes it horribly messy. */
3728    
3729     else if (*previous < OP_EODN)
3730     {
3731     uschar *oldcode;
3732 nigel 87 int prop_type, prop_value;
3733 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3734     c = *previous;
3735    
3736 nigel 93 if (!possessive_quantifier &&
3737     repeat_max < 0 &&
3738     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3739     {
3740     repeat_type = 0; /* Force greedy */
3741     possessive_quantifier = TRUE;
3742     }
3743    
3744 nigel 77 OUTPUT_SINGLE_REPEAT:
3745 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3746     {
3747     prop_type = previous[1];
3748     prop_value = previous[2];
3749     }
3750     else prop_type = prop_value = -1;
3751 nigel 77
3752     oldcode = code;
3753     code = previous; /* Usually overwrite previous item */
3754    
3755     /* If the maximum is zero then the minimum must also be zero; Perl allows
3756     this case, so we do too - by simply omitting the item altogether. */
3757    
3758     if (repeat_max == 0) goto END_REPEAT;
3759    
3760     /* All real repeats make it impossible to handle partial matching (maybe
3761     one day we will be able to remove this restriction). */
3762    
3763 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3764 nigel 77
3765     /* Combine the op_type with the repeat_type */
3766    
3767     repeat_type += op_type;
3768    
3769     /* A minimum of zero is handled either as the special case * or ?, or as
3770     an UPTO, with the maximum given. */
3771    
3772     if (repeat_min == 0)
3773     {
3774     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3775     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3776     else
3777     {
3778     *code++ = OP_UPTO + repeat_type;
3779     PUT2INC(code, 0, repeat_max);
3780     }
3781     }
3782    
3783     /* A repeat minimum of 1 is optimized into some special cases. If the
3784 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3785 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3786     one less than the maximum. */
3787    
3788     else if (repeat_min == 1)
3789     {
3790     if (repeat_max == -1)
3791     *code++ = OP_PLUS + repeat_type;
3792     else
3793     {
3794     code = oldcode; /* leave previous item in place */
3795     if (repeat_max == 1) goto END_REPEAT;
3796     *code++ = OP_UPTO + repeat_type;
3797     PUT2INC(code, 0, repeat_max - 1);
3798     }
3799     }
3800    
3801     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3802     handled as an EXACT followed by an UPTO. */
3803    
3804     else
3805     {
3806     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3807     PUT2INC(code, 0, repeat_min);
3808    
3809     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3810     we have to insert the character for the previous code. For a repeated
3811 nigel 87 Unicode property match, there are two extra bytes that define the
3812 nigel 77 required property. In UTF-8 mode, long characters have their length in
3813     c, with the 0x80 bit as a flag. */
3814    
3815     if (repeat_max < 0)
3816     {
3817     #ifdef SUPPORT_UTF8
3818     if (utf8 && c >= 128)
3819     {
3820     memcpy(code, utf8_char, c & 7);
3821     code += c & 7;
3822     }
3823     else
3824     #endif
3825     {
3826     *code++ = c;
3827 nigel 87 if (prop_type >= 0)
3828     {
3829     *code++ = prop_type;
3830     *code++ = prop_value;
3831     }
3832 nigel 77 }
3833     *code++ = OP_STAR + repeat_type;
3834     }
3835    
3836     /* Else insert an UPTO if the max is greater than the min, again
3837 nigel 93 preceded by the character, for the previously inserted code. If the
3838     UPTO is just for 1 instance, we can use QUERY instead. */
3839 nigel 77
3840     else if (repeat_max != repeat_min)
3841     {
3842     #ifdef SUPPORT_UTF8
3843     if (utf8 && c >= 128)
3844     {
3845     memcpy(code, utf8_char, c & 7);
3846     code += c & 7;
3847     }
3848     else
3849     #endif
3850     *code++ = c;
3851 nigel 87 if (prop_type >= 0)
3852     {
3853     *code++ = prop_type;
3854     *code++ = prop_value;
3855     }
3856 nigel 77 repeat_max -= repeat_min;
3857 nigel 93
3858     if (repeat_max == 1)
3859     {
3860     *code++ = OP_QUERY + repeat_type;
3861     }
3862     else
3863     {
3864     *code++ = OP_UPTO + repeat_type;
3865     PUT2INC(code, 0, repeat_max);
3866     }
3867 nigel 77 }
3868     }
3869    
3870     /* The character or character type itself comes last in all cases. */
3871    
3872     #ifdef SUPPORT_UTF8
3873     if (utf8 && c >= 128)
3874     {
3875     memcpy(code, utf8_char, c & 7);
3876     code += c & 7;
3877     }
3878     else
3879     #endif
3880     *code++ = c;
3881    
3882 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3883     define the required property. */
3884 nigel 77
3885     #ifdef SUPPORT_UCP
3886 nigel 87 if (prop_type >= 0)
3887     {
3888     *code++ = prop_type;
3889     *code++ = prop_value;
3890     }
3891 nigel 77 #endif
3892     }
3893    
3894     /* If previous was a character class or a back reference, we put the repeat
3895     stuff after it, but just skip the item if the repeat was {0,0}. */
3896    
3897     else if (*previous == OP_CLASS ||
3898     *previous == OP_NCLASS ||
3899     #ifdef SUPPORT_UTF8
3900     *previous == OP_XCLASS ||
3901     #endif
3902     *previous == OP_REF)
3903     {
3904     if (repeat_max == 0)
3905     {
3906     code = previous;
3907     goto END_REPEAT;
3908     }
3909    
3910     /* All real repeats make it impossible to handle partial matching (maybe
3911     one day we will be able to remove this restriction). */
3912    
3913 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3914 nigel 77
3915     if (repeat_min == 0 && repeat_max == -1)
3916     *code++ = OP_CRSTAR + repeat_type;
3917     else if (repeat_min == 1 && repeat_max == -1)
3918     *code++ = OP_CRPLUS + repeat_type;
3919     else if (repeat_min == 0 && repeat_max == 1)
3920     *code++ = OP_CRQUERY + repeat_type;
3921     else
3922     {
3923     *code++ = OP_CRRANGE + repeat_type;
3924     PUT2INC(code, 0, repeat_min);
3925     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3926     PUT2INC(code, 0, repeat_max);
3927     }
3928     }
3929    
3930     /* If previous was a bracket group, we may have to replicate it in certain
3931     cases. */
3932    
3933 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3934     *previous == OP_ONCE || *previous == OP_COND)
3935 nigel 77 {
3936     register int i;
3937     int ketoffset = 0;
3938     int len = code - previous;
3939     uschar *bralink = NULL;
3940    
3941 nigel 93 /* Repeating a DEFINE group is pointless */
3942    
3943     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3944     {
3945     *errorcodeptr = ERR55;
3946     goto FAILED;
3947     }
3948    
3949 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3950     by scanning through from the start, and compute the offset back to it
3951     from the current code pointer. There may be an OP_OPT setting following
3952     the final KET, so we can't find the end just by going back from the code
3953     pointer. */
3954    
3955     if (repeat_max == -1)
3956     {
3957     register uschar *ket = previous;
3958     do ket += GET(ket, 1); while (*ket != OP_KET);
3959     ketoffset = code - ket;
3960     }
3961    
3962     /* The case of a zero minimum is special because of the need to stick
3963     OP_BRAZERO in front of it, and because the group appears once in the
3964     data, whereas in other cases it appears the minimum number of times. For
3965     this reason, it is simplest to treat this case separately, as otherwise
3966     the code gets far too messy. There are several special subcases when the
3967     minimum is zero. */
3968    
3969     if (repeat_min == 0)
3970     {
3971 ph10 335 /* If the maximum is also zero, we used to just omit the group from the
3972     output altogether, like this:
3973 nigel 77
3974 ph10 335 ** if (repeat_max == 0)
3975     ** {
3976     ** code = previous;
3977     ** goto END_REPEAT;
3978     ** }
3979 nigel 77
3980 ph10 345 However, that fails when a group is referenced as a subroutine from
3981     elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
3982     so that it is skipped on execution. As we don't have a list of which
3983     groups are referenced, we cannot do this selectively.
3984    
3985 ph10 335 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
3986     and do no more at this point. However, we do need to adjust any
3987     OP_RECURSE calls inside the group that refer to the group itself or any
3988     internal or forward referenced group, because the offset is from the
3989     start of the whole regex. Temporarily terminate the pattern while doing
3990     this. */
3991 nigel 77
3992 ph10 335 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
3993 nigel 77 {
3994     *code = OP_END;
3995 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3996 nigel 77 memmove(previous+1, previous, len);
3997     code++;
3998 ph10 335 if (repeat_max == 0)
3999     {
4000     *previous++ = OP_SKIPZERO;
4001     goto END_REPEAT;
4002 ph10 345 }
4003 nigel 77 *previous++ = OP_BRAZERO + repeat_type;
4004     }
4005