/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 455 - (hide annotations) (download)
Sat Sep 26 19:12:32 2009 UTC (4 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 217318 byte(s)
Added lower bound length-finding to pcre_study() and use it when matching; make 
the value available via pcre_fullinfo(); also fixed bugs connected with
pcre_study() in pcre_dfa_exec(). 

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a (byte b/8, bit b%8 within that byte). Both arguments and the
whole expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 1), expand correctly. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 ph10 392 in UTF-8 mode. */
104 ph10 391
105 ph10 392 static const short int escapes[] = {   /* Indexed by (c - CHAR_0); one entry per character from '0' to 'z' */
106 ph10 391 0, 0,                                  /* '0'  '1' */
107     0, 0,                                  /* '2'  '3' */
108 ph10 392 0, 0,                                  /* '4'  '5' */
109     0, 0,                                  /* '6'  '7' */
110     0, 0,                                  /* '8'  '9' */
111 ph10 391 CHAR_COLON, CHAR_SEMICOLON,            /* ':'  ';' */
112 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* '<'  '=' */
113 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* '>'  '?' */
114 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,            /* '@'  'A' */
115     -ESC_B, -ESC_C,                        /* 'B'  'C' */
116     -ESC_D, -ESC_E,                        /* 'D'  'E' */
117     0, -ESC_G,                             /* 'F'  'G' */
118     -ESC_H, 0,                             /* 'H'  'I' */
119     0, -ESC_K,                             /* 'J'  'K' */
120 ph10 391 0, 0,                                  /* 'L'  'M' */
121 ph10 392 0, 0,                                  /* 'N'  'O' */
122 ph10 391 -ESC_P, -ESC_Q,                        /* 'P'  'Q' */
123     -ESC_R, -ESC_S,                        /* 'R'  'S' */
124 ph10 392 0, 0,                                  /* 'T'  'U' */
125     -ESC_V, -ESC_W,                        /* 'V'  'W' */
126     -ESC_X, 0,                             /* 'X'  'Y' */
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,      /* 'Z'  '[' */
128 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash  ']' */
129 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* '^'  '_' */
130 ph10 391 CHAR_GRAVE_ACCENT, 7,                  /* '`'  'a' (7 is returned as a data value) */
131 ph10 392 -ESC_b, 0,                             /* 'b'  'c' */
132     -ESC_d, ESC_e,                         /* 'd'  'e' */
133 ph10 391 ESC_f, 0,                              /* 'f'  'g' */
134     -ESC_h, 0,                             /* 'h'  'i' */
135 ph10 392 0, -ESC_k,                             /* 'j'  'k' */
136 ph10 391 0, 0,                                  /* 'l'  'm' */
137     ESC_n, 0,                              /* 'n'  'o' */
138 ph10 392 -ESC_p, 0,                             /* 'p'  'q' */
139     ESC_r, -ESC_s,                         /* 'r'  's' */
140 ph10 391 ESC_tee, 0,                            /* 't'  'u' */
141 ph10 392 -ESC_v, -ESC_w,                        /* 'v'  'w' */
142     0, 0,                                  /* 'x'  'y' */
143 ph10 391 -ESC_z                                 /* 'z' */
144 nigel 77 };
145    
146 ph10 392 #else
147 ph10 391
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150 nigel 77 static const short int escapes[] = {
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 392 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 ph10 391 platforms. */
183 ph10 210
184     typedef struct verbitem {
185     int len;                 /* Length of the verb's name (e.g. 6 for ACCEPT) */
186     int op;                  /* OP_xxx value paired with the name; presumably the opcode compiled for the verb */
187 ph10 211 } verbitem;
188 ph10 210
/* All the verb names in one string. NOTE(review): each STRING_xxx0 macro
presumably expands to the name followed by a NUL separator - confirm in
pcre_internal.h. */
189 ph10 240 static const char verbnames[] =
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
/* One entry per name in verbnames, in the same order; the entries are matched
to the names purely by position, so the two must be kept in step. Both F and
FAIL map to OP_FAIL. */
198 ph10 327 static const verbitem verbs[] = {
199 ph10 240 { 6, OP_ACCEPT },
200     { 6, OP_COMMIT },
201     { 1, OP_FAIL },
202     { 4, OP_FAIL },
203     { 5, OP_PRUNE },
204     { 4, OP_SKIP },
205     { 4, OP_THEN }
206 ph10 210 };
207    
/* Number of entries in the verbs table. */
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
217 ph10 240 static const char posix_names[] =
218 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
223     static const uschar posix_name_lengths[] = {
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
236     static const int posix_class_maps[] = {
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
254 nigel 93 #define STRING(a) # a
255     #define XSTRING(s) STRING(s)
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
267 ph10 240 static const char error_texts[] =
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 345 "] is an invalid data character in JavaScript compatibility mode";
345 nigel 77
346    
347     /* Table to identify digits and hex digits. This is used when compiling
348     patterns. Note that the tables in chartables are dependent on the locale, and
349     may mark arbitrary characters as digits - but the PCRE compiling code expects
350     to handle only 0-9 (decimal) and 0-9, a-f, A-F (hexadecimal) as digits when compiling. That is why we have
351     a private table here. It costs 256 bytes, but it is a lot faster than doing
352     character value tests (at least in some simple cases I timed), and in some
353     applications one wants PCRE to compile efficiently as well as match
354     efficiently.
355    
356     For convenience, we use the same bit definitions as in chartables:
357    
358     0x04 decimal digit
359     0x08 hexadecimal digit
360    
361     Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363 ph10 392 #ifndef EBCDIC
364 ph10 391
365 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 ph10 391 UTF-8 mode. */
367    
368 nigel 77 static const unsigned char digitab[] =
369     {
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
375     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
376     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
377     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
378     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
382     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403 ph10 392 #else
404 ph10 391
405     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
407 nigel 77 static const unsigned char digitab[] =
408     {
409     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
410     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
414     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
420 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
421 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
423     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
425     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
427     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
431     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
433     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
439     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
440     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
441    
442     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
443     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
444     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
445     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
447     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
451     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
452     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
454 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
455 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
456     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
459     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
460     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
461     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
462     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
463     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
464     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
465     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
466     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
468     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
469     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
471     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
472     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
473     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
474     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
475     #endif
476    
477    
478     /* Forward declaration (not a definition) of the static function
compile_regex(), so that code placed before its definition can call it; this
is what allows mutual recursion. */
479    
480     static BOOL
481 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
483 nigel 77
484    
485    
486     /*************************************************
487 ph10 240 * Find an error text *
488     *************************************************/
489    
490 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
491     some of the text is of unknown length, we can't use a table of offsets.
492     Instead, just count through the strings. This is not a performance issue
493 ph10 240 because it happens only when there has been a compilation error.
494    
495     Argument: the error number
496     Returns: pointer to the error string
497     */
498    
499     static const char *
500     find_error_text(int n)
501     {
502     const char *s = error_texts;
503 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
504 ph10 240 return s;
505     }
506    
507    
508     /*************************************************
509 nigel 77 * Handle escapes *
510     *************************************************/
511    
512     /* This function is called when a \ has been encountered. It either returns a
513     positive value for a simple escape such as \n, or a negative value which
514 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
515     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517     ptr is pointing at the \. On exit, it is on the final character of the escape
518     sequence.
519 nigel 77
520     Arguments:
521     ptrptr points to the pattern position pointer
522     errorcodeptr points to the errorcode variable
523     bracount number of previous extracting brackets
524     options the options bits
525     isclass TRUE if inside a character class
526    
527     Returns: zero or positive => a data character
528     negative => a special escape sequence
529 ph10 213 on error, errorcodeptr is set
530 nigel 77 */
531    
532     static int
533     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534     int options, BOOL isclass)
535     {
536 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
537     const uschar *ptr = *ptrptr + 1;
538 nigel 77 int c, i;
539    
540 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541     ptr--; /* Set pointer back to the last byte */
542    
543 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
544    
545     if (c == 0) *errorcodeptr = ERR1;
546    
547 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548     in a table. A non-zero result is something that can be returned immediately.
549 nigel 77 Otherwise further processing may be required. */
550    
551 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554 nigel 77
555 ph10 97 #else /* EBCDIC coding */
556 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
558     #endif
559    
560     /* Escapes that need further processing, or are illegal. */
561    
562     else
563     {
564     const uschar *oldptr;
565 nigel 93 BOOL braced, negated;
566    
567 nigel 77 switch (c)
568     {
569     /* A number of Perl escapes are not handled by PCRE. We give an explicit
570     error. */
571    
572 ph10 391 case CHAR_l:
573     case CHAR_L:
574     case CHAR_N:
575     case CHAR_u:
576     case CHAR_U:
577 nigel 77 *errorcodeptr = ERR37;
578     break;
579    
580 ph10 333 /* \g must be followed by one of a number of specific things:
581 ph10 345
582 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
583     backreference. If negative, it is a relative backreference. This is a Perl
584     5.10 feature.
585 ph10 345
586 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587     is part of Perl's movement towards a unified syntax for back references. As
588     this is synonymous with \k{name}, we fudge it up by pretending it really
589     was \k.
590 ph10 345
591     (3) For Oniguruma compatibility we also support \g followed by a name or a
592     number either in angle brackets or in single quotes. However, these are
593     (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 ph10 333 the -ESC_g code (cf \k). */
595 nigel 93
596 ph10 391 case CHAR_g:
597     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 ph10 333 {
599     c = -ESC_g;
600 ph10 345 break;
601     }
602 ph10 333
603     /* Handle the Perl-compatible cases */
604 ph10 345
605 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 nigel 93 {
607 ph10 171 const uschar *p;
608 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 ph10 171 {
612     c = -ESC_k;
613     break;
614 ph10 172 }
615 nigel 93 braced = TRUE;
616     ptr++;
617     }
618     else braced = FALSE;
619    
620 ph10 391 if (ptr[1] == CHAR_MINUS)
621 nigel 93 {
622     negated = TRUE;
623     ptr++;
624     }
625     else negated = FALSE;
626    
627     c = 0;
628     while ((digitab[ptr[1]] & ctype_digit) != 0)
629 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
630 ph10 220
631 ph10 333 if (c < 0) /* Integer overflow */
632 ph10 213 {
633     *errorcodeptr = ERR61;
634     break;
635 ph10 220 }
636 ph10 345
637 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 nigel 93 {
639     *errorcodeptr = ERR57;
640 ph10 213 break;
641 nigel 93 }
642 ph10 345
643 ph10 333 if (c == 0)
644     {
645     *errorcodeptr = ERR58;
646     break;
647 ph10 345 }
648 nigel 93
649     if (negated)
650     {
651     if (c > bracount)
652     {
653     *errorcodeptr = ERR15;
654 ph10 213 break;
655 nigel 93 }
656     c = bracount - (c - 1);
657     }
658    
659     c = -(ESC_REF + c);
660     break;
661    
662 nigel 77 /* The handling of escape sequences consisting of a string of digits
663     starting with one that is not zero is not straightforward. By experiment,
664     the way Perl works seems to be as follows:
665    
666     Outside a character class, the digits are read as a decimal number. If the
667     number is less than 10, or if there are that many previous extracting
668     left brackets, then it is a back reference. Otherwise, up to three octal
669     digits are read to form an escaped byte. Thus \123 is likely to be octal
670     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671     value is greater than 377, the least significant 8 bits are taken. Inside a
672     character class, \ followed by a digit is always an octal number. */
673    
674 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676 nigel 77
677     if (!isclass)
678     {
679     oldptr = ptr;
680 ph10 391 c -= CHAR_0;
681 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
683 ph10 333 if (c < 0) /* Integer overflow */
684 ph10 213 {
685     *errorcodeptr = ERR61;
686 ph10 220 break;
687     }
688 nigel 77 if (c < 10 || c <= bracount)
689     {
690     c = -(ESC_REF + c);
691     break;
692     }
693     ptr = oldptr; /* Put the pointer back and fall through */
694     }
695    
696     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697     generates a binary zero byte and treats the digit as a following literal.
698     Thus we have to pull back the pointer by one. */
699    
700 ph10 391 if ((c = *ptr) >= CHAR_8)
701 nigel 77 {
702     ptr--;
703     c = 0;
704     break;
705     }
706    
707     /* \0 always starts an octal number, but we may drop through to here with a
708 nigel 91 larger first octal digit. The original code used just to take the least
709     significant 8 bits of octal numbers (I think this is what early Perls used
710     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711     than 3 octal digits. */
712 nigel 77
713 ph10 391 case CHAR_0:
714     c -= CHAR_0;
715     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716     c = c * 8 + *(++ptr) - CHAR_0;
717 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 nigel 77 break;
719    
720 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
721     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722     treated as a data character. */
723 nigel 77
724 ph10 391 case CHAR_x:
725     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 nigel 77 {
727     const uschar *pt = ptr + 2;
728 nigel 87 int count = 0;
729    
730 nigel 77 c = 0;
731     while ((digitab[*pt] & ctype_xdigit) != 0)
732     {
733 nigel 87 register int cc = *pt++;
734 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 nigel 77 count++;
736 nigel 87
737 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 ph10 97 #else /* EBCDIC coding */
741 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 nigel 77 #endif
744     }
745 nigel 87
746 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 nigel 77 {
748 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 nigel 77 ptr = pt;
750     break;
751     }
752 nigel 87
753 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
754     recognize this construct; fall through to the normal \x handling. */
755     }
756    
757 nigel 87 /* Read just a single-byte hex-defined char */
758 nigel 77
759     c = 0;
760     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761     {
762 ph10 391 int cc; /* Some compilers don't like */
763     cc = *(++ptr); /* ++ in initializers */
764     #ifndef EBCDIC /* ASCII/UTF-8 coding */
765     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 ph10 97 #else /* EBCDIC coding */
768 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 nigel 77 #endif
771     }
772     break;
773    
774 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775     This coding is ASCII-specific, but then the whole concept of \cx is
776     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777 nigel 77
778 ph10 391 case CHAR_c:
779 nigel 77 c = *(++ptr);
780     if (c == 0)
781     {
782     *errorcodeptr = ERR2;
783 ph10 213 break;
784 nigel 77 }
785    
786 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 nigel 77 c ^= 0x40;
789 ph10 97 #else /* EBCDIC coding */
790 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 nigel 77 c ^= 0xC0;
792     #endif
793     break;
794    
795     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797     otherwise, for Perl compatibility, it is a literal. This code looks a bit
798     odd, but there used to be some cases other than the default, and there may
799     be again in future, so I haven't "optimized" it. */
800 nigel 77
801     default:
802     if ((options & PCRE_EXTRA) != 0) switch(c)
803     {
804     default:
805     *errorcodeptr = ERR3;
806     break;
807     }
808     break;
809     }
810     }
811    
812     *ptrptr = ptr;
813     return c;
814     }
815    
816    
817    
818     #ifdef SUPPORT_UCP
819     /*************************************************
820     * Handle \P and \p *
821     *************************************************/
822    
823     /* This function is called after \P or \p has been encountered, provided that
824     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825     pointing at the P or p. On exit, it is pointing at the final character of the
826     escape sequence.
827    
828     Argument:
829     ptrptr points to the pattern position pointer
830     negptr points to a boolean that is set TRUE for negation else FALSE
831 nigel 87 dptr points to an int that is set to the detailed property value
832 nigel 77 errorcodeptr points to the error code variable
833    
834 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
835 nigel 77 */
836    
837     static int
838 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 nigel 77 {
840     int c, i, bot, top;
841     const uschar *ptr = *ptrptr;
842 nigel 87 char name[32];
843 nigel 77
844     c = *(++ptr);
845     if (c == 0) goto ERROR_RETURN;
846    
847     *negptr = FALSE;
848    
849 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850     negation. */
851 nigel 77
852 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
853 nigel 77 {
854 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 nigel 77 {
856     *negptr = TRUE;
857     ptr++;
858     }
859 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 nigel 77 {
861     c = *(++ptr);
862     if (c == 0) goto ERROR_RETURN;
863 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 nigel 77 name[i] = c;
865     }
866 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 nigel 77 name[i] = 0;
868     }
869    
870     /* Otherwise there is just one following character */
871    
872     else
873     {
874     name[0] = c;
875     name[1] = 0;
876     }
877    
878     *ptrptr = ptr;
879    
880     /* Search for a recognized property name using binary chop */
881    
882     bot = 0;
883     top = _pcre_utt_size;
884    
885     while (bot < top)
886     {
887 nigel 87 i = (bot + top) >> 1;
888 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 nigel 87 if (c == 0)
890     {
891     *dptr = _pcre_utt[i].value;
892     return _pcre_utt[i].type;
893     }
894 nigel 77 if (c > 0) bot = i + 1; else top = i;
895     }
896    
897     *errorcodeptr = ERR47;
898     *ptrptr = ptr;
899     return -1;
900    
901     ERROR_RETURN:
902     *errorcodeptr = ERR46;
903     *ptrptr = ptr;
904     return -1;
905     }
906     #endif
907    
908    
909    
910    
911     /*************************************************
912     * Check for counted repeat *
913     *************************************************/
914    
915     /* This function is called when a '{' is encountered in a place where it might
916     start a quantifier. It looks ahead to see if it really is a quantifier or not.
917     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918     where the ddds are digits.
919    
920     Arguments:
921     p pointer to the first char after '{'
922    
923     Returns: TRUE or FALSE
924     */
925    
926     static BOOL
927     is_counted_repeat(const uschar *p)
928     {
929     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930     while ((digitab[*p] & ctype_digit) != 0) p++;
931 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932 nigel 77
933 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
934     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 nigel 77
936     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937     while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 nigel 77 }
941    
942    
943    
944     /*************************************************
945     * Read repeat counts *
946     *************************************************/
947    
948     /* Read an item of the form {n,m} and return the values. This is called only
949     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950     so the syntax is guaranteed to be correct, but we need to check the values.
951    
952     Arguments:
953     p pointer to first char after '{'
954     minp pointer to int for min
955     maxp pointer to int for max
956     returned as -1 if no max
957     errorcodeptr points to error code variable
958    
959     Returns: pointer to '}' on success;
960     current ptr on error, with errorcodeptr set non-zero
961     */
962    
963     static const uschar *
964     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965     {
966     int min = 0;
967     int max = -1;
968    
969 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
970     an integer overflow. */
971    
972 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 nigel 81 if (min < 0 || min > 65535)
974     {
975     *errorcodeptr = ERR5;
976     return p;
977     }
978 nigel 77
979 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
980     Also, max must not be less than min. */
981    
982 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 nigel 77 {
984 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 nigel 77 {
986     max = 0;
987 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 nigel 81 if (max < 0 || max > 65535)
989     {
990     *errorcodeptr = ERR5;
991     return p;
992     }
993 nigel 77 if (max < min)
994     {
995     *errorcodeptr = ERR4;
996     return p;
997     }
998     }
999     }
1000    
1001 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1002     '}'. */
1003 nigel 77
1004 nigel 81 *minp = min;
1005     *maxp = max;
1006 nigel 77 return p;
1007     }
1008    
1009    
1010    
1011     /*************************************************
1012 ph10 408 * Subroutine for finding forward reference *
1013 nigel 91 *************************************************/
1014    
1015 ph10 408 /* This recursive function is called only from find_parens() below. The
1016     top-level call starts at the beginning of the pattern. All other calls must
1017     start at a parenthesis. It scans along a pattern's text looking for capturing
1018 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1019     name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021     encountered, the name will be terminated by '>' because that is checked in the
1022 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1023 ph10 408 capturing group numbers - the (?| feature.
1024 nigel 91
1025     Arguments:
1026 ph10 408 ptrptr address of the current character pointer (updated)
1027 ph10 345 cd compile background data
1028 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1029     lorn name length, or subpattern number if name is NULL
1030     xmode TRUE if we are in /x mode
1031 ph10 411 count pointer to the current capturing subpattern number (updated)
1032 nigel 91
1033     Returns: the number of the named subpattern, or -1 if not found
1034     */
1035    
1036     static int
1037 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038     BOOL xmode, int *count)
1039 nigel 91 {
1040 ph10 408 uschar *ptr = *ptrptr;
1041     int start_count = *count;
1042     int hwm_count = start_count;
1043     BOOL dup_parens = FALSE;
1044 nigel 93
1045 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1046 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1047    
1048     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049     {
1050     if (ptr[1] == CHAR_QUESTION_MARK &&
1051 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1052 ph10 408 {
1053     ptr += 3;
1054 ph10 411 dup_parens = TRUE;
1055     }
1056 ph10 408
1057     /* Handle a normal, unnamed capturing parenthesis */
1058 ph10 411
1059 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060     {
1061     *count += 1;
1062     if (name == NULL && *count == lorn) return *count;
1063 ph10 411 ptr++;
1064 ph10 408 }
1065    
1066     /* Handle a condition. If it is an assertion, just carry on so that it
1067     is processed as normal. If not, skip to the closing parenthesis of the
1068 ph10 411 condition (there can't be any nested parens. */
1069    
1070 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071     {
1072 ph10 411 ptr += 2;
1073 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1074     {
1075     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076 ph10 411 if (*ptr != 0) ptr++;
1077 ph10 408 }
1078 ph10 411 }
1079    
1080 ph10 408 /* We have either (? or (* and not a condition */
1081    
1082     else
1083 ph10 411 {
1084 ph10 408 ptr += 2;
1085     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1086    
1087     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088 ph10 411
1089 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091     {
1092     int term;
1093     const uschar *thisname;
1094     *count += 1;
1095     if (name == NULL && *count == lorn) return *count;
1096     term = *ptr++;
1097     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098     thisname = ptr;
1099     while (*ptr != term) ptr++;
1100     if (name != NULL && lorn == ptr - thisname &&
1101     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102     return *count;
1103 ph10 438 term++;
1104 ph10 411 }
1105 ph10 408 }
1106 ph10 411 }
1107 ph10 408
1108 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1109 ph10 408 bars. */
1110    
1111 nigel 91 for (; *ptr != 0; ptr++)
1112     {
1113 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1114    
1115 ph10 391 if (*ptr == CHAR_BACKSLASH)
1116 nigel 93 {
1117 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1118 ph10 391 if (*ptr == CHAR_Q) for (;;)
1119 nigel 93 {
1120 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1121 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1122 ph10 391 if (*(++ptr) == CHAR_E) break;
1123 nigel 93 }
1124     continue;
1125     }
1126    
1127 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1128     are handled for real. If the first character is '^', skip it. Also, if the
1129     first few characters (either before or after ^) are \Q\E or \E we skip them
1130 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1131 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1132 nigel 93
1133 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1134 nigel 93 {
1135 ph10 340 BOOL negate_class = FALSE;
1136     for (;;)
1137     {
1138 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1139 ph10 340 {
1140 ph10 438 if (ptr[2] == CHAR_E)
1141     ptr+= 2;
1142     else if (strncmp((const char *)ptr+2,
1143 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144 ph10 438 ptr += 4;
1145 ph10 392 else
1146 ph10 391 break;
1147 ph10 340 }
1148 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1149     {
1150 ph10 340 negate_class = TRUE;
1151 ph10 438 ptr++;
1152     }
1153 ph10 340 else break;
1154     }
1155    
1156     /* If the next character is ']', it is a data character that must be
1157 ph10 341 skipped, except in JavaScript compatibility mode. */
1158 ph10 345
1159 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1160 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1161 ph10 345 ptr++;
1162    
1163 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1164 nigel 93 {
1165 ph10 220 if (*ptr == 0) return -1;
1166 ph10 391 if (*ptr == CHAR_BACKSLASH)
1167 nigel 93 {
1168 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1169 ph10 391 if (*ptr == CHAR_Q) for (;;)
1170 nigel 93 {
1171 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1172 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1173 ph10 391 if (*(++ptr) == CHAR_E) break;
1174 nigel 93 }
1175     continue;
1176     }
1177     }
1178     continue;
1179     }
1180    
1181     /* Skip comments in /x mode */
1182    
1183 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1184 nigel 93 {
1185 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1186 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1187 nigel 93 continue;
1188     }
1189    
1190 ph10 408 /* Check for the special metacharacters */
1191 ph10 411
1192 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1193 nigel 93 {
1194 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1195     if (rc > 0) return rc;
1196     if (*ptr == 0) goto FAIL_EXIT;
1197 nigel 93 }
1198 ph10 411
1199 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1200     {
1201     if (dup_parens && *count < hwm_count) *count = hwm_count;
1202     *ptrptr = ptr;
1203     return -1;
1204     }
1205 ph10 411
1206     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1207 ph10 408 {
1208     if (*count > hwm_count) hwm_count = *count;
1209     *count = start_count;
1210 ph10 411 }
1211 ph10 408 }
1212 nigel 93
1213 ph10 408 FAIL_EXIT:
1214     *ptrptr = ptr;
1215     return -1;
1216     }
1217 nigel 93
1218    
1219    
1220    
1221 ph10 408 /*************************************************
1222     * Find forward referenced subpattern *
1223     *************************************************/
1224 nigel 93
1225 ph10 408 /* This function scans along a pattern's text looking for capturing
1226     subpatterns, and counting them. If it finds a named pattern that matches the
1227     name it is given, it returns its number. Alternatively, if the name is NULL, it
1228     returns when it reaches a given numbered subpattern. This is used for forward
1229     references to subpatterns. We used to be able to start this scan from the
1230     current compiling point, using the current count value from cd->bracount, and
1231     do it all in a single loop, but the addition of the possibility of duplicate
1232     subpattern numbers means that we have to scan from the very start, in order to
1233     take account of such duplicates, and to use a recursive function to keep track
1234     of the different types of group.
1235    
1236     Arguments:
1237     cd compile background data
1238     name name to seek, or NULL if seeking a numbered subpattern
1239     lorn name length, or subpattern number if name is NULL
1240     xmode TRUE if we are in /x mode
1241    
1242     Returns: the number of the found subpattern, or -1 if not found
1243     */
1244    
1245     static int
1246     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1247     {
1248     uschar *ptr = (uschar *)cd->start_pattern;
1249     int count = 0;
1250     int rc;
1251    
1252     /* If the pattern does not start with an opening parenthesis, the first call
1253     to find_parens_sub() will scan right to the end (if necessary). However, if it
1254     does start with a parenthesis, find_parens_sub() will return when it hits the
1255     matching closing parens. That is why we have to have a loop. */
1256    
1257 ph10 411 for (;;)
1258     {
1259 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1260 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1261     }
1262    
1263 ph10 408 return rc;
1264 nigel 91 }
1265    
1266    
1267    
1268 ph10 408
1269 nigel 91 /*************************************************
1270 nigel 77 * Find first significant op code *
1271     *************************************************/
1272    
1273     /* This is called by several functions that scan a compiled expression looking
1274     for a fixed first character, or an anchoring op code etc. It skips over things
1275     that do not influence this. For some calls, a change of option is important.
1276     For some calls, it makes sense to skip negative forward and all backward
1277     assertions, and also the \b assertion; for others it does not.
1278    
1279     Arguments:
1280     code pointer to the start of the group
1281     options pointer to external options
1282     optbit the option bit whose changing is significant, or
1283     zero if none are
1284     skipassert TRUE if certain assertions are to be skipped
1285    
1286     Returns: pointer to the first significant opcode
1287     */
1288    
1289     static const uschar*
1290     first_significant_code(const uschar *code, int *options, int optbit,
1291     BOOL skipassert)
1292     {
1293     for (;;)
1294     {
1295     switch ((int)*code)
1296     {
1297     case OP_OPT:
1298     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1299     *options = (int)code[1];
1300     code += 2;
1301     break;
1302    
1303     case OP_ASSERT_NOT:
1304     case OP_ASSERTBACK:
1305     case OP_ASSERTBACK_NOT:
1306     if (!skipassert) return code;
1307     do code += GET(code, 1); while (*code == OP_ALT);
1308     code += _pcre_OP_lengths[*code];
1309     break;
1310    
1311     case OP_WORD_BOUNDARY:
1312     case OP_NOT_WORD_BOUNDARY:
1313     if (!skipassert) return code;
1314     /* Fall through */
1315    
1316     case OP_CALLOUT:
1317     case OP_CREF:
1318 nigel 93 case OP_RREF:
1319     case OP_DEF:
1320 nigel 77 code += _pcre_OP_lengths[*code];
1321     break;
1322    
1323     default:
1324     return code;
1325     }
1326     }
1327     /* Control never reaches here */
1328     }
1329    
1330    
1331    
1332    
1333     /*************************************************
1334 ph10 454 * Find the fixed length of a branch *
1335 nigel 77 *************************************************/
1336    
1337 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1338 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1339 ph10 454 In UTF8 mode, the result is in characters rather than bytes. The branch is
1340     temporarily terminated with OP_END when this function is called.
1341 nigel 77
1342 ph10 454 This function is called when a backward assertion is encountered, so that if it
1343     fails, the error message can point to the correct place in the pattern.
1344     However, we cannot do this when the assertion contains subroutine calls,
1345     because they can be forward references. We solve this by remembering this case
1346     and doing the check at the end; a flag specifies which mode we are running in.
1347    
1348 nigel 77 Arguments:
1349     code points to the start of the pattern (the bracket)
1350     options the compiling options
1351 ph10 454 atend TRUE if called when the pattern is complete
1352     cd the "compile data" structure
1353 nigel 77
1354 ph10 454 Returns: the fixed length,
1355     or -1 if there is no fixed length,
1356 nigel 77 or -2 if \C was encountered
1357 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1358 nigel 77 */
1359    
1360     static int
1361 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1362 nigel 77 {
1363     int length = -1;
1364    
1365     register int branchlength = 0;
1366     register uschar *cc = code + 1 + LINK_SIZE;
1367    
1368     /* Scan along the opcodes for this branch. If we get to the end of the
1369     branch, check the length against that of the other branches. */
1370    
1371     for (;;)
1372     {
1373     int d;
1374 ph10 454 uschar *ce, *cs;
1375 nigel 77 register int op = *cc;
1376     switch (op)
1377     {
1378 nigel 93 case OP_CBRA:
1379 nigel 77 case OP_BRA:
1380     case OP_ONCE:
1381     case OP_COND:
1382 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1383 nigel 77 if (d < 0) return d;
1384     branchlength += d;
1385     do cc += GET(cc, 1); while (*cc == OP_ALT);
1386     cc += 1 + LINK_SIZE;
1387     break;
1388    
1389     /* Reached end of a branch; if it's a ket it is the end of a nested
1390     call. If it's ALT it is an alternation in a nested call. If it is
1391     END it's the end of the outer call. All can be handled by the same code. */
1392    
1393     case OP_ALT:
1394     case OP_KET:
1395     case OP_KETRMAX:
1396     case OP_KETRMIN:
1397     case OP_END:
1398     if (length < 0) length = branchlength;
1399     else if (length != branchlength) return -1;
1400     if (*cc != OP_ALT) return length;
1401     cc += 1 + LINK_SIZE;
1402     branchlength = 0;
1403     break;
1404 ph10 454
1405     /* A true recursion implies not fixed length, but a subroutine call may
1406     be OK. If the subroutine is a forward reference, we can't deal with
1407     it until the end of the pattern, so return -3. */
1408    
1409     case OP_RECURSE:
1410     if (!atend) return -3;
1411     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1412     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1413     if (cc > cs && cc < ce) return -1; /* Recursion */
1414     d = find_fixedlength(cs + 2, options, atend, cd);
1415     if (d < 0) return d;
1416     branchlength += d;
1417     cc += 1 + LINK_SIZE;
1418     break;
1419 nigel 77
1420     /* Skip over assertive subpatterns */
1421    
1422     case OP_ASSERT:
1423     case OP_ASSERT_NOT:
1424     case OP_ASSERTBACK:
1425     case OP_ASSERTBACK_NOT:
1426     do cc += GET(cc, 1); while (*cc == OP_ALT);
1427     /* Fall through */
1428    
1429     /* Skip over things that don't match chars */
1430    
1431     case OP_REVERSE:
1432     case OP_CREF:
1433 nigel 93 case OP_RREF:
1434     case OP_DEF:
1435 nigel 77 case OP_OPT:
1436     case OP_CALLOUT:
1437     case OP_SOD:
1438     case OP_SOM:
1439     case OP_EOD:
1440     case OP_EODN:
1441     case OP_CIRC:
1442     case OP_DOLL:
1443     case OP_NOT_WORD_BOUNDARY:
1444     case OP_WORD_BOUNDARY:
1445     cc += _pcre_OP_lengths[*cc];
1446     break;
1447    
1448     /* Handle literal characters */
1449    
1450     case OP_CHAR:
1451     case OP_CHARNC:
1452 nigel 91 case OP_NOT:
1453 nigel 77 branchlength++;
1454     cc += 2;
1455     #ifdef SUPPORT_UTF8
1456 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1457     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1458 nigel 77 #endif
1459     break;
1460    
1461     /* Handle exact repetitions. The count is already in characters, but we
1462     need to skip over a multibyte character in UTF8 mode. */
1463    
1464     case OP_EXACT:
1465     branchlength += GET2(cc,1);
1466     cc += 4;
1467     #ifdef SUPPORT_UTF8
1468 ph10 426 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1469     cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1470 nigel 77 #endif
1471     break;
1472    
1473     case OP_TYPEEXACT:
1474     branchlength += GET2(cc,1);
1475 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1476 nigel 77 cc += 4;
1477     break;
1478    
1479     /* Handle single-char matchers */
1480    
1481     case OP_PROP:
1482     case OP_NOTPROP:
1483 nigel 87 cc += 2;
1484 nigel 77 /* Fall through */
1485    
1486     case OP_NOT_DIGIT:
1487     case OP_DIGIT:
1488     case OP_NOT_WHITESPACE:
1489     case OP_WHITESPACE:
1490     case OP_NOT_WORDCHAR:
1491     case OP_WORDCHAR:
1492     case OP_ANY:
1493 ph10 342 case OP_ALLANY:
1494 nigel 77 branchlength++;
1495     cc++;
1496     break;
1497    
1498     /* The single-byte matcher isn't allowed */
1499    
1500     case OP_ANYBYTE:
1501     return -2;
1502    
1503     /* Check a class for variable quantification */
1504    
1505     #ifdef SUPPORT_UTF8
1506     case OP_XCLASS:
1507     cc += GET(cc, 1) - 33;
1508     /* Fall through */
1509     #endif
1510    
1511     case OP_CLASS:
1512     case OP_NCLASS:
1513     cc += 33;
1514    
1515     switch (*cc)
1516     {
1517     case OP_CRSTAR:
1518     case OP_CRMINSTAR:
1519     case OP_CRQUERY:
1520     case OP_CRMINQUERY:
1521     return -1;
1522    
1523     case OP_CRRANGE:
1524     case OP_CRMINRANGE:
1525     if (GET2(cc,1) != GET2(cc,3)) return -1;
1526     branchlength += GET2(cc,1);
1527     cc += 5;
1528     break;
1529    
1530     default:
1531     branchlength++;
1532     }
1533     break;
1534    
1535     /* Anything else is variable length */
1536    
1537     default:
1538     return -1;
1539     }
1540     }
1541     /* Control never gets here */
1542     }
1543    
1544    
1545    
1546    
1547     /*************************************************
1548 ph10 454 * Scan compiled regex for specific bracket *
1549 nigel 77 *************************************************/
1550    
1551     /* This little function scans through a compiled pattern until it finds a
1552 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1553 ph10 455 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1554     so that it can be called from pcre_study() when finding the minimum matching
1555     length.
1556 nigel 77
1557     Arguments:
1558     code points to start of expression
1559     utf8 TRUE in UTF-8 mode
1560 ph10 454 number the required bracket number or negative to find a lookbehind
1561 nigel 77
1562     Returns: pointer to the opcode for the bracket, or NULL if not found
1563     */
1564    
1565 ph10 455 const uschar *
1566     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1567 nigel 77 {
1568     for (;;)
1569     {
1570     register int c = *code;
1571     if (c == OP_END) return NULL;
1572 nigel 91
1573     /* XCLASS is used for classes that cannot be represented just by a bit
1574     map. This includes negated single high-valued characters. The length in
1575     the table is zero; the actual length is stored in the compiled code. */
1576    
1577     if (c == OP_XCLASS) code += GET(code, 1);
1578 ph10 454
1579     /* Handle recursion */
1580    
1581     else if (c == OP_REVERSE)
1582     {
1583     if (number < 0) return (uschar *)code;
1584     code += _pcre_OP_lengths[c];
1585     }
1586 nigel 91
1587 nigel 93 /* Handle capturing bracket */
1588 nigel 91
1589 nigel 93 else if (c == OP_CBRA)
1590 nigel 77 {
1591 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1592 nigel 77 if (n == number) return (uschar *)code;
1593 nigel 93 code += _pcre_OP_lengths[c];
1594 nigel 77 }
1595 nigel 91
1596 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1597     repeated character types, we have to test for \p and \P, which have an extra
1598 ph10 218 two bytes of parameters. */
1599 nigel 91
1600 nigel 77 else
1601     {
1602 ph10 218 switch(c)
1603     {
1604     case OP_TYPESTAR:
1605     case OP_TYPEMINSTAR:
1606     case OP_TYPEPLUS:
1607     case OP_TYPEMINPLUS:
1608     case OP_TYPEQUERY:
1609     case OP_TYPEMINQUERY:
1610     case OP_TYPEPOSSTAR:
1611     case OP_TYPEPOSPLUS:
1612     case OP_TYPEPOSQUERY:
1613     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1614 ph10 220 break;
1615 ph10 221
1616     case OP_TYPEUPTO:
1617     case OP_TYPEMINUPTO:
1618     case OP_TYPEEXACT:
1619     case OP_TYPEPOSUPTO:
1620     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1621     break;
1622 ph10 220 }
1623    
1624 ph10 218 /* Add in the fixed length from the table */
1625 ph10 220
1626 nigel 77 code += _pcre_OP_lengths[c];
1627 ph10 220
1628 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1629     a multi-byte character. The length in the table is a minimum, so we have to
1630     arrange to skip the extra bytes. */
1631 ph10 220
1632 ph10 107 #ifdef SUPPORT_UTF8
1633 nigel 77 if (utf8) switch(c)
1634     {
1635     case OP_CHAR:
1636     case OP_CHARNC:
1637     case OP_EXACT:
1638     case OP_UPTO:
1639     case OP_MINUPTO:
1640 nigel 93 case OP_POSUPTO:
1641 nigel 77 case OP_STAR:
1642     case OP_MINSTAR:
1643 nigel 93 case OP_POSSTAR:
1644 nigel 77 case OP_PLUS:
1645     case OP_MINPLUS:
1646 nigel 93 case OP_POSPLUS:
1647 nigel 77 case OP_QUERY:
1648     case OP_MINQUERY:
1649 nigel 93 case OP_POSQUERY:
1650     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1651 nigel 77 break;
1652     }
1653 ph10 369 #else
1654     (void)(utf8); /* Keep compiler happy by referencing function argument */
1655 ph10 111 #endif
1656 nigel 77 }
1657     }
1658     }
1659    
1660    
1661    
1662     /*************************************************
1663     * Scan compiled regex for recursion reference *
1664     *************************************************/
1665    
1666     /* This little function scans through a compiled pattern until it finds an
1667     instance of OP_RECURSE.
1668    
1669     Arguments:
1670     code points to start of expression
1671     utf8 TRUE in UTF-8 mode
1672    
1673     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1674     */
1675    
1676     static const uschar *
1677     find_recurse(const uschar *code, BOOL utf8)
1678     {
1679     for (;;)
1680     {
1681     register int c = *code;
1682     if (c == OP_END) return NULL;
1683 nigel 91 if (c == OP_RECURSE) return code;
1684 ph10 220
1685 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1686     map. This includes negated single high-valued characters. The length in
1687     the table is zero; the actual length is stored in the compiled code. */
1688    
1689     if (c == OP_XCLASS) code += GET(code, 1);
1690    
1691 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1692     repeated character types, we have to test for \p and \P, which have an extra
1693 ph10 218 two bytes of parameters. */
1694 nigel 91
1695 nigel 77 else
1696     {
1697 ph10 218 switch(c)
1698     {
1699     case OP_TYPESTAR:
1700     case OP_TYPEMINSTAR:
1701     case OP_TYPEPLUS:
1702     case OP_TYPEMINPLUS:
1703     case OP_TYPEQUERY:
1704     case OP_TYPEMINQUERY:
1705     case OP_TYPEPOSSTAR:
1706     case OP_TYPEPOSPLUS:
1707     case OP_TYPEPOSQUERY:
1708     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1709 ph10 220 break;
1710 ph10 221
1711     case OP_TYPEPOSUPTO:
1712     case OP_TYPEUPTO:
1713     case OP_TYPEMINUPTO:
1714     case OP_TYPEEXACT:
1715     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1716     break;
1717 ph10 220 }
1718    
1719 ph10 218 /* Add in the fixed length from the table */
1720    
1721 nigel 77 code += _pcre_OP_lengths[c];
1722 ph10 220
1723 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1724     by a multi-byte character. The length in the table is a minimum, so we have
1725     to arrange to skip the extra bytes. */
1726 ph10 220
1727 ph10 107 #ifdef SUPPORT_UTF8
1728 nigel 77 if (utf8) switch(c)
1729     {
1730     case OP_CHAR:
1731     case OP_CHARNC:
1732     case OP_EXACT:
1733     case OP_UPTO:
1734     case OP_MINUPTO:
1735 nigel 93 case OP_POSUPTO:
1736 nigel 77 case OP_STAR:
1737     case OP_MINSTAR:
1738 nigel 93 case OP_POSSTAR:
1739 nigel 77 case OP_PLUS:
1740     case OP_MINPLUS:
1741 nigel 93 case OP_POSPLUS:
1742 nigel 77 case OP_QUERY:
1743     case OP_MINQUERY:
1744 nigel 93 case OP_POSQUERY:
1745     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1746 nigel 77 break;
1747     }
1748 ph10 369 #else
1749     (void)(utf8); /* Keep compiler happy by referencing function argument */
1750 ph10 111 #endif
1751 nigel 77 }
1752     }
1753     }
1754    
1755    
1756    
1757     /*************************************************
1758     * Scan compiled branch for non-emptiness *
1759     *************************************************/
1760    
1761     /* This function scans through a branch of a compiled pattern to see whether it
1762 nigel 93 can match the empty string or not. It is called from could_be_empty()
1763     below and from compile_branch() when checking for an unlimited repeat of a
1764     group that can match nothing. Note that first_significant_code() skips over
1765 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1766     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1767     bracket whose current branch will already have been scanned.
1768 nigel 77
1769     Arguments:
1770     code points to start of search
1771     endcode points to where to stop
1772     utf8 TRUE if in UTF8 mode
1773    
1774     Returns: TRUE if what is matched could be empty
1775     */
1776    
1777     static BOOL
1778     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1779     {
1780     register int c;
1781 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1782 nigel 77 code < endcode;
1783     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1784     {
1785     const uschar *ccode;
1786    
1787     c = *code;
1788 ph10 286
1789     /* Skip over forward assertions; the other assertions are skipped by
1790 ph10 282 first_significant_code() with a TRUE final argument. */
1791 ph10 286
1792 ph10 282 if (c == OP_ASSERT)
1793 ph10 286 {
1794 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1795     c = *code;
1796     continue;
1797 ph10 286 }
1798 ph10 172
1799 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1800 nigel 77
1801 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1802 ph10 170 {
1803 ph10 172 code += _pcre_OP_lengths[c];
1804 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1805     c = *code;
1806     continue;
1807     }
1808    
1809     /* For other groups, scan the branches. */
1810 ph10 172
1811 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1812 nigel 77 {
1813     BOOL empty_branch;
1814     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1815 ph10 406
1816     /* If a conditional group has only one branch, there is a second, implied,
1817 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1818     Otherwise, scan the individual branches of the group. */
1819 ph10 406
1820 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1821 nigel 77 code += GET(code, 1);
1822 ph10 395 else
1823 ph10 406 {
1824 ph10 395 empty_branch = FALSE;
1825     do
1826     {
1827     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1828     empty_branch = TRUE;
1829     code += GET(code, 1);
1830     }
1831     while (*code == OP_ALT);
1832     if (!empty_branch) return FALSE; /* All branches are non-empty */
1833 nigel 77 }
1834 ph10 406
1835 ph10 172 c = *code;
1836 nigel 93 continue;
1837 nigel 77 }
1838    
1839 nigel 93 /* Handle the other opcodes */
1840    
1841     switch (c)
1842 nigel 77 {
1843 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1844     cannot be represented just by a bit map. This includes negated single
1845     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1846 ph10 220 actual length is stored in the compiled code, so we must update "code"
1847 ph10 216 here. */
1848 nigel 77
1849     #ifdef SUPPORT_UTF8
1850     case OP_XCLASS:
1851 ph10 216 ccode = code += GET(code, 1);
1852 nigel 77 goto CHECK_CLASS_REPEAT;
1853     #endif
1854    
1855     case OP_CLASS:
1856     case OP_NCLASS:
1857     ccode = code + 33;
1858    
1859     #ifdef SUPPORT_UTF8
1860     CHECK_CLASS_REPEAT:
1861     #endif
1862    
1863     switch (*ccode)
1864     {
1865     case OP_CRSTAR: /* These could be empty; continue */
1866     case OP_CRMINSTAR:
1867     case OP_CRQUERY:
1868     case OP_CRMINQUERY:
1869     break;
1870    
1871     default: /* Non-repeat => class must match */
1872     case OP_CRPLUS: /* These repeats aren't empty */
1873     case OP_CRMINPLUS:
1874     return FALSE;
1875    
1876     case OP_CRRANGE:
1877     case OP_CRMINRANGE:
1878     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1879     break;
1880     }
1881     break;
1882    
1883     /* Opcodes that must match a character */
1884    
1885     case OP_PROP:
1886     case OP_NOTPROP:
1887     case OP_EXTUNI:
1888     case OP_NOT_DIGIT:
1889     case OP_DIGIT:
1890     case OP_NOT_WHITESPACE:
1891     case OP_WHITESPACE:
1892     case OP_NOT_WORDCHAR:
1893     case OP_WORDCHAR:
1894     case OP_ANY:
1895 ph10 345 case OP_ALLANY:
1896 nigel 77 case OP_ANYBYTE:
1897     case OP_CHAR:
1898     case OP_CHARNC:
1899     case OP_NOT:
1900     case OP_PLUS:
1901     case OP_MINPLUS:
1902 nigel 93 case OP_POSPLUS:
1903 nigel 77 case OP_EXACT:
1904     case OP_NOTPLUS:
1905     case OP_NOTMINPLUS:
1906 nigel 93 case OP_NOTPOSPLUS:
1907 nigel 77 case OP_NOTEXACT:
1908     case OP_TYPEPLUS:
1909     case OP_TYPEMINPLUS:
1910 nigel 93 case OP_TYPEPOSPLUS:
1911 nigel 77 case OP_TYPEEXACT:
1912     return FALSE;
1913 ph10 227
1914     /* These are going to continue, as they may be empty, but we have to
1915     fudge the length for the \p and \P cases. */
1916    
1917 ph10 224 case OP_TYPESTAR:
1918     case OP_TYPEMINSTAR:
1919     case OP_TYPEPOSSTAR:
1920     case OP_TYPEQUERY:
1921     case OP_TYPEMINQUERY:
1922     case OP_TYPEPOSQUERY:
1923     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1924 ph10 227 break;
1925    
1926 ph10 224 /* Same for these */
1927 ph10 227
1928 ph10 224 case OP_TYPEUPTO:
1929     case OP_TYPEMINUPTO:
1930     case OP_TYPEPOSUPTO:
1931     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1932     break;
1933 nigel 77
1934     /* End of branch */
1935    
1936     case OP_KET:
1937     case OP_KETRMAX:
1938     case OP_KETRMIN:
1939     case OP_ALT:
1940     return TRUE;
1941    
1942 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1943     MINUPTO, and POSUPTO may be followed by a multibyte character */
1944 nigel 77
1945     #ifdef SUPPORT_UTF8
1946     case OP_STAR:
1947     case OP_MINSTAR:
1948 nigel 93 case OP_POSSTAR:
1949 nigel 77 case OP_QUERY:
1950     case OP_MINQUERY:
1951 nigel 93 case OP_POSQUERY:
1952 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1953     break;
1954    
1955 nigel 77 case OP_UPTO:
1956     case OP_MINUPTO:
1957 nigel 93 case OP_POSUPTO:
1958 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
1959 nigel 77 break;
1960     #endif
1961     }
1962     }
1963    
1964     return TRUE;
1965     }
1966    
1967    
1968    
1969     /*************************************************
1970     * Scan compiled regex for non-emptiness *
1971     *************************************************/
1972    
1973     /* This function is called to check for left recursive calls. We want to check
1974     the current branch of the current pattern to see if it could match the empty
1975     string. If it could, we must look outwards for branches at other levels,
1976     stopping when we pass beyond the bracket which is the subject of the recursion.
1977    
1978     Arguments:
1979     code points to start of the recursion
1980     endcode points to where to stop (current RECURSE item)
1981     bcptr points to the chain of current (unclosed) branch starts
1982     utf8 TRUE if in UTF-8 mode
1983    
1984     Returns: TRUE if what is matched could be empty
1985     */
1986    
1987     static BOOL
1988     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1989     BOOL utf8)
1990     {
1991     while (bcptr != NULL && bcptr->current >= code)
1992     {
1993     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1994     bcptr = bcptr->outer;
1995     }
1996     return TRUE;
1997     }
1998    
1999    
2000    
2001     /*************************************************
2002     * Check for POSIX class syntax *
2003     *************************************************/
2004    
2005     /* This function is called when the sequence "[:" or "[." or "[=" is
2006 ph10 295 encountered in a character class. It checks whether this is followed by a
2007 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2008 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2009 nigel 77
2010 ph10 298 Originally, this function only recognized a sequence of letters between the
2011     terminators, but it seems that Perl recognizes any sequence of characters,
2012     though of course unknown POSIX names are subsequently rejected. Perl gives an
2013     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2014     didn't consider this to be a POSIX class. Likewise for [:1234:].
2015 ph10 295
2016 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2017     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2018     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2019     below handles the special case of \], but does not try to do any other escape
2020     processing. This makes it different from Perl for cases such as [:l\ower:]
2021 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2022 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2023 ph10 295 I think.
2024    
2025     Arguments:
2026 nigel 77 ptr pointer to the initial [
2027     endptr where to return the end pointer
2028    
2029     Returns: TRUE or FALSE
2030     */
2031    
2032     static BOOL
2033 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2034 nigel 77 {
2035     int terminator; /* Don't combine these lines; the Solaris cc */
2036     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2037 ph10 295 for (++ptr; *ptr != 0; ptr++)
2038 nigel 77 {
2039 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2040 ph10 298 {
2041 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2042     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2043 ph10 295 {
2044     *endptr = ptr;
2045     return TRUE;
2046 ph10 298 }
2047     }
2048     }
2049 nigel 77 return FALSE;
2050     }
2051    
2052    
2053    
2054    
2055     /*************************************************
2056     * Check POSIX class name *
2057     *************************************************/
2058    
2059     /* This function is called to check the name given in a POSIX-style class entry
2060     such as [:alnum:].
2061    
2062     Arguments:
2063     ptr points to the first letter
2064     len the length of the name
2065    
2066     Returns: a value representing the name, or -1 if unknown
2067     */
2068    
2069     static int
2070     check_posix_name(const uschar *ptr, int len)
2071     {
2072 ph10 240 const char *pn = posix_names;
2073 nigel 77 register int yield = 0;
2074     while (posix_name_lengths[yield] != 0)
2075     {
2076     if (len == posix_name_lengths[yield] &&
2077 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2078 ph10 243 pn += posix_name_lengths[yield] + 1;
2079 nigel 77 yield++;
2080     }
2081     return -1;
2082     }
2083    
2084    
2085     /*************************************************
2086     * Adjust OP_RECURSE items in repeated group *
2087     *************************************************/
2088    
2089     /* OP_RECURSE items contain an offset from the start of the regex to the group
2090     that is referenced. This means that groups can be replicated for fixed
2091     repetition simply by copying (because the recursion is allowed to refer to
2092     earlier groups that are outside the current group). However, when a group is
2093 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2094     inserted before it, after it has been compiled. This means that any OP_RECURSE
2095     items within it that refer to the group itself or any contained groups have to
2096     have their offsets adjusted. That one of the jobs of this function. Before it
2097     is called, the partially compiled regex must be temporarily terminated with
2098     OP_END.
2099 nigel 77
2100 nigel 93 This function has been extended with the possibility of forward references for
2101     recursions and subroutine calls. It must also check the list of such references
2102     for the group we are dealing with. If it finds that one of the recursions in
2103     the current group is on this list, it adjusts the offset in the list, not the
2104     value in the reference (which is a group number).
2105    
2106 nigel 77 Arguments:
2107     group points to the start of the group
2108     adjust the amount by which the group is to be moved
2109     utf8 TRUE in UTF-8 mode
2110     cd contains pointers to tables etc.
2111 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2112 nigel 77
2113     Returns: nothing
2114     */
2115    
2116     static void
2117 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2118     uschar *save_hwm)
2119 nigel 77 {
2120     uschar *ptr = group;
2121 ph10 224
2122 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2123     {
2124 nigel 93 int offset;
2125     uschar *hc;
2126    
2127     /* See if this recursion is on the forward reference list. If so, adjust the
2128     reference. */
2129 ph10 345
2130 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2131     {
2132     offset = GET(hc, 0);
2133     if (cd->start_code + offset == ptr + 1)
2134     {
2135     PUT(hc, 0, offset + adjust);
2136     break;
2137     }
2138     }
2139    
2140     /* Otherwise, adjust the recursion offset if it's after the start of this
2141     group. */
2142    
2143     if (hc >= cd->hwm)
2144     {
2145     offset = GET(ptr, 1);
2146     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2147     }
2148    
2149 nigel 77 ptr += 1 + LINK_SIZE;
2150     }
2151     }
2152    
2153    
2154    
2155     /*************************************************
2156     * Insert an automatic callout point *
2157     *************************************************/
2158    
2159     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2160     callout points before each pattern item.
2161    
2162     Arguments:
2163     code current code pointer
2164     ptr current pattern pointer
2165     cd pointers to tables etc
2166    
2167     Returns: new code pointer
2168     */
2169    
2170     static uschar *
2171     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2172     {
2173     *code++ = OP_CALLOUT;
2174     *code++ = 255;
2175     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2176     PUT(code, LINK_SIZE, 0); /* Default length */
2177     return code + 2*LINK_SIZE;
2178     }
2179    
2180    
2181    
2182     /*************************************************
2183     * Complete a callout item *
2184     *************************************************/
2185    
2186     /* A callout item contains the length of the next item in the pattern, which
2187     we can't fill in till after we have reached the relevant point. This is used
2188     for both automatic and manual callouts.
2189    
2190     Arguments:
2191     previous_callout points to previous callout item
2192     ptr current pattern pointer
2193     cd pointers to tables etc
2194    
2195     Returns: nothing
2196     */
2197    
2198     static void
2199     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2200     {
2201     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2202     PUT(previous_callout, 2 + LINK_SIZE, length);
2203     }
2204    
2205    
2206    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support),
search upwards through the characters for an internal run whose "other case"
values are consecutive. Each call delivers the next such run and moves the
start value on, so that repeated calls walk through the whole range.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int first_other = 0;
unsigned int expect;

/* Find the first character that has a different other case */

while (ch <= d)
  {
  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = first_other;
expect = first_other + 1;

/* Extend the run for as long as the other-case values stay consecutive */

ch++;
while (ch <= d && UCD_OTHERCASE(ch) == expect)
  {
  ch++;
  expect++;
  }

*odptr = expect - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2252    
2253    
2254 nigel 93
2255 nigel 77 /*************************************************
2256 nigel 93 * Check if auto-possessifying is possible *
2257     *************************************************/
2258    
2259     /* This function is called for unlimited repeats of certain items, to see
2260     whether the next thing could possibly match the repeated item. If not, it makes
2261     sense to automatically possessify the repeated item.
2262    
2263     Arguments:
2264     op_code the repeated op code
2265     this data for this item, depends on the opcode
2266     utf8 TRUE in UTF-8 mode
2267     utf8_char used for utf8 character bytes, NULL if not relevant
2268     ptr next character in pattern
2269     options options bits
2270     cd contains pointers to tables etc.
2271    
2272     Returns: TRUE if possessifying is wanted
2273     */
2274    
2275     static BOOL
2276     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2277     const uschar *ptr, int options, compile_data *cd)
2278     {
2279     int next;
2280    
2281     /* Skip whitespace and comments in extended mode */
2282    
2283     if ((options & PCRE_EXTENDED) != 0)
2284     {
2285     for (;;)
2286     {
2287     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2288 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2289 nigel 93 {
2290     while (*(++ptr) != 0)
2291     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2292     }
2293     else break;
2294     }
2295     }
2296    
2297     /* If the next item is one that we can handle, get its value. A non-negative
2298     value is a character, a negative value is an escape value. */
2299    
2300 ph10 391 if (*ptr == CHAR_BACKSLASH)
2301 nigel 93 {
2302     int temperrorcode = 0;
2303     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2304     if (temperrorcode != 0) return FALSE;
2305     ptr++; /* Point after the escape sequence */
2306     }
2307    
2308     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2309     {
2310     #ifdef SUPPORT_UTF8
2311     if (utf8) { GETCHARINC(next, ptr); } else
2312     #endif
2313     next = *ptr++;
2314     }
2315    
2316     else return FALSE;
2317    
2318     /* Skip whitespace and comments in extended mode */
2319    
2320     if ((options & PCRE_EXTENDED) != 0)
2321     {
2322     for (;;)
2323     {
2324     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2325 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2326 nigel 93 {
2327     while (*(++ptr) != 0)
2328     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2329     }
2330     else break;
2331     }
2332     }
2333    
2334     /* If the next thing is itself optional, we have to give up. */
2335    
2336 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2337 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2338     return FALSE;
2339 nigel 93
2340     /* Now compare the next item with the previous opcode. If the previous is a
2341     positive single character match, "item" either contains the character or, if
2342     "item" is greater than 127 in utf8 mode, the character's bytes are in
2343     utf8_char. */
2344    
2345    
2346     /* Handle cases when the next item is a character. */
2347    
2348     if (next >= 0) switch(op_code)
2349     {
2350     case OP_CHAR:
2351     #ifdef SUPPORT_UTF8
2352     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2353 ph10 369 #else
2354     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2355 nigel 93 #endif
2356     return item != next;
2357    
2358     /* For CHARNC (caseless character) we must check the other case. If we have
2359     Unicode property support, we can use it to test the other case of
2360     high-valued characters. */
2361    
2362     case OP_CHARNC:
2363     #ifdef SUPPORT_UTF8
2364     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2365     #endif
2366     if (item == next) return FALSE;
2367     #ifdef SUPPORT_UTF8
2368     if (utf8)
2369     {
2370     unsigned int othercase;
2371     if (next < 128) othercase = cd->fcc[next]; else
2372     #ifdef SUPPORT_UCP
2373 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2374 nigel 93 #else
2375     othercase = NOTACHAR;
2376     #endif
2377     return (unsigned int)item != othercase;
2378     }
2379     else
2380     #endif /* SUPPORT_UTF8 */
2381     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2382    
2383     /* For OP_NOT, "item" must be a single-byte character. */
2384    
2385     case OP_NOT:
2386     if (item == next) return TRUE;
2387     if ((options & PCRE_CASELESS) == 0) return FALSE;
2388     #ifdef SUPPORT_UTF8
2389     if (utf8)
2390     {
2391     unsigned int othercase;
2392     if (next < 128) othercase = cd->fcc[next]; else
2393     #ifdef SUPPORT_UCP
2394 ph10 349 othercase = UCD_OTHERCASE(next);
2395 nigel 93 #else
2396     othercase = NOTACHAR;
2397     #endif
2398     return (unsigned int)item == othercase;
2399     }
2400     else
2401     #endif /* SUPPORT_UTF8 */
2402     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2403    
2404     case OP_DIGIT:
2405     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2406    
2407     case OP_NOT_DIGIT:
2408     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2409    
2410     case OP_WHITESPACE:
2411     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2412    
2413     case OP_NOT_WHITESPACE:
2414     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2415    
2416     case OP_WORDCHAR:
2417     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2418    
2419     case OP_NOT_WORDCHAR:
2420     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2421    
2422 ph10 180 case OP_HSPACE:
2423     case OP_NOT_HSPACE:
2424     switch(next)
2425     {
2426     case 0x09:
2427     case 0x20:
2428     case 0xa0:
2429     case 0x1680:
2430     case 0x180e:
2431     case 0x2000:
2432     case 0x2001:
2433     case 0x2002:
2434     case 0x2003:
2435     case 0x2004:
2436     case 0x2005:
2437     case 0x2006:
2438     case 0x2007:
2439     case 0x2008:
2440     case 0x2009:
2441     case 0x200A:
2442     case 0x202f:
2443     case 0x205f:
2444     case 0x3000:
2445     return op_code != OP_HSPACE;
2446     default:
2447     return op_code == OP_HSPACE;
2448     }
2449    
2450     case OP_VSPACE:
2451     case OP_NOT_VSPACE:
2452     switch(next)
2453     {
2454     case 0x0a:
2455     case 0x0b:
2456     case 0x0c:
2457     case 0x0d:
2458     case 0x85:
2459     case 0x2028:
2460     case 0x2029:
2461     return op_code != OP_VSPACE;
2462     default:
2463     return op_code == OP_VSPACE;
2464     }
2465    
2466 nigel 93 default:
2467     return FALSE;
2468     }
2469    
2470    
2471     /* Handle the case when the next item is \d, \s, etc. */
2472    
2473     switch(op_code)
2474     {
2475     case OP_CHAR:
2476     case OP_CHARNC:
2477     #ifdef SUPPORT_UTF8
2478     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2479     #endif
2480     switch(-next)
2481     {
2482     case ESC_d:
2483     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2484    
2485     case ESC_D:
2486     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2487    
2488     case ESC_s:
2489     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2490    
2491     case ESC_S:
2492     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2493    
2494     case ESC_w:
2495     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2496    
2497     case ESC_W:
2498     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2499 ph10 182
2500 ph10 180 case ESC_h:
2501     case ESC_H:
2502     switch(item)
2503     {
2504     case 0x09:
2505     case 0x20:
2506     case 0xa0:
2507     case 0x1680:
2508     case 0x180e:
2509     case 0x2000:
2510     case 0x2001:
2511     case 0x2002:
2512     case 0x2003:
2513     case 0x2004:
2514     case 0x2005:
2515     case 0x2006:
2516     case 0x2007:
2517     case 0x2008:
2518     case 0x2009:
2519     case 0x200A:
2520     case 0x202f:
2521     case 0x205f:
2522     case 0x3000:
2523     return -next != ESC_h;
2524     default:
2525     return -next == ESC_h;
2526 ph10 182 }
2527    
2528 ph10 180 case ESC_v:
2529     case ESC_V:
2530     switch(item)
2531     {
2532     case 0x0a:
2533     case 0x0b:
2534     case 0x0c:
2535     case 0x0d:
2536     case 0x85:
2537     case 0x2028:
2538     case 0x2029:
2539     return -next != ESC_v;
2540     default:
2541     return -next == ESC_v;
2542 ph10 182 }
2543 nigel 93
2544     default:
2545     return FALSE;
2546     }
2547    
2548     case OP_DIGIT:
2549 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2550     next == -ESC_h || next == -ESC_v;
2551 nigel 93
2552     case OP_NOT_DIGIT:
2553     return next == -ESC_d;
2554    
2555     case OP_WHITESPACE:
2556     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2557    
2558     case OP_NOT_WHITESPACE:
2559 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2560 nigel 93
2561 ph10 180 case OP_HSPACE:
2562     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2563    
2564     case OP_NOT_HSPACE:
2565     return next == -ESC_h;
2566 ph10 182
2567 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2568 ph10 182 case OP_VSPACE:
2569 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2570    
2571     case OP_NOT_VSPACE:
2572 ph10 182 return next == -ESC_v;
2573 ph10 180
2574 nigel 93 case OP_WORDCHAR:
2575 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2576 nigel 93
2577     case OP_NOT_WORDCHAR:
2578     return next == -ESC_w || next == -ESC_d;
2579 ph10 182
2580 nigel 93 default:
2581     return FALSE;
2582     }
2583    
2584     /* Control does not reach here */
2585     }
2586    
2587    
2588    
2589     /*************************************************
2590 nigel 77 * Compile one branch *
2591     *************************************************/
2592    
2593 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2594 nigel 77 changed during the branch, the pointer is used to change the external options
2595 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2596     to find out the amount of memory needed, as well as during the real compile
2597     phase. The value of lengthptr distinguishes the two phases.
2598 nigel 77
2599     Arguments:
2600     optionsptr pointer to the option bits
2601     codeptr points to the pointer to the current code point
2602     ptrptr points to the current pattern pointer
2603     errorcodeptr points to error code variable
2604     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2605     reqbyteptr set to the last literal character required, else < 0
2606     bcptr points to current branch chain
2607     cd contains pointers to tables etc.
2608 nigel 93 lengthptr NULL during the real compile phase
2609     points to length accumulator during pre-compile phase
2610 nigel 77
2611     Returns: TRUE on success
2612     FALSE, with *errorcodeptr set non-zero on error
2613     */
2614    
2615     static BOOL
2616 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2617     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2618     compile_data *cd, int *lengthptr)
2619 nigel 77 {
2620     int repeat_type, op_type;
2621     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2622     int bravalue = 0;
2623     int greedy_default, greedy_non_default;
2624     int firstbyte, reqbyte;
2625     int zeroreqbyte, zerofirstbyte;
2626     int req_caseopt, reqvary, tempreqvary;
2627     int options = *optionsptr;
2628     int after_manual_callout = 0;
2629 nigel 93 int length_prevgroup = 0;
2630 nigel 77 register int c;
2631     register uschar *code = *codeptr;
2632 nigel 93 uschar *last_code = code;
2633     uschar *orig_code = code;
2634 nigel 77 uschar *tempcode;
2635     BOOL inescq = FALSE;
2636     BOOL groupsetfirstbyte = FALSE;
2637     const uschar *ptr = *ptrptr;
2638     const uschar *tempptr;
2639     uschar *previous = NULL;
2640     uschar *previous_callout = NULL;
2641 nigel 93 uschar *save_hwm = NULL;
2642 nigel 77 uschar classbits[32];
2643    
2644     #ifdef SUPPORT_UTF8
2645     BOOL class_utf8;
2646     BOOL utf8 = (options & PCRE_UTF8) != 0;
2647     uschar *class_utf8data;
2648 ph10 300 uschar *class_utf8data_base;
2649 nigel 77 uschar utf8_char[6];
2650     #else
2651     BOOL utf8 = FALSE;
2652 nigel 93 uschar *utf8_char = NULL;
2653 nigel 77 #endif
2654    
2655 nigel 93 #ifdef DEBUG
2656     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2657     #endif
2658    
2659 nigel 77 /* Set up the default and non-default settings for greediness */
2660    
2661     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2662     greedy_non_default = greedy_default ^ 1;
2663    
2664     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2665     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2666     matches a non-fixed char first char; reqbyte just remains unset if we never
2667     find one.
2668    
2669     When we hit a repeat whose minimum is zero, we may have to adjust these values
2670     to take the zero repeat into account. This is implemented by setting them to
2671     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2672     item types that can be repeated set these backoff variables appropriately. */
2673    
2674     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2675    
2676     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2677     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2678     value > 255. It is added into the firstbyte or reqbyte variables to record the
2679     case status of the value. This is used only for ASCII characters. */
2680    
2681     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2682    
2683     /* Switch on next character until the end of the branch */
2684    
2685     for (;; ptr++)
2686     {
2687     BOOL negate_class;
2688 ph10 286 BOOL should_flip_negation;
2689 nigel 77 BOOL possessive_quantifier;
2690     BOOL is_quantifier;
2691 nigel 93 BOOL is_recurse;
2692 ph10 180 BOOL reset_bracount;
2693 nigel 77 int class_charcount;
2694     int class_lastchar;
2695     int newoptions;
2696     int recno;
2697 ph10 172 int refsign;
2698 nigel 77 int skipbytes;
2699     int subreqbyte;
2700     int subfirstbyte;
2701 nigel 93 int terminator;
2702 nigel 77 int mclength;
2703     uschar mcbuffer[8];
2704    
2705 nigel 93 /* Get next byte in the pattern */
2706 nigel 77
2707     c = *ptr;
2708 ph10 345
2709 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2710     previous cycle of this loop. */
2711    
2712     if (lengthptr != NULL)
2713     {
2714     #ifdef DEBUG
2715     if (code > cd->hwm) cd->hwm = code; /* High water info */
2716     #endif
2717     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2718     {
2719     *errorcodeptr = ERR52;
2720     goto FAILED;
2721     }
2722    
2723     /* There is at least one situation where code goes backwards: this is the
2724     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2725     the class is simply eliminated. However, it is created first, so we have to
2726     allow memory for it. Therefore, don't ever reduce the length at this point.
2727     */
2728    
2729     if (code < last_code) code = last_code;
2730 ph10 202
2731     /* Paranoid check for integer overflow */
2732    
2733     if (OFLOW_MAX - *lengthptr < code - last_code)
2734     {
2735     *errorcodeptr = ERR20;
2736     goto FAILED;
2737     }
2738    
2739 nigel 93 *lengthptr += code - last_code;
2740     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2741    
2742     /* If "previous" is set and it is not at the start of the work space, move
2743     it back to there, in order to avoid filling up the work space. Otherwise,
2744     if "previous" is NULL, reset the current code pointer to the start. */
2745    
2746     if (previous != NULL)
2747     {
2748     if (previous > orig_code)
2749     {
2750     memmove(orig_code, previous, code - previous);
2751     code -= previous - orig_code;
2752     previous = orig_code;
2753     }
2754     }
2755     else code = orig_code;
2756    
2757     /* Remember where this code item starts so we can pick up the length
2758     next time round. */
2759    
2760     last_code = code;
2761     }
2762    
2763     /* In the real compile phase, just check the workspace used by the forward
2764     reference list. */
2765    
2766     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2767     {
2768     *errorcodeptr = ERR52;
2769     goto FAILED;
2770     }
2771    
2772 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2773    
2774     if (inescq && c != 0)
2775     {
2776 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2777 nigel 77 {
2778     inescq = FALSE;
2779     ptr++;
2780     continue;
2781     }
2782     else
2783     {
2784     if (previous_callout != NULL)
2785     {
2786 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2787     complete_callout(previous_callout, ptr, cd);
2788 nigel 77 previous_callout = NULL;
2789     }
2790     if ((options & PCRE_AUTO_CALLOUT) != 0)
2791     {
2792     previous_callout = code;
2793     code = auto_callout(code, ptr, cd);
2794     }
2795     goto NORMAL_CHAR;
2796     }
2797     }
2798    
2799     /* Fill in length of a previous callout, except when the next thing is
2800     a quantifier. */
2801    
2802 ph10 392 is_quantifier =
2803 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2804     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2805 nigel 77
2806     if (!is_quantifier && previous_callout != NULL &&
2807     after_manual_callout-- <= 0)
2808     {
2809 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2810     complete_callout(previous_callout, ptr, cd);
2811 nigel 77 previous_callout = NULL;
2812     }
2813    
2814     /* In extended mode, skip white space and comments */
2815    
2816     if ((options & PCRE_EXTENDED) != 0)
2817     {
2818     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2819 ph10 391 if (c == CHAR_NUMBER_SIGN)
2820 nigel 77 {
2821 nigel 93 while (*(++ptr) != 0)
2822 nigel 91 {
2823 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2824 nigel 91 }
2825 nigel 93 if (*ptr != 0) continue;
2826    
2827 nigel 91 /* Else fall through to handle end of string */
2828     c = 0;
2829 nigel 77 }
2830     }
2831    
2832     /* No auto callout for quantifiers. */
2833    
2834     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2835     {
2836     previous_callout = code;
2837     code = auto_callout(code, ptr, cd);
2838     }
2839    
2840     switch(c)
2841     {
2842 nigel 93 /* ===================================================================*/
2843     case 0: /* The branch terminates at string end */
2844 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2845     case CHAR_RIGHT_PARENTHESIS:
2846 nigel 77 *firstbyteptr = firstbyte;
2847     *reqbyteptr = reqbyte;
2848     *codeptr = code;
2849     *ptrptr = ptr;
2850 nigel 93 if (lengthptr != NULL)
2851     {
2852 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2853     {
2854     *errorcodeptr = ERR20;
2855     goto FAILED;
2856     }
2857 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2858     DPRINTF((">> end branch\n"));
2859     }
2860 nigel 77 return TRUE;
2861    
2862 nigel 93
2863     /* ===================================================================*/
2864 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2865     the setting of any following char as a first character. */
2866    
2867 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2868 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2869     {
2870     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2871     }
2872     previous = NULL;
2873     *code++ = OP_CIRC;
2874     break;
2875    
2876 ph10 391 case CHAR_DOLLAR_SIGN:
2877 nigel 77 previous = NULL;
2878     *code++ = OP_DOLL;
2879     break;
2880    
2881     /* There can never be a first char if '.' is first, whatever happens about
2882     repeats. The value of reqbyte doesn't change either. */
2883    
2884 ph10 391 case CHAR_DOT:
2885 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2886     zerofirstbyte = firstbyte;
2887     zeroreqbyte = reqbyte;
2888     previous = code;
2889 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2890 nigel 77 break;
2891    
2892 nigel 93
2893     /* ===================================================================*/
2894 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2895     32-byte bitmap of the permitted characters, except in the special case
2896     where there is only one such character. For negated classes, we build the
2897     map as usual, then invert it at the end. However, we use a different opcode
2898     so that data characters > 255 can be handled correctly.
2899 nigel 77
2900     If the class contains characters outside the 0-255 range, a different
2901     opcode is compiled. It may optionally have a bit map for characters < 256,
2902     but those above are explicitly listed afterwards. A flag byte tells
2903     whether the bitmap is present, and whether this is a negated class or not.
2904 ph10 345
2905 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2906     default (Perl) mode, it is treated as a data character. */
2907 ph10 345
2908 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2909 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2910     {
2911     *errorcodeptr = ERR64;
2912 ph10 345 goto FAILED;
2913 ph10 336 }
2914 ph10 345 goto NORMAL_CHAR;
2915 nigel 77
2916 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2917 nigel 77 previous = code;
2918    
2919     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2920     they are encountered at the top level, so we'll do that too. */
2921    
2922 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2923 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2924 ph10 295 check_posix_syntax(ptr, &tempptr))
2925 nigel 77 {
2926 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2927 nigel 77 goto FAILED;
2928     }
2929    
2930 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2931 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2932 ph10 205 skip them too. This makes for compatibility with Perl. */
2933 ph10 208
2934 ph10 205 negate_class = FALSE;
2935     for (;;)
2936 nigel 77 {
2937     c = *(++ptr);
2938 ph10 391 if (c == CHAR_BACKSLASH)
2939 ph10 205 {
2940 ph10 392 if (ptr[1] == CHAR_E)
2941 ph10 391 ptr++;
2942 ph10 392 else if (strncmp((const char *)ptr+1,
2943     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2944 ph10 391 ptr += 3;
2945 ph10 392 else
2946 ph10 391 break;
2947 ph10 205 }
2948 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2949 ph10 205 negate_class = TRUE;
2950     else break;
2951 ph10 208 }
2952 ph10 345
2953     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2954     an initial ']' is taken as a data character -- the code below handles
2955 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2956     [^] must match any character, so generate OP_ALLANY. */
2957 ph10 345
2958 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2959 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2960 ph10 341 {
2961     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2962     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2963     zerofirstbyte = firstbyte;
2964     break;
2965 ph10 345 }
2966 nigel 77
2967 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2968     negation flag at the end, so that support for characters > 255 works
2969 ph10 264 correctly (they are all included in the class). */
2970    
2971     should_flip_negation = FALSE;
2972    
2973 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2974 nigel 93 of just a single character (as long as it's < 256). However, for higher
2975     valued UTF-8 characters, we don't yet do any optimization. */
2976 nigel 77
2977     class_charcount = 0;
2978     class_lastchar = -1;
2979    
2980 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2981     temporary bit of memory, in case the class contains only 1 character (less
2982     than 256), because in that case the compiled code doesn't use the bit map.
2983     */
2984    
2985     memset(classbits, 0, 32 * sizeof(uschar));
2986    
2987 nigel 77 #ifdef SUPPORT_UTF8
2988     class_utf8 = FALSE; /* No chars >= 256 */
2989 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2990 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2991 nigel 77 #endif
2992    
2993     /* Process characters until ] is reached. By writing this as a "do" it
2994 nigel 93 means that an initial ] is taken as a data character. At the start of the
2995     loop, c contains the first byte of the character. */
2996 nigel 77
2997 nigel 93 if (c != 0) do
2998 nigel 77 {
2999 nigel 93 const uschar *oldptr;
3000    
3001 nigel 77 #ifdef SUPPORT_UTF8
3002     if (utf8 && c > 127)
3003     { /* Braces are required because the */
3004     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3005     }
3006 ph10 309
3007 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3008 ph10 309 data and reset the pointer. This is so that very large classes that
3009 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3010 ph10 309 (which is on the stack). */
3011    
3012 ph10 300 if (lengthptr != NULL)
3013     {
3014     *lengthptr += class_utf8data - class_utf8data_base;
3015 ph10 309 class_utf8data = class_utf8data_base;
3016     }
3017    
3018 nigel 77 #endif
3019    
3020     /* Inside \Q...\E everything is literal except \E */
3021    
3022     if (inescq)
3023     {
3024 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3025 nigel 77 {
3026 nigel 93 inescq = FALSE; /* Reset literal state */
3027     ptr++; /* Skip the 'E' */
3028     continue; /* Carry on with next */
3029 nigel 77 }
3030 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3031 nigel 77 }
3032    
3033     /* Handle POSIX class names. Perl allows a negation extension of the
3034     form [:^name:]. A square bracket that doesn't match the syntax is
3035     treated as a literal. We also recognize the POSIX constructions
3036     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3037     5.6 and 5.8 do. */
3038    
3039 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3040 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3041 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3042 nigel 77 {
3043     BOOL local_negate = FALSE;
3044 nigel 87 int posix_class, taboffset, tabopt;
3045 nigel 77 register const uschar *cbits = cd->cbits;
3046 nigel 87 uschar pbits[32];
3047 nigel 77
3048 ph10 391 if (ptr[1] != CHAR_COLON)
3049 nigel 77 {
3050     *errorcodeptr = ERR31;
3051     goto FAILED;
3052     }
3053    
3054     ptr += 2;
3055 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3056 nigel 77 {
3057     local_negate = TRUE;
3058 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3059 nigel 77 ptr++;
3060     }
3061    
3062     posix_class = check_posix_name(ptr, tempptr - ptr);
3063     if (posix_class < 0)
3064     {
3065     *errorcodeptr = ERR30;
3066     goto FAILED;
3067     }
3068    
3069     /* If matching is caseless, upper and lower are converted to
3070     alpha. This relies on the fact that the class table starts with
3071     alpha, lower, upper as the first 3 entries. */
3072    
3073     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3074     posix_class = 0;
3075    
3076 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3077     because we may be adding and subtracting from it, and we don't want to
3078     subtract bits that may be in the main map already. At the end we or the
3079     result into the bit map that is being built. */
3080 nigel 77
3081     posix_class *= 3;
3082 nigel 87
3083     /* Copy in the first table (always present) */
3084    
3085     memcpy(pbits, cbits + posix_class_maps[posix_class],
3086     32 * sizeof(uschar));
3087    
3088     /* If there is a second table, add or remove it as required. */
3089    
3090     taboffset = posix_class_maps[posix_class + 1];
3091     tabopt = posix_class_maps[posix_class + 2];
3092    
3093     if (taboffset >= 0)
3094 nigel 77 {
3095 nigel 87 if (tabopt >= 0)
3096     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3097 nigel 77 else
3098 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3099 nigel 77 }
3100    
3101 nigel 87 /* Now see if we need to remove any special characters. An option
3102     value of 1 removes vertical space and 2 removes underscore. */
3103    
3104     if (tabopt < 0) tabopt = -tabopt;
3105     if (tabopt == 1) pbits[1] &= ~0x3c;
3106     else if (tabopt == 2) pbits[11] &= 0x7f;
3107    
3108     /* Add the POSIX table or its complement into the main table that is
3109     being built and we are done. */
3110    
3111     if (local_negate)
3112     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3113     else
3114     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3115    
3116 nigel 77 ptr = tempptr + 1;
3117     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3118     continue; /* End of POSIX syntax handling */
3119     }
3120    
3121     /* Backslash may introduce a single character, or it may introduce one
3122 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3123     case. Inside a class (and only there) it is treated as backspace.
3124     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3125 ph10 205 to 'or' into the one we are building. We assume they have more than one
3126 nigel 77 character in them, so set class_charcount bigger than one. */
3127    
3128 ph10 391 if (c == CHAR_BACKSLASH)
3129 nigel 77 {
3130 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3131     if (*errorcodeptr != 0) goto FAILED;
3132 nigel 77
3133 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3134     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3135     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3136 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3137     {
3138 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3139 nigel 77 {
3140     ptr += 2; /* avoid empty string */
3141     }
3142     else inescq = TRUE;
3143     continue;
3144     }
3145 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3146 nigel 77
3147     if (c < 0)
3148     {
3149     register const uschar *cbits = cd->cbits;
3150     class_charcount += 2; /* Greater than 1 is what matters */
3151 nigel 93
3152     /* Save time by not doing this in the pre-compile phase. */
3153    
3154     if (lengthptr == NULL) switch (-c)
3155 nigel 77 {
3156     case ESC_d:
3157     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3158     continue;
3159    
3160     case ESC_D:
3161 ph10 286 should_flip_negation = TRUE;
3162 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3163     continue;
3164    
3165     case ESC_w:
3166     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3167     continue;
3168    
3169     case ESC_W:
3170 ph10 286 should_flip_negation = TRUE;
3171 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3172     continue;
3173    
3174     case ESC_s:
3175     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3176     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3177     continue;
3178    
3179     case ESC_S:
3180 ph10 286 should_flip_negation = TRUE;
3181 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3182     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3183     continue;
3184    
3185 nigel 93 default: /* Not recognized; fall through */
3186     break; /* Need "default" setting to stop compiler warning. */
3187     }
3188    
3189     /* In the pre-compile phase, just do the recognition. */
3190    
3191     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3192     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3193 ph10 180
3194 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3195     they use extra memory. */
3196 ph10 180
3197 ph10 178 if (-c == ESC_h)
3198     {
3199     SETBIT(classbits, 0x09); /* VT */
3200     SETBIT(classbits, 0x20); /* SPACE */
3201 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3202 ph10 178 #ifdef SUPPORT_UTF8
3203     if (utf8)
3204 ph10 180 {
3205 ph10 178 class_utf8 = TRUE;
3206     *class_utf8data++ = XCL_SINGLE;
3207 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3208 ph10 178 *class_utf8data++ = XCL_SINGLE;
3209 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3210     *class_utf8data++ = XCL_RANGE;
3211     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3212     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3213 ph10 178 *class_utf8data++ = XCL_SINGLE;
3214 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3215 ph10 178 *class_utf8data++ = XCL_SINGLE;
3216 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3217 ph10 178 *class_utf8data++ = XCL_SINGLE;
3218 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3219     }
3220     #endif
3221     continue;
3222     }
3223 nigel 93
3224 ph10 178 if (-c == ESC_H)
3225     {
3226     for (c = 0; c < 32; c++)
3227     {
3228     int x = 0xff;
3229     switch (c)
3230 ph10 180 {
3231 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3232     case 0x20/8: x ^= 1 << (0x20%8); break;
3233     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3234     default: break;
3235     }
3236     classbits[c] |= x;
3237 ph10 180 }
3238    
3239 ph10 178 #ifdef SUPPORT_UTF8
3240     if (utf8)
3241 ph10 180 {
3242 ph10 178 class_utf8 = TRUE;
3243 ph10 180 *class_utf8data++ = XCL_RANGE;
3244     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3245     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3246     *class_utf8data++ = XCL_RANGE;
3247     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3248     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3249     *class_utf8data++ = XCL_RANGE;
3250     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3251     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3252     *class_utf8data++ = XCL_RANGE;
3253     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3254     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3255     *class_utf8data++ = XCL_RANGE;
3256     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3257     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3258     *class_utf8data++ = XCL_RANGE;
3259     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3260     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3261     *class_utf8data++ = XCL_RANGE;
3262     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3263     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3264     }
3265     #endif
3266     continue;
3267     }
3268 ph10 178
3269     if (-c == ESC_v)
3270     {
3271     SETBIT(classbits, 0x0a); /* LF */
3272     SETBIT(classbits, 0x0b); /* VT */
3273 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3274     SETBIT(classbits, 0x0d); /* CR */
3275     SETBIT(classbits, 0x85); /* NEL */
3276 ph10 178 #ifdef SUPPORT_UTF8
3277     if (utf8)
3278 ph10 180 {
3279 ph10 178 class_utf8 = TRUE;
3280 ph10 180 *class_utf8data++ = XCL_RANGE;
3281     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3282     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3283     }
3284     #endif
3285     continue;
3286     }
3287 ph10 178
3288     if (-c == ESC_V)
3289     {
3290     for (c = 0; c < 32; c++)
3291     {
3292     int x = 0xff;
3293     switch (c)
3294 ph10 180 {
3295 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3296     x ^= 1 << (0x0b%8);
3297     x ^= 1 << (0x0c%8);
3298 ph10 180 x ^= 1 << (0x0d%8);
3299 ph10 178 break;
3300     case 0x85/8: x ^= 1 << (0x85%8); break;
3301     default: break;
3302     }
3303     classbits[c] |= x;
3304 ph10 180 }
3305    
3306 ph10 178 #ifdef SUPPORT_UTF8
3307     if (utf8)
3308 ph10 180 {
3309 ph10 178 class_utf8 = TRUE;
3310 ph10 180 *class_utf8data++ = XCL_RANGE;
3311     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3312     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3313     *class_utf8data++ = XCL_RANGE;
3314     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3315     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3316     }
3317     #endif
3318     continue;
3319     }
3320 ph10 178
3321 nigel 93 /* We need to deal with \P and \p in both phases. */
3322    
3323 nigel 77 #ifdef SUPPORT_UCP
3324 nigel 93 if (-c == ESC_p || -c == ESC_P)
3325     {
3326     BOOL negated;
3327     int pdata;
3328     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3329     if (ptype < 0) goto FAILED;
3330     class_utf8 = TRUE;
3331     *class_utf8data++ = ((-c == ESC_p) != negated)?
3332     XCL_PROP : XCL_NOTPROP;
3333     *class_utf8data++ = ptype;
3334     *class_utf8data++ = pdata;
3335     class_charcount -= 2; /* Not a < 256 character */
3336 nigel 77 continue;
3337 nigel 93 }
3338 nigel 77 #endif
3339 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3340     strict mode. By default, for compatibility with Perl, they are
3341     treated as literals. */
3342 nigel 77
3343 nigel 93 if ((options & PCRE_EXTRA) != 0)
3344     {
3345     *errorcodeptr = ERR7;
3346     goto FAILED;
3347     }
3348 nigel 77
3349 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3350     c = *ptr; /* Get the final character and fall through */
3351 nigel 77 }
3352    
3353     /* Fall through if we have a single character (c >= 0). This may be
3354 nigel 93 greater than 256 in UTF-8 mode. */
3355 nigel 77
3356     } /* End of backslash handling */
3357    
3358     /* A single character may be followed by '-' to form a range. However,
3359     Perl does not permit ']' to be the end of the range. A '-' character
3360 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3361     entirely. The code for handling \Q and \E is messy. */
3362 nigel 77
3363 nigel 93 CHECK_RANGE:
3364 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3365 nigel 77 {
3366 nigel 93 inescq = FALSE;
3367     ptr += 2;
3368     }
3369    
3370     oldptr = ptr;
3371 ph10 231
3372 ph10 230 /* Remember \r or \n */
3373 ph10 231
3374 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3375 ph10 231
3376 ph10 230 /* Check for range */
3377 nigel 93
3378 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3379 nigel 93 {
3380 nigel 77 int d;
3381     ptr += 2;
3382 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3383 nigel 77
3384 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3385     mode. */
3386    
3387 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3388 nigel 93 {
3389     ptr += 2;
3390 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3391 ph10 391 { ptr += 2; continue; }
3392 nigel 93 inescq = TRUE;
3393     break;
3394     }
3395    
3396 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3397 nigel 93 {
3398     ptr = oldptr;
3399     goto LONE_SINGLE_CHARACTER;
3400     }
3401    
3402 nigel 77 #ifdef SUPPORT_UTF8
3403     if (utf8)
3404     { /* Braces are required because the */
3405     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3406     }
3407     else
3408     #endif
3409     d = *ptr; /* Not UTF-8 mode */
3410    
3411     /* The second part of a range can be a single-character escape, but
3412     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3413     in such circumstances. */
3414    
3415 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3416 nigel 77 {
3417 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3418     if (*errorcodeptr != 0) goto FAILED;
3419 nigel 77
3420 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3421 nigel 93 special means the '-' was literal */
3422 nigel 77
3423     if (d < 0)
3424     {
3425 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3426     else if (d == -ESC_X) d = CHAR_X;
3427     else if (d == -ESC_R) d = CHAR_R; else
3428 nigel 77 {
3429 nigel 93 ptr = oldptr;
3430 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3431     }
3432     }
3433     }
3434    
3435 nigel 93 /* Check that the two values are in the correct order. Optimize
3436     one-character ranges */
3437 nigel 77
3438 nigel 93 if (d < c)
3439     {
3440     *errorcodeptr = ERR8;
3441     goto FAILED;
3442     }
3443    
3444 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3445    
3446 ph10 230 /* Remember \r or \n */
3447 ph10 231
3448 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3449 ph10 231
3450 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3451     matching, we have to use an XCLASS with extra data items. Caseless
3452     matching for characters > 127 is available only if UCP support is
3453     available. */
3454    
3455     #ifdef SUPPORT_UTF8
3456     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3457     {
3458     class_utf8 = TRUE;
3459    
3460     /* With UCP support, we can find the other case equivalents of
3461     the relevant characters. There may be several ranges. Optimize how
3462     they fit with the basic range. */
3463    
3464     #ifdef SUPPORT_UCP
3465     if ((options & PCRE_CASELESS) != 0)
3466     {
3467 nigel 93 unsigned int occ, ocd;
3468     unsigned int cc = c;
3469     unsigned int origd = d;
3470 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3471     {
3472 ph10 180 if (occ >= (unsigned int)c &&
3473     ocd <= (unsigned int)d)
3474 ph10 176 continue; /* Skip embedded ranges */
3475 nigel 77
3476 ph10 180 if (occ < (unsigned int)c &&
3477 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3478 nigel 77 { /* if there is overlap, */
3479     c = occ; /* noting that if occ < c */
3480     continue; /* we can't have ocd > d */
3481     } /* because a subrange is */
3482 ph10 180 if (ocd > (unsigned int)d &&
3483 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3484 nigel 77 { /* the basic range. */
3485     d = ocd;
3486     continue;
3487     }
3488    
3489     if (occ == ocd)
3490     {
3491     *class_utf8data++ = XCL_SINGLE;
3492     }
3493     else
3494     {
3495     *class_utf8data++ = XCL_RANGE;
3496     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3497     }
3498     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3499     }
3500     }
3501     #endif /* SUPPORT_UCP */
3502    
3503     /* Now record the original range, possibly modified for UCP caseless
3504     overlapping ranges. */
3505    
3506     *class_utf8data++ = XCL_RANGE;
3507     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3508     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3509    
3510     /* With UCP support, we are done. Without UCP support, there is no
3511     caseless matching for UTF-8 characters > 127; we can use the bit map
3512     for the smaller ones. */
3513    
3514     #ifdef SUPPORT_UCP
3515     continue; /* With next character in the class */
3516     #else
3517     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3518    
3519     /* Adjust upper limit and fall through to set up the map */
3520    
3521     d = 127;
3522    
3523     #endif /* SUPPORT_UCP */
3524     }
3525     #endif /* SUPPORT_UTF8 */
3526    
3527     /* We use the bit map for all cases when not in UTF-8 mode; else
3528     ranges that lie entirely within 0-127 when there is UCP support; else
3529     for partial ranges without UCP support. */
3530    
3531 nigel 93 class_charcount += d - c + 1;
3532     class_lastchar = d;
3533    
3534     /* We can save a bit of time by skipping this in the pre-compile. */
3535    
3536     if (lengthptr == NULL) for (; c <= d; c++)
3537 nigel 77 {
3538     classbits[c/8] |= (1 << (c&7));
3539     if ((options & PCRE_CASELESS) != 0)
3540     {
3541     int uc = cd->fcc[c]; /* flip case */
3542     classbits[uc/8] |= (1 << (uc&7));
3543     }
3544     }
3545    
3546     continue; /* Go get the next char in the class */
3547     }
3548    
3549     /* Handle a lone single character - we can get here for a normal
3550     non-escape char, or after \ that introduces a single character or for an
3551     apparent range that isn't. */
3552    
3553     LONE_SINGLE_CHARACTER:
3554 ph10 231
3555 nigel 77 /* Handle a character that cannot go in the bit map */
3556    
3557     #ifdef SUPPORT_UTF8
3558     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3559     {
3560     class_utf8 = TRUE;
3561     *class_utf8data++ = XCL_SINGLE;
3562     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3563    
3564     #ifdef SUPPORT_UCP
3565     if ((options & PCRE_CASELESS) != 0)
3566     {
3567 nigel 93 unsigned int othercase;
3568 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3569 nigel 77 {
3570     *class_utf8data++ = XCL_SINGLE;
3571     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3572     }
3573     }
3574     #endif /* SUPPORT_UCP */
3575    
3576     }
3577     else
3578     #endif /* SUPPORT_UTF8 */
3579    
3580     /* Handle a single-byte character */
3581     {
3582     classbits[c/8] |= (1 << (c&7));
3583     if ((options & PCRE_CASELESS) != 0)
3584     {
3585     c = cd->fcc[c]; /* flip case */
3586     classbits[c/8] |= (1 << (c&7));
3587     }
3588     class_charcount++;
3589     class_lastchar = c;
3590     }
3591     }
3592    
3593 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3594 nigel 77
3595 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3596 nigel 77
3597 nigel 93 if (c == 0) /* Missing terminating ']' */
3598     {
3599     *errorcodeptr = ERR6;
3600     goto FAILED;
3601     }
3602 ph10 231
3603    
3604 ph10 230 /* This code has been disabled because it would mean that \s counts as
3605     an explicit \r or \n reference, and that's not really what is wanted. Now
3606     we set the flag only if there is a literal "\r" or "\n" in the class. */
3607 ph10 227
3608 ph10 230 #if 0
3609 ph10 226 /* Remember whether \r or \n are in this class */
3610 ph10 227
3611 ph10 226 if (negate_class)
3612     {
3613 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3614 ph10 226 }
3615     else
3616     {
3617 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3618 ph10 227 }
3619 ph10 230 #endif
3620 ph10 227
3621 ph10 231
3622 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3623 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3624     use of \p or \P, in other words, no use of any XCLASS features, we can
3625     optimize.
3626    
3627 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3628     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3629     operate on single-bytes only. This is an historical hangover. Maybe one day
3630     we can tidy these opcodes to handle multi-byte characters.
3631 nigel 77
3632     The optimization throws away the bit map. We turn the item into a
3633     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3634     that OP_NOT does not support multibyte characters. In the positive case, it
3635     can cause firstbyte to be set. Otherwise, there can be no first char if
3636     this item is first, whatever repeat count may follow. In the case of
3637     reqbyte, save the previous value for reinstating. */
3638    
3639     #ifdef SUPPORT_UTF8
3640 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3641 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3642 nigel 77 #else
3643     if (class_charcount == 1)
3644     #endif
3645     {
3646     zeroreqbyte = reqbyte;
3647    
3648     /* The OP_NOT opcode works on one-byte characters only. */
3649    
3650     if (negate_class)
3651     {
3652     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3653     zerofirstbyte = firstbyte;
3654     *code++ = OP_NOT;
3655     *code++ = class_lastchar;
3656     break;
3657     }
3658    
3659     /* For a single, positive character, get the value into mcbuffer, and
3660     then we can handle this with the normal one-character code. */
3661    
3662     #ifdef SUPPORT_UTF8
3663     if (utf8 && class_lastchar > 127)
3664     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3665     else
3666     #endif
3667     {
3668     mcbuffer[0] = class_lastchar;
3669     mclength = 1;
3670     }
3671     goto ONE_CHAR;
3672     } /* End of 1-char optimization */
3673    
3674     /* The general case - not the one-char optimization. If this is the first
3675     thing in the branch, there can be no first char setting, whatever the
3676     repeat count. Any reqbyte setting must remain unchanged after any kind of
3677     repeat. */
3678    
3679     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3680     zerofirstbyte = firstbyte;
3681     zeroreqbyte = reqbyte;
3682    
3683     /* If there are characters with values > 255, we have to compile an
3684 ph10 286 extended class, with its own opcode, unless there was a negated special
3685     such as \S in the class, because in that case all characters > 255 are in
3686     the class, so any that were explicitly given as well can be ignored. If
3687 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3688     characters < 256, we can omit the bitmap in the actual compiled code. */
3689 nigel 77
3690     #ifdef SUPPORT_UTF8
3691 ph10 264 if (class_utf8 && !should_flip_negation)
3692 nigel 77 {
3693     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3694     *code++ = OP_XCLASS;
3695     code += LINK_SIZE;
3696     *code = negate_class? XCL_NOT : 0;
3697    
3698 nigel 93 /* If the map is required, move up the extra data to make room for it;
3699     otherwise just move the code pointer to the end of the extra data. */
3700 nigel 77
3701     if (class_charcount > 0)
3702     {
3703     *code++ |= XCL_MAP;
3704 nigel 93 memmove(code + 32, code, class_utf8data - code);
3705 nigel 77 memcpy(code, classbits, 32);
3706 nigel 93 code = class_utf8data + 32;
3707 nigel 77 }
3708 nigel 93 else code = class_utf8data;
3709 nigel 77
3710     /* Now fill in the complete length of the item */
3711    
3712     PUT(previous, 1, code - previous);
3713     break; /* End of class handling */
3714     }
3715     #endif
3716    
3717 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3718     OP_NCLASS, depending on whether the whole class was negated and whether
3719     there were negative specials such as \S in the class. Then copy the 32-byte
3720 ph10 264 map into the code vector, negating it if necessary. */
3721 ph10 286
3722 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3723 nigel 77 if (negate_class)
3724     {
3725 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3726     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3727 nigel 77 }
3728     else
3729     {
3730     memcpy(code, classbits, 32);
3731     }
3732     code += 32;
3733     break;
3734    
3735 nigel 93
3736     /* ===================================================================*/
3737 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3738     has been tested above. */
3739    
3740 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3741 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3742     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3743     if (*errorcodeptr != 0) goto FAILED;
3744     goto REPEAT;
3745    
3746 ph10 391 case CHAR_ASTERISK:
3747 nigel 77 repeat_min = 0;
3748     repeat_max = -1;
3749     goto REPEAT;
3750    
3751 ph10 391 case CHAR_PLUS:
3752 nigel 77 repeat_min = 1;
3753     repeat_max = -1;
3754     goto REPEAT;
3755    
3756 ph10 391 case CHAR_QUESTION_MARK:
3757 nigel 77 repeat_min = 0;
3758     repeat_max = 1;
3759    
3760     REPEAT:
3761     if (previous == NULL)
3762     {
3763     *errorcodeptr = ERR9;
3764     goto FAILED;
3765     }
3766    
3767     if (repeat_min == 0)
3768     {
3769     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3770     reqbyte = zeroreqbyte; /* Ditto */
3771     }
3772    
3773     /* Remember whether this is a variable length repeat */
3774    
3775     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3776    
3777     op_type = 0; /* Default single-char op codes */
3778     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3779    
3780     /* Save start of previous item, in case we have to move it up to make space
3781     for an inserted OP_ONCE for the additional '+' extension. */
3782    
3783     tempcode = previous;
3784    
3785     /* If the next character is '+', we have a possessive quantifier. This
3786     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3787     If the next character is '?' this is a minimizing repeat, by default,
3788     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3789     repeat type to the non-default. */
3790    
3791 ph10 391 if (ptr[1] == CHAR_PLUS)
3792 nigel 77 {
3793     repeat_type = 0; /* Force greedy */
3794     possessive_quantifier = TRUE;
3795     ptr++;
3796     }
3797 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3798 nigel 77 {
3799     repeat_type = greedy_non_default;
3800     ptr++;
3801     }
3802     else repeat_type = greedy_default;
3803    
3804     /* If previous was a character match, abolish the item and generate a
3805     repeat item instead. If a char item has a minimum of more than one, ensure
3806     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3807     the first thing in a branch because the x will have gone into firstbyte
3808     instead. */
3809    
3810     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3811     {
3812     /* Deal with UTF-8 characters that take up more than one byte. It's
3813     easier to write this out separately than try to macrify it. Use c to
3814     hold the length of the character in bytes, plus 0x80 to flag that it's a
3815     length rather than a small character. */
3816    
3817     #ifdef SUPPORT_UTF8
3818     if (utf8 && (code[-1] & 0x80) != 0)
3819     {
3820     uschar *lastchar = code - 1;
3821     while((*lastchar & 0xc0) == 0x80) lastchar--;
3822     c = code - lastchar; /* Length of UTF-8 character */
3823     memcpy(utf8_char, lastchar, c); /* Save the char */
3824     c |= 0x80; /* Flag c as a length */
3825     }
3826     else
3827     #endif
3828    
3829     /* Handle the case of a single byte - either with no UTF8 support, or
3830     with UTF-8 disabled, or for a UTF-8 character < 128. */
3831    
3832     {
3833     c = code[-1];
3834     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3835     }
3836    
3837 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3838     the line is something that cannot possibly match this character. If so,
3839     automatically possessifying this item gains some performance in the case
3840     where the match fails. */
3841    
3842     if (!possessive_quantifier &&
3843     repeat_max < 0 &&
3844     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3845     options, cd))
3846     {
3847     repeat_type = 0; /* Force greedy */
3848     possessive_quantifier = TRUE;
3849     }
3850    
3851 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3852     }
3853    
3854     /* If previous was a single negated character ([^a] or similar), we use
3855     one of the special opcodes, replacing it. The code is shared with single-
3856     character repeats by setting op_type to add a suitable offset into
3857 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3858     currently used only for single-byte chars. */
3859 nigel 77
3860     else if (*previous == OP_NOT)
3861     {
3862     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3863     c = previous[1];
3864 nigel 93 if (!possessive_quantifier &&
3865     repeat_max < 0 &&
3866     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3867     {
3868     repeat_type = 0; /* Force greedy */
3869     possessive_quantifier = TRUE;
3870     }
3871 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3872     }
3873    
3874     /* If previous was a character type match (\d or similar), abolish it and
3875     create a suitable repeat item. The code is shared with single-character
3876     repeats by setting op_type to add a suitable offset into repeat_type. Note
3877     that the Unicode property types will be present only when SUPPORT_UCP is
3878     defined, but we don't wrap the little bits of code here because it just
3879     makes it horribly messy. */
3880    
3881     else if (*previous < OP_EODN)
3882     {
3883     uschar *oldcode;
3884 nigel 87 int prop_type, prop_value;
3885 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3886     c = *previous;
3887    
3888 nigel 93 if (!possessive_quantifier &&
3889     repeat_max < 0 &&
3890     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3891     {
3892     repeat_type = 0; /* Force greedy */
3893     possessive_quantifier = TRUE;
3894     }
3895    
3896 nigel 77 OUTPUT_SINGLE_REPEAT:
3897 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3898     {
3899     prop_type = previous[1];
3900     prop_value = previous[2];
3901     }
3902     else prop_type = prop_value = -1;
3903 nigel 77
3904     oldcode = code;
3905     code = previous; /* Usually overwrite previous item */
3906    
3907     /* If the maximum is zero then the minimum must also be zero; Perl allows
3908     this case, so we do too - by simply omitting the item altogether. */
3909    
3910     if (repeat_max == 0) goto END_REPEAT;
3911    
3912 ph10 426 /*--------------------------------------------------------------------*/
3913     /* This code is obsolete from release 8.00; the restriction was finally
3914     removed: */
3915    
3916 nigel 77 /* All real repeats make it impossible to handle partial matching (maybe
3917     one day we will be able to remove this restriction). */
3918 ph10 426
3919     /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3920     /*--------------------------------------------------------------------*/
3921 nigel 77
3922     /* Combine the op_type with the repeat_type */
3923    
3924     repeat_type += op_type;
3925    
3926     /* A minimum of zero is handled either as the special case * or ?, or as
3927     an UPTO, with the maximum given. */
3928    
3929     if (repeat_min == 0)
3930     {
3931     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3932     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3933     else
3934     {
3935     *code++ = OP_UPTO + repeat_type;
3936     PUT2INC(code, 0, repeat_max);
3937     }
3938     }
3939    
3940     /* A repeat minimum of 1 is optimized into some special cases. If the
3941 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3942 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3943     one less than the maximum. */
3944    
3945     else if (repeat_min == 1)
3946     {
3947     if (repeat_max == -1)
3948     *code++ = OP_PLUS + repeat_type;
3949     else
3950     {
3951     code = oldcode; /* leave previous item in place */
3952     if (repeat_max == 1) goto END_REPEAT;
3953     *code++ = OP_UPTO + repeat_type;
3954     PUT2INC(code, 0, repeat_max - 1);
3955     }
3956     }
3957    
3958     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3959     handled as an EXACT followed by an UPTO. */
3960    
3961     else
3962     {
3963     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3964     PUT2INC(code, 0, repeat_min);
3965    
3966     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3967     we have to insert the character for the previous code. For a repeated
3968 nigel 87 Unicode property match, there are two extra bytes that define the
3969 nigel 77 required property. In UTF-8 mode, long characters have their length in
3970     c, with the 0x80 bit as a flag. */
3971    
3972     if (repeat_max < 0)
3973     {
3974     #ifdef SUPPORT_UTF8
3975     if (utf8 && c >= 128)
3976     {
3977     memcpy(code, utf8_char, c & 7);
3978     code += c & 7;
3979     }
3980     else
3981     #endif
3982     {
3983     *code++ = c;
3984 nigel 87 if (prop_type >= 0)
3985     {
3986     *code++ = prop_type;
3987     *code++ = prop_value;
3988     }
3989 nigel 77 }
3990     *code++ = OP_STAR + repeat_type;
3991     }
3992    
3993     /* Else insert an UPTO if the max is greater than the min, again
3994 nigel 93 preceded by the character, for the previously inserted code. If the
3995     UPTO is just for 1 instance, we can use QUERY instead. */
3996 nigel 77
3997     else if (repeat_max != repeat_min)
3998     {
3999     #ifdef SUPPORT_UTF8
4000     if (utf8 && c >= 128)
4001     {
4002     memcpy(code, utf8_char, c & 7);
4003     code += c & 7;
4004     }
4005     else
4006     #endif
4007     *code++ = c;
4008 nigel 87 if (prop_type >= 0)
4009     {
4010     *code++ = prop_type;
4011     *code++ = prop_value;
4012     }
4013 nigel 77 repeat_max -= repeat_min;
4014 nigel 93
4015     if (repeat_max == 1)
4016