/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 412 - (hide annotations) (download)
Sat Apr 11 10:34:37 2009 UTC (5 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 211898 byte(s)
Add support for (*UTF8).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
64 ph10 178 /* Macro for setting individual bits in class bitmaps. */
65    
/* Set bit number b in bitmap a (8 bits per byte). Every argument and the whole
expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) select the intended byte and bit. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
67    
68 ph10 202 /* Maximum length value to check against when making sure that the integer that
69     holds the compiled pattern length does not overflow. We make it a bit less than
70     INT_MAX to allow for adding in group terminating bytes, so that we don't have
71     to check them every time. */
72 ph10 178
73 ph10 202 #define OFLOW_MAX (INT_MAX - 20) /* 20 below INT_MAX: headroom for group terminating bytes */
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096) /* size of the compile workspace described above */
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
100 ph10 391 #ifndef EBCDIC
101    
102     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 ph10 392 in UTF-8 mode. */
104 ph10 391
105 ph10 392 static const short int escapes[] = { /* indexed by c - CHAR_0; two entries per row */
106 ph10 391 0, 0, /* 0 1 */
107     0, 0, /* 2 3 */
108 ph10 392 0, 0, /* 4 5 */
109     0, 0, /* 6 7 */
110     0, 0, /* 8 9 */
111 ph10 391 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
112 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
113 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
114 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
115     -ESC_B, -ESC_C, /* B C */
116     -ESC_D, -ESC_E, /* D E */
117     0, -ESC_G, /* F G */
118     -ESC_H, 0, /* H I */
119     0, -ESC_K, /* J K */
120 ph10 391 0, 0, /* L M */
121 ph10 392 0, 0, /* N O */
122 ph10 391 -ESC_P, -ESC_Q, /* P Q */
123     -ESC_R, -ESC_S, /* R S */
124 ph10 392 0, 0, /* T U */
125     -ESC_V, -ESC_W, /* V W */
126     -ESC_X, 0, /* X Y */
127     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
128 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* \ ] */
129 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
130 ph10 391 CHAR_GRAVE_ACCENT, 7, /* ` a (7 is the data value returned for \a) */
131 ph10 392 -ESC_b, 0, /* b c */
132     -ESC_d, ESC_e, /* d e */
133 ph10 391 ESC_f, 0, /* f g */
134     -ESC_h, 0, /* h i */
135 ph10 392 0, -ESC_k, /* j k */
136 ph10 391 0, 0, /* l m */
137     ESC_n, 0, /* n o */
138 ph10 392 -ESC_p, 0, /* p q */
139     ESC_r, -ESC_s, /* r s */
140 ph10 391 ESC_tee, 0, /* t u */
141 ph10 392 -ESC_v, -ESC_w, /* v w */
142     0, 0, /* x y */
143 ph10 391 -ESC_z /* z */
144 nigel 77 };
145    
146 ph10 392 #else
147 ph10 391
148     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149    
150 nigel 77 static const short int escapes[] = { /* indexed by c - 0x48; each row comment is the hex code of the row's first entry */
151     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
152     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
153     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
154     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
155     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
156     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
157     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
158     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
159 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
160 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
161 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
162 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
163 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
164     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
165     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
166     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
167 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
168 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
169 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
170 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
171 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
172     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
173     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
174     };
175     #endif
176    
177    
178 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179     searched linearly. Put all the names into a single string, in order to reduce
180 ph10 392 the number of relocations when a shared library is dynamically linked. The
181     string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 ph10 391 platforms. */
183 ph10 210
184     typedef struct verbitem { /* one row of the verbs[] table */
185     int len; /* length of the verb's name (e.g. 1 for the F row, 6 for ACCEPT) */
186     int op; /* OP_xxx opcode stored for this verb */
187 ph10 211 } verbitem;
188 ph10 210
189 ph10 240 static const char verbnames[] = /* names concatenated into one string; order and lengths line up with the rows of verbs[] */
190 ph10 391 STRING_ACCEPT0
191     STRING_COMMIT0
192     STRING_F0
193     STRING_FAIL0
194     STRING_PRUNE0
195     STRING_SKIP0
196     STRING_THEN;
197 ph10 240
198 ph10 327 static const verbitem verbs[] = { /* { name length, opcode }, in verbnames order */
199 ph10 240 { 6, OP_ACCEPT }, /* ACCEPT */
200     { 6, OP_COMMIT }, /* COMMIT */
201     { 1, OP_FAIL }, /* F - same opcode as FAIL */
202     { 4, OP_FAIL }, /* FAIL */
203     { 5, OP_PRUNE }, /* PRUNE */
204     { 4, OP_SKIP }, /* SKIP */
205     { 4, OP_THEN } /* THEN */
206 ph10 210 };
207    
208 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* number of rows in verbs[] */
209 ph10 210
210    
211 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
212     now all in a single string, to reduce the number of relocations when a shared
213 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
214     length entry. The first three must be alpha, lower, upper, as this is assumed
215     for handling case independence. */
216 nigel 77
217 ph10 240 static const char posix_names[] = /* class names in one string, in the same order as posix_name_lengths and posix_class_maps */
218 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221     STRING_word0 STRING_xdigit;
222 nigel 77
223     static const uschar posix_name_lengths[] = { /* one length per name in posix_names */
224     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; /* the final 0 terminates the list */
225    
226 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
227     base map, with an optional addition or removal of another map. Then, for some
228     classes, there is some additional tweaking: for [:blank:] the vertical space
229     characters are removed, and for [:alpha:] and [:alnum:] the underscore
230     character is removed. The triples in the table consist of the base map offset,
231     second map offset or -1 if no second map, and a non-negative value for map
232     addition or a negative value for map subtraction (if there are two maps). The
233     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234     remove vertical space characters, 2 => remove underscore. */
235 nigel 77
236     static const int posix_class_maps[] = { /* three ints per class: base map, second map or -1, tweak code */
237 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
238     cbit_lower, -1, 0, /* lower */
239     cbit_upper, -1, 0, /* upper */
240     cbit_word, -1, 2, /* alnum - word without underscore */
241     cbit_print, cbit_cntrl, 0, /* ascii */
242     cbit_space, -1, 1, /* blank - a GNU extension */
243     cbit_cntrl, -1, 0, /* cntrl */
244     cbit_digit, -1, 0, /* digit */
245     cbit_graph, -1, 0, /* graph */
246     cbit_print, -1, 0, /* print */
247     cbit_punct, -1, 0, /* punct */
248     cbit_space, -1, 0, /* space */
249     cbit_word, -1, 0, /* word - a Perl extension */
250     cbit_xdigit,-1, 0 /* xdigit */
251 nigel 77 };
252    
253    
254 nigel 93 #define STRING(a) # a /* stringify the argument exactly as written */
255     #define XSTRING(s) STRING(s) /* macro-expand s first, then stringify the result */
256    
257 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
258 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
259     they are documented. Always add a new error instead. Messages marked DEAD below
260 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
261     the number of relocations needed when a shared library is loaded dynamically,
262     it is now one long string. We cannot use a table of offsets, because the
263     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264     simply count through to the one we want - this isn't a performance issue
265 ph10 240 because these strings are used only when there is a compilation error. */
266 nigel 77
267 ph10 240 static const char error_texts[] = /* messages separated by \0; message n is found by skipping n terminators */
268     "no error\0"
269     "\\ at end of pattern\0"
270     "\\c at end of pattern\0"
271     "unrecognized character follows \\\0"
272     "numbers out of order in {} quantifier\0"
273 nigel 77 /* 5 */
274 ph10 240 "number too big in {} quantifier\0"
275     "missing terminating ] for character class\0"
276     "invalid escape sequence in character class\0"
277     "range out of order in character class\0"
278     "nothing to repeat\0"
279 nigel 77 /* 10 */
280 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
281     "internal error: unexpected repeat\0"
282 ph10 269 "unrecognized character after (? or (?-\0"
283 ph10 240 "POSIX named classes are supported only within a class\0"
284     "missing )\0"
285 nigel 77 /* 15 */
286 ph10 240 "reference to non-existent subpattern\0"
287     "erroffset passed as NULL\0"
288     "unknown option bit(s) set\0"
289     "missing ) after comment\0"
290     "parentheses nested too deeply\0" /** DEAD **/
291 nigel 77 /* 20 */
292 ph10 240 "regular expression is too large\0"
293     "failed to get memory\0"
294     "unmatched parentheses\0"
295     "internal error: code overflow\0"
296     "unrecognized character after (?<\0"
297 nigel 77 /* 25 */
298 ph10 240 "lookbehind assertion is not fixed length\0"
299     "malformed number or name after (?(\0"
300     "conditional group contains more than two branches\0"
301     "assertion expected after (?(\0"
302     "(?R or (?[+-]digits must be followed by )\0"
303 nigel 77 /* 30 */
304 ph10 240 "unknown POSIX class name\0"
305     "POSIX collating elements are not supported\0"
306     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307     "spare error\0" /** DEAD **/
308     "character value in \\x{...} sequence is too large\0"
309 nigel 77 /* 35 */
310 ph10 240 "invalid condition (?(0)\0"
311     "\\C not allowed in lookbehind assertion\0"
312     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313     "number after (?C is > 255\0"
314     "closing ) for (?C expected\0"
315 nigel 77 /* 40 */
316 ph10 240 "recursive call could loop indefinitely\0"
317     "unrecognized character after (?P\0"
318     "syntax error in subpattern name (missing terminator)\0"
319     "two named subpatterns have the same name\0"
320     "invalid UTF-8 string\0"
321 nigel 77 /* 45 */
322 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
323     "malformed \\P or \\p sequence\0"
324     "unknown property name after \\P or \\p\0"
325     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327 nigel 91 /* 50 */
328 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
329     "octal value is greater than \\377 (not in UTF-8 mode)\0"
330     "internal error: overran compiling workspace\0"
331     "internal error: previously-checked referenced subpattern not found\0"
332     "DEFINE group contains more than one branch\0"
333 nigel 93 /* 55 */
334 ph10 240 "repeating a DEFINE group is not allowed\0"
335     "inconsistent NEWLINE options\0"
336 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337     "a numbered reference must not be zero\0"
338 ph10 240 "(*VERB) with an argument is not supported\0"
339 ph10 211 /* 60 */
340 ph10 240 "(*VERB) not recognized\0"
341 ph10 268 "number is too big\0"
342 ph10 272 "subpattern name expected\0"
343 ph10 336 "digit expected after (?+\0"
344 ph10 345 "] is an invalid data character in JavaScript compatibility mode"; /* 64: last message, ended only by the literal's implicit NUL */
345 nigel 77
346    
347     /* Table to identify digits and hex digits. This is used when compiling
348     patterns. Note that the tables in chartables are dependent on the locale, and
349     may mark arbitrary characters as digits - but the PCRE compiling code expects
350     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351     a private table here. It costs 256 bytes, but it is a lot faster than doing
352     character value tests (at least in some simple cases I timed), and in some
353     applications one wants PCRE to compile efficiently as well as match
354     efficiently.
355    
356     For convenience, we use the same bit definitions as in chartables:
357    
358     0x04 decimal digit
359     0x08 hexadecimal digit
360    
361     Then we can use ctype_digit and ctype_xdigit in the code. */
362    
363 ph10 392 #ifndef EBCDIC
364 ph10 391
365 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 ph10 391 UTF-8 mode. */
367    
368 nigel 77 static const unsigned char digitab[] = /* indexed by character code; 0x04 = decimal digit, 0x08 = hex digit */
369     {
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
374     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
375     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
376     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
377     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
378     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
379     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
382     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402    
403 ph10 392 #else
404 ph10 391
405     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406    
407 nigel 77 static const unsigned char digitab[] = /* indexed by EBCDIC character code; 0x04 = decimal digit, 0x08 = hex digit */
408     {
409     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
410     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
413     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
414     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
415     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
416     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
417     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
418     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
419     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
420 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
421 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
423     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
425     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
427     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
431     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
432     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
433     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
439     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
440     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
441    
442     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ /* indexed by EBCDIC character code; check_escape() tests the 0x0E bits to spot alphanumerics */
443     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
444     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
445     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
447     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
451     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
452     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
454 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
455 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
456     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
459     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
460     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
461     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
462     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
463     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
464     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
465     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
466     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
468     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
469     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
470     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
471     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
472     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
473     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
474     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
475     #endif
476    
477    
478     /* Forward declaration to allow mutual recursion */
479    
480     static BOOL
481 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
483 nigel 77
484    
485    
486     /*************************************************
487 ph10 240 * Find an error text *
488     *************************************************/
489    
490 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
491     some of the text is of unknown length, we can't use a table of offsets.
492     Instead, just count through the strings. This is not a performance issue
493 ph10 240 because it happens only when there has been a compilation error.
494    
495     Argument: the error number
496     Returns: pointer to the error string
497     */
498    
499     static const char *
500     find_error_text(int n)
501     {
502     const char *s = error_texts;
503 ph10 369 for (; n > 0; n--) while (*s++ != 0) {};
504 ph10 240 return s;
505     }
506    
507    
508     /*************************************************
509 nigel 77 * Handle escapes *
510     *************************************************/
511    
512     /* This function is called when a \ has been encountered. It either returns a
513     positive value for a simple escape such as \n, or a negative value which
514 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
515     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517     ptr is pointing at the \. On exit, it is on the final character of the escape
518     sequence.
519 nigel 77
520     Arguments:
521     ptrptr points to the pattern position pointer
522     errorcodeptr points to the errorcode variable
523     bracount number of previous extracting brackets
524     options the options bits
525     isclass TRUE if inside a character class
526    
527     Returns: zero or positive => a data character
528     negative => a special escape sequence
529 ph10 213 on error, errorcodeptr is set
530 nigel 77 */
531    
532     static int
533     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534     int options, BOOL isclass)
535     {
536 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
537     const uschar *ptr = *ptrptr + 1;
538 nigel 77 int c, i;
539    
540 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
541     ptr--; /* Set pointer back to the last byte */
542    
543 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
544    
545     if (c == 0) *errorcodeptr = ERR1;
546    
547 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548     in a table. A non-zero result is something that can be returned immediately.
549 nigel 77 Otherwise further processing may be required. */
550    
551 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
552     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
553     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554 nigel 77
555 ph10 97 #else /* EBCDIC coding */
556 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
557 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
558     #endif
559    
560     /* Escapes that need further processing, or are illegal. */
561    
562     else
563     {
564     const uschar *oldptr;
565 nigel 93 BOOL braced, negated;
566    
567 nigel 77 switch (c)
568     {
569     /* A number of Perl escapes are not handled by PCRE. We give an explicit
570     error. */
571    
572 ph10 391 case CHAR_l:
573     case CHAR_L:
574     case CHAR_N:
575     case CHAR_u:
576     case CHAR_U:
577 nigel 77 *errorcodeptr = ERR37;
578     break;
579    
580 ph10 333 /* \g must be followed by one of a number of specific things:
581 ph10 345
582 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
583     backreference. If negative, it is a relative backreference. This is a Perl
584     5.10 feature.
585 ph10 345
586 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587     is part of Perl's movement towards a unified syntax for back references. As
588     this is synonymous with \k{name}, we fudge it up by pretending it really
589     was \k.
590 ph10 345
591     (3) For Oniguruma compatibility we also support \g followed by a name or a
592     number either in angle brackets or in single quotes. However, these are
593     (possibly recursive) subroutine calls, _not_ backreferences. Just return
594 ph10 333 the -ESC_g code (cf \k). */
595 nigel 93
596 ph10 391 case CHAR_g:
597     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598 ph10 333 {
599     c = -ESC_g;
600 ph10 345 break;
601     }
602 ph10 333
603     /* Handle the Perl-compatible cases */
604 ph10 345
605 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606 nigel 93 {
607 ph10 171 const uschar *p;
608 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611 ph10 171 {
612     c = -ESC_k;
613     break;
614 ph10 172 }
615 nigel 93 braced = TRUE;
616     ptr++;
617     }
618     else braced = FALSE;
619    
620 ph10 391 if (ptr[1] == CHAR_MINUS)
621 nigel 93 {
622     negated = TRUE;
623     ptr++;
624     }
625     else negated = FALSE;
626    
627     c = 0;
628     while ((digitab[ptr[1]] & ctype_digit) != 0)
629 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
630 ph10 220
631 ph10 333 if (c < 0) /* Integer overflow */
632 ph10 213 {
633     *errorcodeptr = ERR61;
634     break;
635 ph10 220 }
636 ph10 345
637 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638 nigel 93 {
639     *errorcodeptr = ERR57;
640 ph10 213 break;
641 nigel 93 }
642 ph10 345
643 ph10 333 if (c == 0)
644     {
645     *errorcodeptr = ERR58;
646     break;
647 ph10 345 }
648 nigel 93
649     if (negated)
650     {
651     if (c > bracount)
652     {
653     *errorcodeptr = ERR15;
654 ph10 213 break;
655 nigel 93 }
656     c = bracount - (c - 1);
657     }
658    
659     c = -(ESC_REF + c);
660     break;
661    
662 nigel 77 /* The handling of escape sequences consisting of a string of digits
663     starting with one that is not zero is not straightforward. By experiment,
664     the way Perl works seems to be as follows:
665    
666     Outside a character class, the digits are read as a decimal number. If the
667     number is less than 10, or if there are that many previous extracting
668     left brackets, then it is a back reference. Otherwise, up to three octal
669     digits are read to form an escaped byte. Thus \123 is likely to be octal
670     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671     value is greater than 377, the least significant 8 bits are taken. Inside a
672     character class, \ followed by a digit is always an octal number. */
673    
674 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676 nigel 77
677     if (!isclass)
678     {
679     oldptr = ptr;
680 ph10 391 c -= CHAR_0;
681 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
682 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
683 ph10 333 if (c < 0) /* Integer overflow */
684 ph10 213 {
685     *errorcodeptr = ERR61;
686 ph10 220 break;
687     }
688 nigel 77 if (c < 10 || c <= bracount)
689     {
690     c = -(ESC_REF + c);
691     break;
692     }
693     ptr = oldptr; /* Put the pointer back and fall through */
694     }
695    
696     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697     generates a binary zero byte and treats the digit as a following literal.
698     Thus we have to pull back the pointer by one. */
699    
700 ph10 391 if ((c = *ptr) >= CHAR_8)
701 nigel 77 {
702     ptr--;
703     c = 0;
704     break;
705     }
706    
707     /* \0 always starts an octal number, but we may drop through to here with a
708 nigel 91 larger first octal digit. The original code used just to take the least
709     significant 8 bits of octal numbers (I think this is what early Perls used
710     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711     than 3 octal digits. */
712 nigel 77
713 ph10 391 case CHAR_0:
714     c -= CHAR_0;
715     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716     c = c * 8 + *(++ptr) - CHAR_0;
717 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
718 nigel 77 break;
719    
720 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
721     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722     treated as a data character. */
723 nigel 77
724 ph10 391 case CHAR_x:
725     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726 nigel 77 {
727     const uschar *pt = ptr + 2;
728 nigel 87 int count = 0;
729    
730 nigel 77 c = 0;
731     while ((digitab[*pt] & ctype_xdigit) != 0)
732     {
733 nigel 87 register int cc = *pt++;
734 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
735 nigel 77 count++;
736 nigel 87
737 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
738     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
739     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 ph10 97 #else /* EBCDIC coding */
741 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
742     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 nigel 77 #endif
744     }
745 nigel 87
746 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747 nigel 77 {
748 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749 nigel 77 ptr = pt;
750     break;
751     }
752 nigel 87
753 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
754     recognize this construct; fall through to the normal \x handling. */
755     }
756    
757 nigel 87 /* Read just a single-byte hex-defined char */
758 nigel 77
759     c = 0;
760     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761     {
762 ph10 391 int cc; /* Some compilers don't like */
763     cc = *(++ptr); /* ++ in initializers */
764     #ifndef EBCDIC /* ASCII/UTF-8 coding */
765     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
766     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 ph10 97 #else /* EBCDIC coding */
768 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
769     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 nigel 77 #endif
771     }
772     break;
773    
774 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775     This coding is ASCII-specific, but then the whole concept of \cx is
776     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777 nigel 77
778 ph10 391 case CHAR_c:
779 nigel 77 c = *(++ptr);
780     if (c == 0)
781     {
782     *errorcodeptr = ERR2;
783 ph10 213 break;
784 nigel 77 }
785    
786 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
787     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788 nigel 77 c ^= 0x40;
789 ph10 97 #else /* EBCDIC coding */
790 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
791 nigel 77 c ^= 0xC0;
792     #endif
793     break;
794    
795     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
797     otherwise, for Perl compatibility, it is a literal. This code looks a bit
798     odd, but there used to be some cases other than the default, and there may
799     be again in future, so I haven't "optimized" it. */
800 nigel 77
801     default:
802     if ((options & PCRE_EXTRA) != 0) switch(c)
803     {
804     default:
805     *errorcodeptr = ERR3;
806     break;
807     }
808     break;
809     }
810     }
811    
812     *ptrptr = ptr;
813     return c;
814     }
815    
816    
817    
818     #ifdef SUPPORT_UCP
819     /*************************************************
820     * Handle \P and \p *
821     *************************************************/
822    
823     /* This function is called after \P or \p has been encountered, provided that
824     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825     pointing at the P or p. On exit, it is pointing at the final character of the
826     escape sequence.
827    
828     Argument:
829     ptrptr points to the pattern position pointer
830     negptr points to a boolean that is set TRUE for negation else FALSE
831 nigel 87 dptr points to an int that is set to the detailed property value
832 nigel 77 errorcodeptr points to the error code variable
833    
834 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
835 nigel 77 */
836    
837     static int
838 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 nigel 77 {
840     int c, i, bot, top;
841     const uschar *ptr = *ptrptr;
842 nigel 87 char name[32];
843 nigel 77
844     c = *(++ptr);
845     if (c == 0) goto ERROR_RETURN;
846    
847     *negptr = FALSE;
848    
849 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850     negation. */
851 nigel 77
852 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
853 nigel 77 {
854 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855 nigel 77 {
856     *negptr = TRUE;
857     ptr++;
858     }
859 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
860 nigel 77 {
861     c = *(++ptr);
862     if (c == 0) goto ERROR_RETURN;
863 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864 nigel 77 name[i] = c;
865     }
866 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867 nigel 77 name[i] = 0;
868     }
869    
870     /* Otherwise there is just one following character */
871    
872     else
873     {
874     name[0] = c;
875     name[1] = 0;
876     }
877    
878     *ptrptr = ptr;
879    
880     /* Search for a recognized property name using binary chop */
881    
882     bot = 0;
883     top = _pcre_utt_size;
884    
885     while (bot < top)
886     {
887 nigel 87 i = (bot + top) >> 1;
888 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889 nigel 87 if (c == 0)
890     {
891     *dptr = _pcre_utt[i].value;
892     return _pcre_utt[i].type;
893     }
894 nigel 77 if (c > 0) bot = i + 1; else top = i;
895     }
896    
897     *errorcodeptr = ERR47;
898     *ptrptr = ptr;
899     return -1;
900    
901     ERROR_RETURN:
902     *errorcodeptr = ERR46;
903     *ptrptr = ptr;
904     return -1;
905     }
906     #endif
907    
908    
909    
910    
911     /*************************************************
912     * Check for counted repeat *
913     *************************************************/
914    
915     /* This function is called when a '{' is encountered in a place where it might
916     start a quantifier. It looks ahead to see if it really is a quantifier or not.
917     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918     where the ddds are digits.
919    
920     Arguments:
921     p pointer to the first char after '{'
922    
923     Returns: TRUE or FALSE
924     */
925    
926     static BOOL
927     is_counted_repeat(const uschar *p)
928     {
929     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930     while ((digitab[*p] & ctype_digit) != 0) p++;
931 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932 nigel 77
933 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
934     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 nigel 77
936     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937     while ((digitab[*p] & ctype_digit) != 0) p++;
938    
939 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 nigel 77 }
941    
942    
943    
944     /*************************************************
945     * Read repeat counts *
946     *************************************************/
947    
948     /* Read an item of the form {n,m} and return the values. This is called only
949     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950     so the syntax is guaranteed to be correct, but we need to check the values.
951    
952     Arguments:
953     p pointer to first char after '{'
954     minp pointer to int for min
955     maxp pointer to int for max
956     returned as -1 if no max
957     errorcodeptr points to error code variable
958    
959     Returns: pointer to '}' on success;
960     current ptr on error, with errorcodeptr set non-zero
961     */
962    
963     static const uschar *
964     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965     {
966     int min = 0;
967     int max = -1;
968    
969 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
970     an integer overflow. */
971    
972 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 nigel 81 if (min < 0 || min > 65535)
974     {
975     *errorcodeptr = ERR5;
976     return p;
977     }
978 nigel 77
979 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
980     Also, max must not be less than min. */
981    
982 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983 nigel 77 {
984 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985 nigel 77 {
986     max = 0;
987 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988 nigel 81 if (max < 0 || max > 65535)
989     {
990     *errorcodeptr = ERR5;
991     return p;
992     }
993 nigel 77 if (max < min)
994     {
995     *errorcodeptr = ERR4;
996     return p;
997     }
998     }
999     }
1000    
1001 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1002     '}'. */
1003 nigel 77
1004 nigel 81 *minp = min;
1005     *maxp = max;
1006 nigel 77 return p;
1007     }
1008    
1009    
1010    
1011     /*************************************************
1012 ph10 408 * Subroutine for finding forward reference *
1013 nigel 91 *************************************************/
1014    
1015 ph10 408 /* This recursive function is called only from find_parens() below. The
1016     top-level call starts at the beginning of the pattern. All other calls must
1017     start at a parenthesis. It scans along a pattern's text looking for capturing
1018 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1019     name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021     encountered, the name will be terminated by '>' because that is checked in the
1022 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1023 ph10 408 capturing group numbers - the (?| feature.
1024 nigel 91
1025     Arguments:
1026 ph10 408 ptrptr address of the current character pointer (updated)
1027 ph10 345 cd compile background data
1028 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1029     lorn name length, or subpattern number if name is NULL
1030     xmode TRUE if we are in /x mode
1031 ph10 411 count pointer to the current capturing subpattern number (updated)
1032 nigel 91
1033     Returns: the number of the named subpattern, or -1 if not found
1034     */
1035    
1036     static int
1037 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038     BOOL xmode, int *count)
1039 nigel 91 {
1040 ph10 408 uschar *ptr = *ptrptr;
1041     int start_count = *count;
1042     int hwm_count = start_count;
1043     BOOL dup_parens = FALSE;
1044 nigel 93
1045 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1046 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1047    
1048     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049     {
1050     if (ptr[1] == CHAR_QUESTION_MARK &&
1051 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1052 ph10 408 {
1053     ptr += 3;
1054 ph10 411 dup_parens = TRUE;
1055     }
1056 ph10 408
1057     /* Handle a normal, unnamed capturing parenthesis */
1058 ph10 411
1059 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060     {
1061     *count += 1;
1062     if (name == NULL && *count == lorn) return *count;
1063 ph10 411 ptr++;
1064 ph10 408 }
1065    
1066     /* Handle a condition. If it is an assertion, just carry on so that it
1067     is processed as normal. If not, skip to the closing parenthesis of the
1068 ph10 411 condition (there can't be any nested parens. */
1069    
1070 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071     {
1072 ph10 411 ptr += 2;
1073 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1074     {
1075     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076 ph10 411 if (*ptr != 0) ptr++;
1077 ph10 408 }
1078 ph10 411 }
1079    
1080 ph10 408 /* We have either (? or (* and not a condition */
1081    
1082     else
1083 ph10 411 {
1084 ph10 408 ptr += 2;
1085     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1086    
1087     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088 ph10 411
1089 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091     {
1092     int term;
1093     const uschar *thisname;
1094     *count += 1;
1095     if (name == NULL && *count == lorn) return *count;
1096     term = *ptr++;
1097     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098     thisname = ptr;
1099     while (*ptr != term) ptr++;
1100     if (name != NULL && lorn == ptr - thisname &&
1101     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102     return *count;
1103 ph10 411 }
1104 ph10 408 }
1105 ph10 411 }
1106 ph10 408
1107 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1108 ph10 408 bars. */
1109    
1110 nigel 91 for (; *ptr != 0; ptr++)
1111     {
1112 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1113    
1114 ph10 391 if (*ptr == CHAR_BACKSLASH)
1115 nigel 93 {
1116 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1117 ph10 391 if (*ptr == CHAR_Q) for (;;)
1118 nigel 93 {
1119 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1120 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1121 ph10 391 if (*(++ptr) == CHAR_E) break;
1122 nigel 93 }
1123     continue;
1124     }
1125    
1126 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1127     are handled for real. If the first character is '^', skip it. Also, if the
1128     first few characters (either before or after ^) are \Q\E or \E we skip them
1129 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1130 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1131 nigel 93
1132 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1133 nigel 93 {
1134 ph10 340 BOOL negate_class = FALSE;
1135     for (;;)
1136     {
1137     int c = *(++ptr);
1138 ph10 391 if (c == CHAR_BACKSLASH)
1139 ph10 340 {
1140 ph10 392 if (ptr[1] == CHAR_E)
1141 ph10 391 ptr++;
1142 ph10 392 else if (strncmp((const char *)ptr+1,
1143     STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144 ph10 391 ptr += 3;
1145 ph10 392 else
1146 ph10 391 break;
1147 ph10 340 }
1148 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1149 ph10 340 negate_class = TRUE;
1150     else break;
1151     }
1152    
1153     /* If the next character is ']', it is a data character that must be
1154 ph10 341 skipped, except in JavaScript compatibility mode. */
1155 ph10 345
1156 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1157 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1158 ph10 345 ptr++;
1159    
1160 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1161 nigel 93 {
1162 ph10 220 if (*ptr == 0) return -1;
1163 ph10 391 if (*ptr == CHAR_BACKSLASH)
1164 nigel 93 {
1165 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1166 ph10 391 if (*ptr == CHAR_Q) for (;;)
1167 nigel 93 {
1168 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1169 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1170 ph10 391 if (*(++ptr) == CHAR_E) break;
1171 nigel 93 }
1172     continue;
1173     }
1174     }
1175     continue;
1176     }
1177    
1178     /* Skip comments in /x mode */
1179    
1180 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1181 nigel 93 {
1182 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1183 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1184 nigel 93 continue;
1185     }
1186    
1187 ph10 408 /* Check for the special metacharacters */
1188 ph10 411
1189 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1190 nigel 93 {
1191 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1192     if (rc > 0) return rc;
1193     if (*ptr == 0) goto FAIL_EXIT;
1194 nigel 93 }
1195 ph10 411
1196 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1197     {
1198     if (dup_parens && *count < hwm_count) *count = hwm_count;
1199     *ptrptr = ptr;
1200     return -1;
1201     }
1202 ph10 411
1203     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1204 ph10 408 {
1205     if (*count > hwm_count) hwm_count = *count;
1206     *count = start_count;
1207 ph10 411 }
1208 ph10 408 }
1209 nigel 93
1210 ph10 408 FAIL_EXIT:
1211     *ptrptr = ptr;
1212     return -1;
1213     }
1214 nigel 93
1215    
1216    
1217    
1218 ph10 408 /*************************************************
1219     * Find forward referenced subpattern *
1220     *************************************************/
1221 nigel 93
1222 ph10 408 /* This function scans along a pattern's text looking for capturing
1223     subpatterns, and counting them. If it finds a named pattern that matches the
1224     name it is given, it returns its number. Alternatively, if the name is NULL, it
1225     returns when it reaches a given numbered subpattern. This is used for forward
1226     references to subpatterns. We used to be able to start this scan from the
1227     current compiling point, using the current count value from cd->bracount, and
1228     do it all in a single loop, but the addition of the possibility of duplicate
1229     subpattern numbers means that we have to scan from the very start, in order to
1230     take account of such duplicates, and to use a recursive function to keep track
1231     of the different types of group.
1232    
1233     Arguments:
1234     cd compile background data
1235     name name to seek, or NULL if seeking a numbered subpattern
1236     lorn name length, or subpattern number if name is NULL
1237     xmode TRUE if we are in /x mode
1238    
1239     Returns: the number of the found subpattern, or -1 if not found
1240     */
1241    
1242     static int
1243     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1244     {
1245     uschar *ptr = (uschar *)cd->start_pattern;
1246     int count = 0;
1247     int rc;
1248    
1249     /* If the pattern does not start with an opening parenthesis, the first call
1250     to find_parens_sub() will scan right to the end (if necessary). However, if it
1251     does start with a parenthesis, find_parens_sub() will return when it hits the
1252     matching closing parens. That is why we have to have a loop. */
1253    
1254 ph10 411 for (;;)
1255     {
1256 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1257 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1258     }
1259    
1260 ph10 408 return rc;
1261 nigel 91 }
1262    
1263    
1264    
1265 ph10 408
1266 nigel 91 /*************************************************
1267 nigel 77 * Find first significant op code *
1268     *************************************************/
1269    
1270     /* This is called by several functions that scan a compiled expression looking
1271     for a fixed first character, or an anchoring op code etc. It skips over things
1272     that do not influence this. For some calls, a change of option is important.
1273     For some calls, it makes sense to skip negative forward and all backward
1274     assertions, and also the \b assertion; for others it does not.
1275    
1276     Arguments:
1277     code pointer to the start of the group
1278     options pointer to external options
1279     optbit the option bit whose changing is significant, or
1280     zero if none are
1281     skipassert TRUE if certain assertions are to be skipped
1282    
1283     Returns: pointer to the first significant opcode
1284     */
1285    
1286     static const uschar*
1287     first_significant_code(const uschar *code, int *options, int optbit,
1288     BOOL skipassert)
1289     {
1290     for (;;)
1291     {
1292     switch ((int)*code)
1293     {
1294     case OP_OPT:
1295     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1296     *options = (int)code[1];
1297     code += 2;
1298     break;
1299    
1300     case OP_ASSERT_NOT:
1301     case OP_ASSERTBACK:
1302     case OP_ASSERTBACK_NOT:
1303     if (!skipassert) return code;
1304     do code += GET(code, 1); while (*code == OP_ALT);
1305     code += _pcre_OP_lengths[*code];
1306     break;
1307    
1308     case OP_WORD_BOUNDARY:
1309     case OP_NOT_WORD_BOUNDARY:
1310     if (!skipassert) return code;
1311     /* Fall through */
1312    
1313     case OP_CALLOUT:
1314     case OP_CREF:
1315 nigel 93 case OP_RREF:
1316     case OP_DEF:
1317 nigel 77 code += _pcre_OP_lengths[*code];
1318     break;
1319    
1320     default:
1321     return code;
1322     }
1323     }
1324     /* Control never reaches here */
1325     }
1326    
1327    
1328    
1329    
1330     /*************************************************
1331     * Find the fixed length of a pattern *
1332     *************************************************/
1333    
1334     /* Scan a pattern and compute the fixed length of subject that will match it,
1335     if the length is fixed. This is needed for dealing with backward assertions.
1336     In UTF8 mode, the result is in characters rather than bytes.
1337    
1338     Arguments:
1339     code points to the start of the pattern (the bracket)
1340     options the compiling options
1341    
1342     Returns: the fixed length, or -1 if there is no fixed length,
1343     or -2 if \C was encountered
1344     */
1345    
1346     static int
1347     find_fixedlength(uschar *code, int options)
1348     {
1349     int length = -1;
1350    
1351     register int branchlength = 0;
1352     register uschar *cc = code + 1 + LINK_SIZE;
1353    
1354     /* Scan along the opcodes for this branch. If we get to the end of the
1355     branch, check the length against that of the other branches. */
1356    
1357     for (;;)
1358     {
1359     int d;
1360     register int op = *cc;
1361     switch (op)
1362     {
1363 nigel 93 case OP_CBRA:
1364 nigel 77 case OP_BRA:
1365     case OP_ONCE:
1366     case OP_COND:
1367 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1368 nigel 77 if (d < 0) return d;
1369     branchlength += d;
1370     do cc += GET(cc, 1); while (*cc == OP_ALT);
1371     cc += 1 + LINK_SIZE;
1372     break;
1373    
1374     /* Reached end of a branch; if it's a ket it is the end of a nested
1375     call. If it's ALT it is an alternation in a nested call. If it is
1376     END it's the end of the outer call. All can be handled by the same code. */
1377    
1378     case OP_ALT:
1379     case OP_KET:
1380     case OP_KETRMAX:
1381     case OP_KETRMIN:
1382     case OP_END:
1383     if (length < 0) length = branchlength;
1384     else if (length != branchlength) return -1;
1385     if (*cc != OP_ALT) return length;
1386     cc += 1 + LINK_SIZE;
1387     branchlength = 0;
1388     break;
1389    
1390     /* Skip over assertive subpatterns */
1391    
1392     case OP_ASSERT:
1393     case OP_ASSERT_NOT:
1394     case OP_ASSERTBACK:
1395     case OP_ASSERTBACK_NOT:
1396     do cc += GET(cc, 1); while (*cc == OP_ALT);
1397     /* Fall through */
1398    
1399     /* Skip over things that don't match chars */
1400    
1401     case OP_REVERSE:
1402     case OP_CREF:
1403 nigel 93 case OP_RREF:
1404     case OP_DEF:
1405 nigel 77 case OP_OPT:
1406     case OP_CALLOUT:
1407     case OP_SOD:
1408     case OP_SOM:
1409     case OP_EOD:
1410     case OP_EODN:
1411     case OP_CIRC:
1412     case OP_DOLL:
1413     case OP_NOT_WORD_BOUNDARY:
1414     case OP_WORD_BOUNDARY:
1415     cc += _pcre_OP_lengths[*cc];
1416     break;
1417    
1418     /* Handle literal characters */
1419    
1420     case OP_CHAR:
1421     case OP_CHARNC:
1422 nigel 91 case OP_NOT:
1423 nigel 77 branchlength++;
1424     cc += 2;
1425     #ifdef SUPPORT_UTF8
1426     if ((options & PCRE_UTF8) != 0)
1427     {
1428     while ((*cc & 0xc0) == 0x80) cc++;
1429     }
1430     #endif
1431     break;
1432    
1433     /* Handle exact repetitions. The count is already in characters, but we
1434     need to skip over a multibyte character in UTF8 mode. */
1435    
1436     case OP_EXACT:
1437     branchlength += GET2(cc,1);
1438     cc += 4;
1439     #ifdef SUPPORT_UTF8
1440     if ((options & PCRE_UTF8) != 0)
1441     {
1442     while((*cc & 0x80) == 0x80) cc++;
1443     }
1444     #endif
1445     break;
1446    
1447     case OP_TYPEEXACT:
1448     branchlength += GET2(cc,1);
1449 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1450 nigel 77 cc += 4;
1451     break;
1452    
1453     /* Handle single-char matchers */
1454    
1455     case OP_PROP:
1456     case OP_NOTPROP:
1457 nigel 87 cc += 2;
1458 nigel 77 /* Fall through */
1459    
1460     case OP_NOT_DIGIT:
1461     case OP_DIGIT:
1462     case OP_NOT_WHITESPACE:
1463     case OP_WHITESPACE:
1464     case OP_NOT_WORDCHAR:
1465     case OP_WORDCHAR:
1466     case OP_ANY:
1467 ph10 342 case OP_ALLANY:
1468 nigel 77 branchlength++;
1469     cc++;
1470     break;
1471    
1472     /* The single-byte matcher isn't allowed */
1473    
1474     case OP_ANYBYTE:
1475     return -2;
1476    
1477     /* Check a class for variable quantification */
1478    
1479     #ifdef SUPPORT_UTF8
1480     case OP_XCLASS:
1481     cc += GET(cc, 1) - 33;
1482     /* Fall through */
1483     #endif
1484    
1485     case OP_CLASS:
1486     case OP_NCLASS:
1487     cc += 33;
1488    
1489     switch (*cc)
1490     {
1491     case OP_CRSTAR:
1492     case OP_CRMINSTAR:
1493     case OP_CRQUERY:
1494     case OP_CRMINQUERY:
1495     return -1;
1496    
1497     case OP_CRRANGE:
1498     case OP_CRMINRANGE:
1499     if (GET2(cc,1) != GET2(cc,3)) return -1;
1500     branchlength += GET2(cc,1);
1501     cc += 5;
1502     break;
1503    
1504     default:
1505     branchlength++;
1506     }
1507     break;
1508    
1509     /* Anything else is variable length */
1510    
1511     default:
1512     return -1;
1513     }
1514     }
1515     /* Control never gets here */
1516     }
1517    
1518    
1519    
1520    
1521     /*************************************************
1522     * Scan compiled regex for numbered bracket *
1523     *************************************************/
1524    
1525     /* This little function scans through a compiled pattern until it finds a
1526     capturing bracket with the given number.
1527    
1528     Arguments:
1529     code points to start of expression
1530     utf8 TRUE in UTF-8 mode
1531     number the required bracket number
1532    
1533     Returns: pointer to the opcode for the bracket, or NULL if not found
1534     */
1535    
1536     static const uschar *
1537     find_bracket(const uschar *code, BOOL utf8, int number)
1538     {
1539     for (;;)
1540     {
1541     register int c = *code;
1542     if (c == OP_END) return NULL;
1543 nigel 91
1544     /* XCLASS is used for classes that cannot be represented just by a bit
1545     map. This includes negated single high-valued characters. The length in
1546     the table is zero; the actual length is stored in the compiled code. */
1547    
1548     if (c == OP_XCLASS) code += GET(code, 1);
1549    
1550 nigel 93 /* Handle capturing bracket */
1551 nigel 91
1552 nigel 93 else if (c == OP_CBRA)
1553 nigel 77 {
1554 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1555 nigel 77 if (n == number) return (uschar *)code;
1556 nigel 93 code += _pcre_OP_lengths[c];
1557 nigel 77 }
1558 nigel 91
1559 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1560     repeated character types, we have to test for \p and \P, which have an extra
1561 ph10 218 two bytes of parameters. */
1562 nigel 91
1563 nigel 77 else
1564     {
1565 ph10 218 switch(c)
1566     {
1567     case OP_TYPESTAR:
1568     case OP_TYPEMINSTAR:
1569     case OP_TYPEPLUS:
1570     case OP_TYPEMINPLUS:
1571     case OP_TYPEQUERY:
1572     case OP_TYPEMINQUERY:
1573     case OP_TYPEPOSSTAR:
1574     case OP_TYPEPOSPLUS:
1575     case OP_TYPEPOSQUERY:
1576     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1577 ph10 220 break;
1578 ph10 221
1579     case OP_TYPEUPTO:
1580     case OP_TYPEMINUPTO:
1581     case OP_TYPEEXACT:
1582     case OP_TYPEPOSUPTO:
1583     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1584     break;
1585 ph10 220 }
1586    
1587 ph10 218 /* Add in the fixed length from the table */
1588 ph10 220
1589 nigel 77 code += _pcre_OP_lengths[c];
1590 ph10 220
1591 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1592     a multi-byte character. The length in the table is a minimum, so we have to
1593     arrange to skip the extra bytes. */
1594 ph10 220
1595 ph10 107 #ifdef SUPPORT_UTF8
1596 nigel 77 if (utf8) switch(c)
1597     {
1598     case OP_CHAR:
1599     case OP_CHARNC:
1600     case OP_EXACT:
1601     case OP_UPTO:
1602     case OP_MINUPTO:
1603 nigel 93 case OP_POSUPTO:
1604 nigel 77 case OP_STAR:
1605     case OP_MINSTAR:
1606 nigel 93 case OP_POSSTAR:
1607 nigel 77 case OP_PLUS:
1608     case OP_MINPLUS:
1609 nigel 93 case OP_POSPLUS:
1610 nigel 77 case OP_QUERY:
1611     case OP_MINQUERY:
1612 nigel 93 case OP_POSQUERY:
1613     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1614 nigel 77 break;
1615     }
1616 ph10 369 #else
1617     (void)(utf8); /* Keep compiler happy by referencing function argument */
1618 ph10 111 #endif
1619 nigel 77 }
1620     }
1621     }
1622    
1623    
1624    
1625     /*************************************************
1626     * Scan compiled regex for recursion reference *
1627     *************************************************/
1628    
1629     /* This little function scans through a compiled pattern until it finds an
1630     instance of OP_RECURSE.
1631    
1632     Arguments:
1633     code points to start of expression
1634     utf8 TRUE in UTF-8 mode
1635    
1636     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1637     */
1638    
1639     static const uschar *
1640     find_recurse(const uschar *code, BOOL utf8)
1641     {
1642     for (;;)
1643     {
1644     register int c = *code;
1645     if (c == OP_END) return NULL;
1646 nigel 91 if (c == OP_RECURSE) return code;
1647 ph10 220
1648 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1649     map. This includes negated single high-valued characters. The length in
1650     the table is zero; the actual length is stored in the compiled code. */
1651    
1652     if (c == OP_XCLASS) code += GET(code, 1);
1653    
1654 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1655     repeated character types, we have to test for \p and \P, which have an extra
1656 ph10 218 two bytes of parameters. */
1657 nigel 91
1658 nigel 77 else
1659     {
1660 ph10 218 switch(c)
1661     {
1662     case OP_TYPESTAR:
1663     case OP_TYPEMINSTAR:
1664     case OP_TYPEPLUS:
1665     case OP_TYPEMINPLUS:
1666     case OP_TYPEQUERY:
1667     case OP_TYPEMINQUERY:
1668     case OP_TYPEPOSSTAR:
1669     case OP_TYPEPOSPLUS:
1670     case OP_TYPEPOSQUERY:
1671     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1672 ph10 220 break;
1673 ph10 221
1674     case OP_TYPEPOSUPTO:
1675     case OP_TYPEUPTO:
1676     case OP_TYPEMINUPTO:
1677     case OP_TYPEEXACT:
1678     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1679     break;
1680 ph10 220 }
1681    
1682 ph10 218 /* Add in the fixed length from the table */
1683    
1684 nigel 77 code += _pcre_OP_lengths[c];
1685 ph10 220
1686 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1687     by a multi-byte character. The length in the table is a minimum, so we have
1688     to arrange to skip the extra bytes. */
1689 ph10 220
1690 ph10 107 #ifdef SUPPORT_UTF8
1691 nigel 77 if (utf8) switch(c)
1692     {
1693     case OP_CHAR:
1694     case OP_CHARNC:
1695     case OP_EXACT:
1696     case OP_UPTO:
1697     case OP_MINUPTO:
1698 nigel 93 case OP_POSUPTO:
1699 nigel 77 case OP_STAR:
1700     case OP_MINSTAR:
1701 nigel 93 case OP_POSSTAR:
1702 nigel 77 case OP_PLUS:
1703     case OP_MINPLUS:
1704 nigel 93 case OP_POSPLUS:
1705 nigel 77 case OP_QUERY:
1706     case OP_MINQUERY:
1707 nigel 93 case OP_POSQUERY:
1708     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1709 nigel 77 break;
1710     }
1711 ph10 369 #else
1712     (void)(utf8); /* Keep compiler happy by referencing function argument */
1713 ph10 111 #endif
1714 nigel 77 }
1715     }
1716     }
1717    
1718    
1719    
1720     /*************************************************
1721     * Scan compiled branch for non-emptiness *
1722     *************************************************/
1723    
1724     /* This function scans through a branch of a compiled pattern to see whether it
1725 nigel 93 can match the empty string or not. It is called from could_be_empty()
1726     below and from compile_branch() when checking for an unlimited repeat of a
1727     group that can match nothing. Note that first_significant_code() skips over
1728 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1729     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1730     bracket whose current branch will already have been scanned.
1731 nigel 77
1732     Arguments:
1733     code points to start of search
1734     endcode points to where to stop
1735     utf8 TRUE if in UTF8 mode
1736    
1737     Returns: TRUE if what is matched could be empty
1738     */
1739    
1740     static BOOL
1741     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1742     {
1743     register int c;
1744 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1745 nigel 77 code < endcode;
1746     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1747     {
1748     const uschar *ccode;
1749    
1750     c = *code;
1751 ph10 286
1752     /* Skip over forward assertions; the other assertions are skipped by
1753 ph10 282 first_significant_code() with a TRUE final argument. */
1754 ph10 286
1755 ph10 282 if (c == OP_ASSERT)
1756 ph10 286 {
1757 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1758     c = *code;
1759     continue;
1760 ph10 286 }
1761 ph10 172
1762 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1763 nigel 77
1764 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1765 ph10 170 {
1766 ph10 172 code += _pcre_OP_lengths[c];
1767 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1768     c = *code;
1769     continue;
1770     }
1771    
1772     /* For other groups, scan the branches. */
1773 ph10 172
1774 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1775 nigel 77 {
1776     BOOL empty_branch;
1777     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1778 ph10 406
1779     /* If a conditional group has only one branch, there is a second, implied,
1780 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1781     Otherwise, scan the individual branches of the group. */
1782 ph10 406
1783 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1784 nigel 77 code += GET(code, 1);
1785 ph10 395 else
1786 ph10 406 {
1787 ph10 395 empty_branch = FALSE;
1788     do
1789     {
1790     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1791     empty_branch = TRUE;
1792     code += GET(code, 1);
1793     }
1794     while (*code == OP_ALT);
1795     if (!empty_branch) return FALSE; /* All branches are non-empty */
1796 nigel 77 }
1797 ph10 406
1798 ph10 172 c = *code;
1799 nigel 93 continue;
1800 nigel 77 }
1801    
1802 nigel 93 /* Handle the other opcodes */
1803    
1804     switch (c)
1805 nigel 77 {
1806 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1807     cannot be represented just by a bit map. This includes negated single
1808     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1809 ph10 220 actual length is stored in the compiled code, so we must update "code"
1810 ph10 216 here. */
1811 nigel 77
1812     #ifdef SUPPORT_UTF8
1813     case OP_XCLASS:
1814 ph10 216 ccode = code += GET(code, 1);
1815 nigel 77 goto CHECK_CLASS_REPEAT;
1816     #endif
1817    
1818     case OP_CLASS:
1819     case OP_NCLASS:
1820     ccode = code + 33;
1821    
1822     #ifdef SUPPORT_UTF8
1823     CHECK_CLASS_REPEAT:
1824     #endif
1825    
1826     switch (*ccode)
1827     {
1828     case OP_CRSTAR: /* These could be empty; continue */
1829     case OP_CRMINSTAR:
1830     case OP_CRQUERY:
1831     case OP_CRMINQUERY:
1832     break;
1833    
1834     default: /* Non-repeat => class must match */
1835     case OP_CRPLUS: /* These repeats aren't empty */
1836     case OP_CRMINPLUS:
1837     return FALSE;
1838    
1839     case OP_CRRANGE:
1840     case OP_CRMINRANGE:
1841     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1842     break;
1843     }
1844     break;
1845    
1846     /* Opcodes that must match a character */
1847    
1848     case OP_PROP:
1849     case OP_NOTPROP:
1850     case OP_EXTUNI:
1851     case OP_NOT_DIGIT:
1852     case OP_DIGIT:
1853     case OP_NOT_WHITESPACE:
1854     case OP_WHITESPACE:
1855     case OP_NOT_WORDCHAR:
1856     case OP_WORDCHAR:
1857     case OP_ANY:
1858 ph10 345 case OP_ALLANY:
1859 nigel 77 case OP_ANYBYTE:
1860     case OP_CHAR:
1861     case OP_CHARNC:
1862     case OP_NOT:
1863     case OP_PLUS:
1864     case OP_MINPLUS:
1865 nigel 93 case OP_POSPLUS:
1866 nigel 77 case OP_EXACT:
1867     case OP_NOTPLUS:
1868     case OP_NOTMINPLUS:
1869 nigel 93 case OP_NOTPOSPLUS:
1870 nigel 77 case OP_NOTEXACT:
1871     case OP_TYPEPLUS:
1872     case OP_TYPEMINPLUS:
1873 nigel 93 case OP_TYPEPOSPLUS:
1874 nigel 77 case OP_TYPEEXACT:
1875     return FALSE;
1876 ph10 227
1877     /* These are going to continue, as they may be empty, but we have to
1878     fudge the length for the \p and \P cases. */
1879    
1880 ph10 224 case OP_TYPESTAR:
1881     case OP_TYPEMINSTAR:
1882     case OP_TYPEPOSSTAR:
1883     case OP_TYPEQUERY:
1884     case OP_TYPEMINQUERY:
1885     case OP_TYPEPOSQUERY:
1886     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1887 ph10 227 break;
1888    
1889 ph10 224 /* Same for these */
1890 ph10 227
1891 ph10 224 case OP_TYPEUPTO:
1892     case OP_TYPEMINUPTO:
1893     case OP_TYPEPOSUPTO:
1894     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1895     break;
1896 nigel 77
1897     /* End of branch */
1898    
1899     case OP_KET:
1900     case OP_KETRMAX:
1901     case OP_KETRMIN:
1902     case OP_ALT:
1903     return TRUE;
1904    
1905 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1906     MINUPTO, and POSUPTO may be followed by a multibyte character */
1907 nigel 77
1908     #ifdef SUPPORT_UTF8
1909     case OP_STAR:
1910     case OP_MINSTAR:
1911 nigel 93 case OP_POSSTAR:
1912 nigel 77 case OP_QUERY:
1913     case OP_MINQUERY:
1914 nigel 93 case OP_POSQUERY:
1915 nigel 77 case OP_UPTO:
1916     case OP_MINUPTO:
1917 nigel 93 case OP_POSUPTO:
1918 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1919     break;
1920     #endif
1921     }
1922     }
1923    
1924     return TRUE;
1925     }
1926    
1927    
1928    
1929     /*************************************************
1930     * Scan compiled regex for non-emptiness *
1931     *************************************************/
1932    
1933     /* This function is called to check for left recursive calls. We want to check
1934     the current branch of the current pattern to see if it could match the empty
1935     string. If it could, we must look outwards for branches at other levels,
1936     stopping when we pass beyond the bracket which is the subject of the recursion.
1937    
1938     Arguments:
1939     code points to start of the recursion
1940     endcode points to where to stop (current RECURSE item)
1941     bcptr points to the chain of current (unclosed) branch starts
1942     utf8 TRUE if in UTF-8 mode
1943    
1944     Returns: TRUE if what is matched could be empty
1945     */
1946    
1947     static BOOL
1948     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1949     BOOL utf8)
1950     {
1951     while (bcptr != NULL && bcptr->current >= code)
1952     {
1953     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1954     bcptr = bcptr->outer;
1955     }
1956     return TRUE;
1957     }
1958    
1959    
1960    
1961     /*************************************************
1962     * Check for POSIX class syntax *
1963     *************************************************/
1964    
1965     /* This function is called when the sequence "[:" or "[." or "[=" is
1966 ph10 295 encountered in a character class. It checks whether this is followed by a
1967 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1968 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1969 nigel 77
1970 ph10 298 Originally, this function only recognized a sequence of letters between the
1971     terminators, but it seems that Perl recognizes any sequence of characters,
1972     though of course unknown POSIX names are subsequently rejected. Perl gives an
1973     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1974     didn't consider this to be a POSIX class. Likewise for [:1234:].
1975 ph10 295
1976 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1977     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1978     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1979     below handles the special case of \], but does not try to do any other escape
1980     processing. This makes it different from Perl for cases such as [:l\ower:]
1981 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1982 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1983 ph10 295 I think.
1984    
1985     Arguments:
1986 nigel 77 ptr pointer to the initial [
1987     endptr where to return the end pointer
1988    
1989     Returns: TRUE or FALSE
1990     */
1991    
1992     static BOOL
1993 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1994 nigel 77 {
1995     int terminator; /* Don't combine these lines; the Solaris cc */
1996     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1997 ph10 295 for (++ptr; *ptr != 0; ptr++)
1998 nigel 77 {
1999 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2000 ph10 298 {
2001 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2002     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2003 ph10 295 {
2004     *endptr = ptr;
2005     return TRUE;
2006 ph10 298 }
2007     }
2008     }
2009 nigel 77 return FALSE;
2010     }
2011    
2012    
2013    
2014    
2015     /*************************************************
2016     * Check POSIX class name *
2017     *************************************************/
2018    
2019     /* This function is called to check the name given in a POSIX-style class entry
2020     such as [:alnum:].
2021    
2022     Arguments:
2023     ptr points to the first letter
2024     len the length of the name
2025    
2026     Returns: a value representing the name, or -1 if unknown
2027     */
2028    
2029     static int
2030     check_posix_name(const uschar *ptr, int len)
2031     {
2032 ph10 240 const char *pn = posix_names;
2033 nigel 77 register int yield = 0;
2034     while (posix_name_lengths[yield] != 0)
2035     {
2036     if (len == posix_name_lengths[yield] &&
2037 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2038 ph10 243 pn += posix_name_lengths[yield] + 1;
2039 nigel 77 yield++;
2040     }
2041     return -1;
2042     }
2043    
2044    
2045     /*************************************************
2046     * Adjust OP_RECURSE items in repeated group *
2047     *************************************************/
2048    
2049     /* OP_RECURSE items contain an offset from the start of the regex to the group
2050     that is referenced. This means that groups can be replicated for fixed
2051     repetition simply by copying (because the recursion is allowed to refer to
2052     earlier groups that are outside the current group). However, when a group is
2053 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2054     inserted before it, after it has been compiled. This means that any OP_RECURSE
2055     items within it that refer to the group itself or any contained groups have to
2056     have their offsets adjusted. That is one of the jobs of this function. Before it
2057     is called, the partially compiled regex must be temporarily terminated with
2058     OP_END.
2059 nigel 77
2060 nigel 93 This function has been extended with the possibility of forward references for
2061     recursions and subroutine calls. It must also check the list of such references
2062     for the group we are dealing with. If it finds that one of the recursions in
2063     the current group is on this list, it adjusts the offset in the list, not the
2064     value in the reference (which is a group number).
2065    
2066 nigel 77 Arguments:
2067     group points to the start of the group
2068     adjust the amount by which the group is to be moved
2069     utf8 TRUE in UTF-8 mode
2070     cd contains pointers to tables etc.
2071 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2072 nigel 77
2073     Returns: nothing
2074     */
2075    
2076     static void
2077 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2078     uschar *save_hwm)
2079 nigel 77 {
2080     uschar *ptr = group;
2081 ph10 224
2082 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2083     {
2084 nigel 93 int offset;
2085     uschar *hc;
2086    
2087     /* See if this recursion is on the forward reference list. If so, adjust the
2088     reference. */
2089 ph10 345
2090 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2091     {
2092     offset = GET(hc, 0);
2093     if (cd->start_code + offset == ptr + 1)
2094     {
2095     PUT(hc, 0, offset + adjust);
2096     break;
2097     }
2098     }
2099    
2100     /* Otherwise, adjust the recursion offset if it's after the start of this
2101     group. */
2102    
2103     if (hc >= cd->hwm)
2104     {
2105     offset = GET(ptr, 1);
2106     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2107     }
2108    
2109 nigel 77 ptr += 1 + LINK_SIZE;
2110     }
2111     }
2112    
2113    
2114    
2115     /*************************************************
2116     * Insert an automatic callout point *
2117     *************************************************/
2118    
2119     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2120     callout points before each pattern item.
2121    
2122     Arguments:
2123     code current code pointer
2124     ptr current pattern pointer
2125     cd pointers to tables etc
2126    
2127     Returns: new code pointer
2128     */
2129    
2130     static uschar *
2131     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2132     {
2133     *code++ = OP_CALLOUT;
2134     *code++ = 255;
2135     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2136     PUT(code, LINK_SIZE, 0); /* Default length */
2137     return code + 2*LINK_SIZE;
2138     }
2139    
2140    
2141    
2142     /*************************************************
2143     * Complete a callout item *
2144     *************************************************/
2145    
2146     /* A callout item contains the length of the next item in the pattern, which
2147     we can't fill in till after we have reached the relevant point. This is used
2148     for both automatic and manual callouts.
2149    
2150     Arguments:
2151     previous_callout points to previous callout item
2152     ptr current pattern pointer
2153     cd pointers to tables etc
2154    
2155     Returns: nothing
2156     */
2157    
2158     static void
2159     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2160     {
2161     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2162     PUT(previous_callout, 2 + LINK_SIZE, length);
2163     }
2164    
2165    
2166    
2167     #ifdef SUPPORT_UCP
2168     /*************************************************
2169     * Get othercase range *
2170     *************************************************/
2171    
2172     /* This function is passed the start and end of a class range, in UTF-8 mode
2173     with UCP support. It searches up the characters, looking for internal ranges of
2174     characters in the "other" case. Each call returns the next one, updating the
2175     start address.
2176    
2177     Arguments:
2178     cptr points to starting character value; updated
2179     d end value
2180     ocptr where to put start of othercase range
2181     odptr where to put end of othercase range
2182    
2183     Yield: TRUE when range returned; FALSE when no more
2184     */
2185    
2186     static BOOL
2187 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2188     unsigned int *odptr)
2189 nigel 77 {
2190 nigel 93 unsigned int c, othercase, next;
2191 nigel 77
2192     for (c = *cptr; c <= d; c++)
2193 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2194 nigel 77
2195     if (c > d) return FALSE;
2196    
2197     *ocptr = othercase;
2198     next = othercase + 1;
2199    
2200     for (++c; c <= d; c++)
2201     {
2202 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2203 nigel 77 next++;
2204     }
2205    
2206     *odptr = next - 1;
2207     *cptr = c;
2208    
2209     return TRUE;
2210     }
2211     #endif /* SUPPORT_UCP */
2212    
2213    
2214 nigel 93
2215 nigel 77 /*************************************************
2216 nigel 93 * Check if auto-possessifying is possible *
2217     *************************************************/
2218    
2219     /* This function is called for unlimited repeats of certain items, to see
2220     whether the next thing could possibly match the repeated item. If not, it makes
2221     sense to automatically possessify the repeated item.
2222    
2223     Arguments:
2224     op_code the repeated op code
2225     item data for this item, depends on the opcode
2226     utf8 TRUE in UTF-8 mode
2227     utf8_char used for utf8 character bytes, NULL if not relevant
2228     ptr next character in pattern
2229     options options bits
2230     cd contains pointers to tables etc.
2231    
2232     Returns: TRUE if possessifying is wanted
2233     */
2234    
2235     static BOOL
2236     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2237     const uschar *ptr, int options, compile_data *cd)
2238     {
2239     int next;
2240    
2241     /* Skip whitespace and comments in extended mode */
2242    
2243     if ((options & PCRE_EXTENDED) != 0)
2244     {
2245     for (;;)
2246     {
2247     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2248 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2249 nigel 93 {
2250     while (*(++ptr) != 0)
2251     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2252     }
2253     else break;
2254     }
2255     }
2256    
2257     /* If the next item is one that we can handle, get its value. A non-negative
2258     value is a character, a negative value is an escape value. */
2259    
2260 ph10 391 if (*ptr == CHAR_BACKSLASH)
2261 nigel 93 {
2262     int temperrorcode = 0;
2263     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2264     if (temperrorcode != 0) return FALSE;
2265     ptr++; /* Point after the escape sequence */
2266     }
2267    
2268     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2269     {
2270     #ifdef SUPPORT_UTF8
2271     if (utf8) { GETCHARINC(next, ptr); } else
2272     #endif
2273     next = *ptr++;
2274     }
2275    
2276     else return FALSE;
2277    
2278     /* Skip whitespace and comments in extended mode */
2279    
2280     if ((options & PCRE_EXTENDED) != 0)
2281     {
2282     for (;;)
2283     {
2284     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2285 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2286 nigel 93 {
2287     while (*(++ptr) != 0)
2288     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2289     }
2290     else break;
2291     }
2292     }
2293    
2294     /* If the next thing is itself optional, we have to give up. */
2295    
2296 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2297 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2298     return FALSE;
2299 nigel 93
2300     /* Now compare the next item with the previous opcode. If the previous is a
2301     positive single character match, "item" either contains the character or, if
2302     "item" is greater than 127 in utf8 mode, the character's bytes are in
2303     utf8_char. */
2304    
2305    
2306     /* Handle cases when the next item is a character. */
2307    
2308     if (next >= 0) switch(op_code)
2309     {
2310     case OP_CHAR:
2311     #ifdef SUPPORT_UTF8
2312     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2313 ph10 369 #else
2314     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2315 nigel 93 #endif
2316     return item != next;
2317    
2318     /* For CHARNC (caseless character) we must check the other case. If we have
2319     Unicode property support, we can use it to test the other case of
2320     high-valued characters. */
2321    
2322     case OP_CHARNC:
2323     #ifdef SUPPORT_UTF8
2324     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2325     #endif
2326     if (item == next) return FALSE;
2327     #ifdef SUPPORT_UTF8
2328     if (utf8)
2329     {
2330     unsigned int othercase;
2331     if (next < 128) othercase = cd->fcc[next]; else
2332     #ifdef SUPPORT_UCP
2333 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2334 nigel 93 #else
2335     othercase = NOTACHAR;
2336     #endif
2337     return (unsigned int)item != othercase;
2338     }
2339     else
2340     #endif /* SUPPORT_UTF8 */
2341     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2342    
2343     /* For OP_NOT, "item" must be a single-byte character. */
2344    
2345     case OP_NOT:
2346     if (item == next) return TRUE;
2347     if ((options & PCRE_CASELESS) == 0) return FALSE;
2348     #ifdef SUPPORT_UTF8
2349     if (utf8)
2350     {
2351     unsigned int othercase;
2352     if (next < 128) othercase = cd->fcc[next]; else
2353     #ifdef SUPPORT_UCP
2354 ph10 349 othercase = UCD_OTHERCASE(next);
2355 nigel 93 #else
2356     othercase = NOTACHAR;
2357     #endif
2358     return (unsigned int)item == othercase;
2359     }
2360     else
2361     #endif /* SUPPORT_UTF8 */
2362     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2363    
2364     case OP_DIGIT:
2365     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2366    
2367     case OP_NOT_DIGIT:
2368     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2369    
2370     case OP_WHITESPACE:
2371     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2372    
2373     case OP_NOT_WHITESPACE:
2374     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2375    
2376     case OP_WORDCHAR:
2377     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2378    
2379     case OP_NOT_WORDCHAR:
2380     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2381    
2382 ph10 180 case OP_HSPACE:
2383     case OP_NOT_HSPACE:
2384     switch(next)
2385     {
2386     case 0x09:
2387     case 0x20:
2388     case 0xa0:
2389     case 0x1680:
2390     case 0x180e:
2391     case 0x2000:
2392     case 0x2001:
2393     case 0x2002:
2394     case 0x2003:
2395     case 0x2004:
2396     case 0x2005:
2397     case 0x2006:
2398     case 0x2007:
2399     case 0x2008:
2400     case 0x2009:
2401     case 0x200A:
2402     case 0x202f:
2403     case 0x205f:
2404     case 0x3000:
2405     return op_code != OP_HSPACE;
2406     default:
2407     return op_code == OP_HSPACE;
2408     }
2409    
2410     case OP_VSPACE:
2411     case OP_NOT_VSPACE:
2412     switch(next)
2413     {
2414     case 0x0a:
2415     case 0x0b:
2416     case 0x0c:
2417     case 0x0d:
2418     case 0x85:
2419     case 0x2028:
2420     case 0x2029:
2421     return op_code != OP_VSPACE;
2422     default:
2423     return op_code == OP_VSPACE;
2424     }
2425    
2426 nigel 93 default:
2427     return FALSE;
2428     }
2429    
2430    
2431     /* Handle the case when the next item is \d, \s, etc. */
2432    
2433     switch(op_code)
2434     {
2435     case OP_CHAR:
2436     case OP_CHARNC:
2437     #ifdef SUPPORT_UTF8
2438     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2439     #endif
2440     switch(-next)
2441     {
2442     case ESC_d:
2443     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2444    
2445     case ESC_D:
2446     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2447    
2448     case ESC_s:
2449     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2450    
2451     case ESC_S:
2452     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2453    
2454     case ESC_w:
2455     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2456    
2457     case ESC_W:
2458     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2459 ph10 182
2460 ph10 180 case ESC_h:
2461     case ESC_H:
2462     switch(item)
2463     {
2464     case 0x09:
2465     case 0x20:
2466     case 0xa0:
2467     case 0x1680:
2468     case 0x180e:
2469     case 0x2000:
2470     case 0x2001:
2471     case 0x2002:
2472     case 0x2003:
2473     case 0x2004:
2474     case 0x2005:
2475     case 0x2006:
2476     case 0x2007:
2477     case 0x2008:
2478     case 0x2009:
2479     case 0x200A:
2480     case 0x202f:
2481     case 0x205f:
2482     case 0x3000:
2483     return -next != ESC_h;
2484     default:
2485     return -next == ESC_h;
2486 ph10 182 }
2487    
2488 ph10 180 case ESC_v:
2489     case ESC_V:
2490     switch(item)
2491     {
2492     case 0x0a:
2493     case 0x0b:
2494     case 0x0c:
2495     case 0x0d:
2496     case 0x85:
2497     case 0x2028:
2498     case 0x2029:
2499     return -next != ESC_v;
2500     default:
2501     return -next == ESC_v;
2502 ph10 182 }
2503 nigel 93
2504     default:
2505     return FALSE;
2506     }
2507    
2508     case OP_DIGIT:
2509 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2510     next == -ESC_h || next == -ESC_v;
2511 nigel 93
2512     case OP_NOT_DIGIT:
2513     return next == -ESC_d;
2514    
2515     case OP_WHITESPACE:
2516     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2517    
2518     case OP_NOT_WHITESPACE:
2519 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2520 nigel 93
2521 ph10 180 case OP_HSPACE:
2522     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2523    
2524     case OP_NOT_HSPACE:
2525     return next == -ESC_h;
2526 ph10 182
2527 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2528 ph10 182 case OP_VSPACE:
2529 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2530    
2531     case OP_NOT_VSPACE:
2532 ph10 182 return next == -ESC_v;
2533 ph10 180
2534 nigel 93 case OP_WORDCHAR:
2535 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2536 nigel 93
2537     case OP_NOT_WORDCHAR:
2538     return next == -ESC_w || next == -ESC_d;
2539 ph10 182
2540 nigel 93 default:
2541     return FALSE;
2542     }
2543    
2544     /* Control does not reach here */
2545     }
2546    
2547    
2548    
2549     /*************************************************
2550 nigel 77 * Compile one branch *
2551     *************************************************/
2552    
2553 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2554 nigel 77 changed during the branch, the pointer is used to change the external options
2555 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2556     to find out the amount of memory needed, as well as during the real compile
2557     phase. The value of lengthptr distinguishes the two phases.
2558 nigel 77
2559     Arguments:
2560     optionsptr pointer to the option bits
2561     codeptr points to the pointer to the current code point
2562     ptrptr points to the current pattern pointer
2563     errorcodeptr points to error code variable
2564     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2565     reqbyteptr set to the last literal character required, else < 0
2566     bcptr points to current branch chain
2567     cd contains pointers to tables etc.
2568 nigel 93 lengthptr NULL during the real compile phase
2569     points to length accumulator during pre-compile phase
2570 nigel 77
2571     Returns: TRUE on success
2572     FALSE, with *errorcodeptr set non-zero on error
2573     */
2574    
2575     static BOOL
2576 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2577     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2578     compile_data *cd, int *lengthptr)
2579 nigel 77 {
2580     int repeat_type, op_type;
2581     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2582     int bravalue = 0;
2583     int greedy_default, greedy_non_default;
2584     int firstbyte, reqbyte;
2585     int zeroreqbyte, zerofirstbyte;
2586     int req_caseopt, reqvary, tempreqvary;
2587     int options = *optionsptr;
2588     int after_manual_callout = 0;
2589 nigel 93 int length_prevgroup = 0;
2590 nigel 77 register int c;
2591     register uschar *code = *codeptr;
2592 nigel 93 uschar *last_code = code;
2593     uschar *orig_code = code;
2594 nigel 77 uschar *tempcode;
2595     BOOL inescq = FALSE;
2596     BOOL groupsetfirstbyte = FALSE;
2597     const uschar *ptr = *ptrptr;
2598     const uschar *tempptr;
2599     uschar *previous = NULL;
2600     uschar *previous_callout = NULL;
2601 nigel 93 uschar *save_hwm = NULL;
2602 nigel 77 uschar classbits[32];
2603    
2604     #ifdef SUPPORT_UTF8
2605     BOOL class_utf8;
2606     BOOL utf8 = (options & PCRE_UTF8) != 0;
2607     uschar *class_utf8data;
2608 ph10 300 uschar *class_utf8data_base;
2609 nigel 77 uschar utf8_char[6];
2610     #else
2611     BOOL utf8 = FALSE;
2612 nigel 93 uschar *utf8_char = NULL;
2613 nigel 77 #endif
2614    
2615 nigel 93 #ifdef DEBUG
2616     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2617     #endif
2618    
2619 nigel 77 /* Set up the default and non-default settings for greediness */
2620    
2621     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2622     greedy_non_default = greedy_default ^ 1;
2623    
2624     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2625     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2626     matches a non-fixed first char; reqbyte just remains unset if we never
2627     find one.
2628    
2629     When we hit a repeat whose minimum is zero, we may have to adjust these values
2630     to take the zero repeat into account. This is implemented by setting them to
2631     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2632     item types that can be repeated set these backoff variables appropriately. */
2633    
2634     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2635    
2636     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2637     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2638     value > 255. It is added into the firstbyte or reqbyte variables to record the
2639     case status of the value. This is used only for ASCII characters. */
2640    
2641     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2642    
2643     /* Switch on next character until the end of the branch */
2644    
2645     for (;; ptr++)
2646     {
2647     BOOL negate_class;
2648 ph10 286 BOOL should_flip_negation;
2649 nigel 77 BOOL possessive_quantifier;
2650     BOOL is_quantifier;
2651 nigel 93 BOOL is_recurse;
2652 ph10 180 BOOL reset_bracount;
2653 nigel 77 int class_charcount;
2654     int class_lastchar;
2655     int newoptions;
2656     int recno;
2657 ph10 172 int refsign;
2658 nigel 77 int skipbytes;
2659     int subreqbyte;
2660     int subfirstbyte;
2661 nigel 93 int terminator;
2662 nigel 77 int mclength;
2663     uschar mcbuffer[8];
2664    
2665 nigel 93 /* Get next byte in the pattern */
2666 nigel 77
2667     c = *ptr;
2668 ph10 345
2669 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2670     previous cycle of this loop. */
2671    
2672     if (lengthptr != NULL)
2673     {
2674     #ifdef DEBUG
2675     if (code > cd->hwm) cd->hwm = code; /* High water info */
2676     #endif
2677     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2678     {
2679     *errorcodeptr = ERR52;
2680     goto FAILED;
2681     }
2682    
2683     /* There is at least one situation where code goes backwards: this is the
2684     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2685     the class is simply eliminated. However, it is created first, so we have to
2686     allow memory for it. Therefore, don't ever reduce the length at this point.
2687     */
2688    
2689     if (code < last_code) code = last_code;
2690 ph10 202
2691     /* Paranoid check for integer overflow */
2692    
2693     if (OFLOW_MAX - *lengthptr < code - last_code)
2694     {
2695     *errorcodeptr = ERR20;
2696     goto FAILED;
2697     }
2698    
2699 nigel 93 *lengthptr += code - last_code;
2700     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2701    
2702     /* If "previous" is set and it is not at the start of the work space, move
2703     it back to there, in order to avoid filling up the work space. Otherwise,
2704     if "previous" is NULL, reset the current code pointer to the start. */
2705    
2706     if (previous != NULL)
2707     {
2708     if (previous > orig_code)
2709     {
2710     memmove(orig_code, previous, code - previous);
2711     code -= previous - orig_code;
2712     previous = orig_code;
2713     }
2714     }
2715     else code = orig_code;
2716    
2717     /* Remember where this code item starts so we can pick up the length
2718     next time round. */
2719    
2720     last_code = code;
2721     }
2722    
2723     /* In the real compile phase, just check the workspace used by the forward
2724     reference list. */
2725    
2726     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2727     {
2728     *errorcodeptr = ERR52;
2729     goto FAILED;
2730     }
2731    
2732 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2733    
2734     if (inescq && c != 0)
2735     {
2736 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2737 nigel 77 {
2738     inescq = FALSE;
2739     ptr++;
2740     continue;
2741     }
2742     else
2743     {
2744     if (previous_callout != NULL)
2745     {
2746 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2747     complete_callout(previous_callout, ptr, cd);
2748 nigel 77 previous_callout = NULL;
2749     }
2750     if ((options & PCRE_AUTO_CALLOUT) != 0)
2751     {
2752     previous_callout = code;
2753     code = auto_callout(code, ptr, cd);
2754     }
2755     goto NORMAL_CHAR;
2756     }
2757     }
2758    
2759     /* Fill in length of a previous callout, except when the next thing is
2760     a quantifier. */
2761    
2762 ph10 392 is_quantifier =
2763 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2764     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2765 nigel 77
2766     if (!is_quantifier && previous_callout != NULL &&
2767     after_manual_callout-- <= 0)
2768     {
2769 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2770     complete_callout(previous_callout, ptr, cd);
2771 nigel 77 previous_callout = NULL;
2772     }
2773    
2774     /* In extended mode, skip white space and comments */
2775    
2776     if ((options & PCRE_EXTENDED) != 0)
2777     {
2778     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2779 ph10 391 if (c == CHAR_NUMBER_SIGN)
2780 nigel 77 {
2781 nigel 93 while (*(++ptr) != 0)
2782 nigel 91 {
2783 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2784 nigel 91 }
2785 nigel 93 if (*ptr != 0) continue;
2786    
2787 nigel 91 /* Else fall through to handle end of string */
2788     c = 0;
2789 nigel 77 }
2790     }
2791    
2792     /* No auto callout for quantifiers. */
2793    
2794     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2795     {
2796     previous_callout = code;
2797     code = auto_callout(code, ptr, cd);
2798     }
2799    
2800     switch(c)
2801     {
2802 nigel 93 /* ===================================================================*/
2803     case 0: /* The branch terminates at string end */
2804 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2805     case CHAR_RIGHT_PARENTHESIS:
2806 nigel 77 *firstbyteptr = firstbyte;
2807     *reqbyteptr = reqbyte;
2808     *codeptr = code;
2809     *ptrptr = ptr;
2810 nigel 93 if (lengthptr != NULL)
2811     {
2812 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2813     {
2814     *errorcodeptr = ERR20;
2815     goto FAILED;
2816     }
2817 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2818     DPRINTF((">> end branch\n"));
2819     }
2820 nigel 77 return TRUE;
2821    
2822 nigel 93
2823     /* ===================================================================*/
2824 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2825     the setting of any following char as a first character. */
2826    
2827 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2828 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2829     {
2830     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2831     }
2832     previous = NULL;
2833     *code++ = OP_CIRC;
2834     break;
2835    
2836 ph10 391 case CHAR_DOLLAR_SIGN:
2837 nigel 77 previous = NULL;
2838     *code++ = OP_DOLL;
2839     break;
2840    
2841     /* There can never be a first char if '.' is first, whatever happens about
2842     repeats. The value of reqbyte doesn't change either. */
2843    
2844 ph10 391 case CHAR_DOT:
2845 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846     zerofirstbyte = firstbyte;
2847     zeroreqbyte = reqbyte;
2848     previous = code;
2849 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2850 nigel 77 break;
2851    
2852 nigel 93
2853     /* ===================================================================*/
2854 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2855     32-byte bitmap of the permitted characters, except in the special case
2856     where there is only one such character. For negated classes, we build the
2857     map as usual, then invert it at the end. However, we use a different opcode
2858     so that data characters > 255 can be handled correctly.
2859 nigel 77
2860     If the class contains characters outside the 0-255 range, a different
2861     opcode is compiled. It may optionally have a bit map for characters < 256,
2862     but those above are explicitly listed afterwards. A flag byte tells
2863     whether the bitmap is present, and whether this is a negated class or not.
2864 ph10 345
2865 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2866     default (Perl) mode, it is treated as a data character. */
2867 ph10 345
2868 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2869 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2870     {
2871     *errorcodeptr = ERR64;
2872 ph10 345 goto FAILED;
2873 ph10 336 }
2874 ph10 345 goto NORMAL_CHAR;
2875 nigel 77
2876 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2877 nigel 77 previous = code;
2878    
2879     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2880     they are encountered at the top level, so we'll do that too. */
2881    
2882 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2883 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2884 ph10 295 check_posix_syntax(ptr, &tempptr))
2885 nigel 77 {
2886 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2887 nigel 77 goto FAILED;
2888     }
2889    
2890 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2891 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2892 ph10 205 skip them too. This makes for compatibility with Perl. */
2893 ph10 208
2894 ph10 205 negate_class = FALSE;
2895     for (;;)
2896 nigel 77 {
2897     c = *(++ptr);
2898 ph10 391 if (c == CHAR_BACKSLASH)
2899 ph10 205 {
2900 ph10 392 if (ptr[1] == CHAR_E)
2901 ph10 391 ptr++;
2902 ph10 392 else if (strncmp((const char *)ptr+1,
2903     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2904 ph10 391 ptr += 3;
2905 ph10 392 else
2906 ph10 391 break;
2907 ph10 205 }
2908 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2909 ph10 205 negate_class = TRUE;
2910     else break;
2911 ph10 208 }
2912 ph10 345
2913     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2914     an initial ']' is taken as a data character -- the code below handles
2915 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2916     [^] must match any character, so generate OP_ALLANY. */
2917 ph10 345
2918 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2919 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2920 ph10 341 {
2921     *code++ = negate_class? OP_ALLANY : OP_FAIL;
2922     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2923     zerofirstbyte = firstbyte;
2924     break;
2925 ph10 345 }
2926 nigel 77
2927 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2928     negation flag at the end, so that support for characters > 255 works
2929 ph10 264 correctly (they are all included in the class). */
2930    
2931     should_flip_negation = FALSE;
2932    
2933 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2934 nigel 93 of just a single character (as long as it's < 256). However, for higher
2935     valued UTF-8 characters, we don't yet do any optimization. */
2936 nigel 77
2937     class_charcount = 0;
2938     class_lastchar = -1;
2939    
2940 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2941     temporary bit of memory, in case the class contains only 1 character (less
2942     than 256), because in that case the compiled code doesn't use the bit map.
2943     */
2944    
2945     memset(classbits, 0, 32 * sizeof(uschar));
2946    
2947 nigel 77 #ifdef SUPPORT_UTF8
2948     class_utf8 = FALSE; /* No chars >= 256 */
2949 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2950 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2951 nigel 77 #endif
2952    
2953     /* Process characters until ] is reached. By writing this as a "do" it
2954 nigel 93 means that an initial ] is taken as a data character. At the start of the
2955     loop, c contains the first byte of the character. */
2956 nigel 77
2957 nigel 93 if (c != 0) do
2958 nigel 77 {
2959 nigel 93 const uschar *oldptr;
2960    
2961 nigel 77 #ifdef SUPPORT_UTF8
2962     if (utf8 && c > 127)
2963     { /* Braces are required because the */
2964     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2965     }
2966 ph10 309
2967 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2968 ph10 309 data and reset the pointer. This is so that very large classes that
2969 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2970 ph10 309 (which is on the stack). */
2971    
2972 ph10 300 if (lengthptr != NULL)
2973     {
2974     *lengthptr += class_utf8data - class_utf8data_base;
2975 ph10 309 class_utf8data = class_utf8data_base;
2976     }
2977    
2978 nigel 77 #endif
2979    
2980     /* Inside \Q...\E everything is literal except \E */
2981    
2982     if (inescq)
2983     {
2984 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
2985 nigel 77 {
2986 nigel 93 inescq = FALSE; /* Reset literal state */
2987     ptr++; /* Skip the 'E' */
2988     continue; /* Carry on with next */
2989 nigel 77 }
2990 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2991 nigel 77 }
2992    
2993     /* Handle POSIX class names. Perl allows a negation extension of the
2994     form [:^name:]. A square bracket that doesn't match the syntax is
2995     treated as a literal. We also recognize the POSIX constructions
2996     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2997     5.6 and 5.8 do. */
2998    
2999 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3000 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3001 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3002 nigel 77 {
3003     BOOL local_negate = FALSE;
3004 nigel 87 int posix_class, taboffset, tabopt;
3005 nigel 77 register const uschar *cbits = cd->cbits;
3006 nigel 87 uschar pbits[32];
3007 nigel 77
3008 ph10 391 if (ptr[1] != CHAR_COLON)
3009 nigel 77 {
3010     *errorcodeptr = ERR31;
3011     goto FAILED;
3012     }
3013    
3014     ptr += 2;
3015 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3016 nigel 77 {
3017     local_negate = TRUE;
3018 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3019 nigel 77 ptr++;
3020     }
3021    
3022     posix_class = check_posix_name(ptr, tempptr - ptr);
3023     if (posix_class < 0)
3024     {
3025     *errorcodeptr = ERR30;
3026     goto FAILED;
3027     }
3028    
3029     /* If matching is caseless, upper and lower are converted to
3030     alpha. This relies on the fact that the class table starts with
3031     alpha, lower, upper as the first 3 entries. */
3032    
3033     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3034     posix_class = 0;
3035    
3036 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3037     because we may be adding and subtracting from it, and we don't want to
3038     subtract bits that may be in the main map already. At the end we or the
3039     result into the bit map that is being built. */
3040 nigel 77
3041     posix_class *= 3;
3042 nigel 87
3043     /* Copy in the first table (always present) */
3044    
3045     memcpy(pbits, cbits + posix_class_maps[posix_class],
3046     32 * sizeof(uschar));
3047    
3048     /* If there is a second table, add or remove it as required. */
3049    
3050     taboffset = posix_class_maps[posix_class + 1];
3051     tabopt = posix_class_maps[posix_class + 2];
3052    
3053     if (taboffset >= 0)
3054 nigel 77 {
3055 nigel 87 if (tabopt >= 0)
3056     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3057 nigel 77 else
3058 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3059 nigel 77 }
3060    
3061 nigel 87 /* Now see if we need to remove any special characters. An option
3062     value of 1 removes vertical space and 2 removes underscore. */
3063    
3064     if (tabopt < 0) tabopt = -tabopt;
3065     if (tabopt == 1) pbits[1] &= ~0x3c;
3066     else if (tabopt == 2) pbits[11] &= 0x7f;
3067    
3068     /* Add the POSIX table or its complement into the main table that is
3069     being built and we are done. */
3070    
3071     if (local_negate)
3072     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3073     else
3074     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3075    
3076 nigel 77 ptr = tempptr + 1;
3077     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3078     continue; /* End of POSIX syntax handling */
3079     }
3080    
3081     /* Backslash may introduce a single character, or it may introduce one
3082 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3083     case. Inside a class (and only there) it is treated as backspace.
3084     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3085 ph10 205 to 'or' into the one we are building. We assume they have more than one
3086 nigel 77 character in them, so set class_charcount bigger than one. */
3087    
3088 ph10 391 if (c == CHAR_BACKSLASH)
3089 nigel 77 {
3090 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3091     if (*errorcodeptr != 0) goto FAILED;
3092 nigel 77
3093 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3094     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3095     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3096 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3097     {
3098 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3099 nigel 77 {
3100     ptr += 2; /* avoid empty string */
3101     }
3102     else inescq = TRUE;
3103     continue;
3104     }
3105 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3106 nigel 77
3107     if (c < 0)
3108     {
3109     register const uschar *cbits = cd->cbits;
3110     class_charcount += 2; /* Greater than 1 is what matters */
3111 nigel 93
3112     /* Save time by not doing this in the pre-compile phase. */
3113    
3114     if (lengthptr == NULL) switch (-c)
3115 nigel 77 {
3116     case ESC_d:
3117     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3118     continue;
3119    
3120     case ESC_D:
3121 ph10 286 should_flip_negation = TRUE;
3122 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3123     continue;
3124    
3125     case ESC_w:
3126     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3127     continue;
3128    
3129     case ESC_W:
3130 ph10 286 should_flip_negation = TRUE;
3131 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3132     continue;
3133    
3134     case ESC_s:
3135     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3136     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3137     continue;
3138    
3139     case ESC_S:
3140 ph10 286 should_flip_negation = TRUE;
3141 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3142     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3143     continue;
3144    
3145 nigel 93 default: /* Not recognized; fall through */
3146     break; /* Need "default" setting to stop compiler warning. */
3147     }
3148    
3149     /* In the pre-compile phase, just do the recognition. */
3150    
3151     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3152     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3153 ph10 180
3154 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3155     they use extra memory. */
3156 ph10 180
3157 ph10 178 if (-c == ESC_h)
3158     {
3159     SETBIT(classbits, 0x09); /* VT */
3160     SETBIT(classbits, 0x20); /* SPACE */
3161 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3162 ph10 178 #ifdef SUPPORT_UTF8
3163     if (utf8)
3164 ph10 180 {
3165 ph10 178 class_utf8 = TRUE;
3166     *class_utf8data++ = XCL_SINGLE;
3167 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3168 ph10 178 *class_utf8data++ = XCL_SINGLE;
3169 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3170     *class_utf8data++ = XCL_RANGE;
3171     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3172     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3173 ph10 178 *class_utf8data++ = XCL_SINGLE;
3174 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3175 ph10 178 *class_utf8data++ = XCL_SINGLE;
3176 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3177 ph10 178 *class_utf8data++ = XCL_SINGLE;
3178 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3179     }
3180     #endif
3181     continue;
3182     }
3183 nigel 93
3184 ph10 178 if (-c == ESC_H)
3185     {
3186     for (c = 0; c < 32; c++)
3187     {
3188     int x = 0xff;
3189     switch (c)
3190 ph10 180 {
3191 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3192     case 0x20/8: x ^= 1 << (0x20%8); break;
3193     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3194     default: break;
3195     }
3196     classbits[c] |= x;
3197 ph10 180 }
3198    
3199 ph10 178 #ifdef SUPPORT_UTF8
3200     if (utf8)
3201 ph10 180 {
3202 ph10 178 class_utf8 = TRUE;
3203 ph10 180 *class_utf8data++ = XCL_RANGE;
3204     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3205     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3206     *class_utf8data++ = XCL_RANGE;
3207     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3208     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3209     *class_utf8data++ = XCL_RANGE;
3210     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3211     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3212     *class_utf8data++ = XCL_RANGE;
3213     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3214     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3215     *class_utf8data++ = XCL_RANGE;
3216     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3217     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3218     *class_utf8data++ = XCL_RANGE;
3219     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3220     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3221     *class_utf8data++ = XCL_RANGE;
3222     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3223     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3224     }
3225     #endif
3226     continue;
3227     }
3228 ph10 178
3229     if (-c == ESC_v)
3230     {
3231     SETBIT(classbits, 0x0a); /* LF */
3232     SETBIT(classbits, 0x0b); /* VT */
3233 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3234     SETBIT(classbits, 0x0d); /* CR */
3235     SETBIT(classbits, 0x85); /* NEL */
3236 ph10 178 #ifdef SUPPORT_UTF8
3237     if (utf8)
3238 ph10 180 {
3239 ph10 178 class_utf8 = TRUE;
3240 ph10 180 *class_utf8data++ = XCL_RANGE;
3241     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3242     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3243     }
3244     #endif
3245     continue;
3246     }
3247 ph10 178
3248     if (-c == ESC_V)
3249     {
3250     for (c = 0; c < 32; c++)
3251     {
3252     int x = 0xff;
3253     switch (c)
3254 ph10 180 {
3255 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3256     x ^= 1 << (0x0b%8);
3257     x ^= 1 << (0x0c%8);
3258 ph10 180 x ^= 1 << (0x0d%8);
3259 ph10 178 break;
3260     case 0x85/8: x ^= 1 << (0x85%8); break;
3261     default: break;
3262     }
3263     classbits[c] |= x;
3264 ph10 180 }
3265    
3266 ph10 178 #ifdef SUPPORT_UTF8
3267     if (utf8)
3268 ph10 180 {
3269 ph10 178 class_utf8 = TRUE;
3270 ph10 180 *class_utf8data++ = XCL_RANGE;
3271     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3272     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3273     *class_utf8data++ = XCL_RANGE;
3274     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3275     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3276     }
3277     #endif
3278     continue;
3279     }
3280 ph10 178
3281 nigel 93 /* We need to deal with \P and \p in both phases. */
3282    
3283 nigel 77 #ifdef SUPPORT_UCP
3284 nigel 93 if (-c == ESC_p || -c == ESC_P)
3285     {
3286     BOOL negated;
3287     int pdata;
3288     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3289     if (ptype < 0) goto FAILED;
3290     class_utf8 = TRUE;
3291     *class_utf8data++ = ((-c == ESC_p) != negated)?
3292     XCL_PROP : XCL_NOTPROP;
3293     *class_utf8data++ = ptype;
3294     *class_utf8data++ = pdata;
3295     class_charcount -= 2; /* Not a < 256 character */
3296 nigel 77 continue;
3297 nigel 93 }
3298 nigel 77 #endif
3299 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3300     strict mode. By default, for compatibility with Perl, they are
3301     treated as literals. */
3302 nigel 77
3303 nigel 93 if ((options & PCRE_EXTRA) != 0)
3304     {
3305     *errorcodeptr = ERR7;
3306     goto FAILED;
3307     }
3308 nigel 77
3309 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3310     c = *ptr; /* Get the final character and fall through */
3311 nigel 77 }
3312    
3313     /* Fall through if we have a single character (c >= 0). This may be
3314 nigel 93 greater than 256 in UTF-8 mode. */
3315 nigel 77
3316     } /* End of backslash handling */
3317    
3318     /* A single character may be followed by '-' to form a range. However,
3319     Perl does not permit ']' to be the end of the range. A '-' character
3320 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3321     entirely. The code for handling \Q and \E is messy. */
3322 nigel 77
3323 nigel 93 CHECK_RANGE:
3324 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3325 nigel 77 {
3326 nigel 93 inescq = FALSE;
3327     ptr += 2;
3328     }
3329    
3330     oldptr = ptr;
3331 ph10 231
3332 ph10 230 /* Remember \r or \n */
3333 ph10 231
3334 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3335 ph10 231
3336 ph10 230 /* Check for range */
3337 nigel 93
3338 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3339 nigel 93 {
3340 nigel 77 int d;
3341     ptr += 2;
3342 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3343 nigel 77
3344 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3345     mode. */
3346    
3347 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3348 nigel 93 {
3349     ptr += 2;
3350 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3351 ph10 391 { ptr += 2; continue; }
3352 nigel 93 inescq = TRUE;
3353     break;
3354     }
3355    
3356 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3357 nigel 93 {
3358     ptr = oldptr;
3359     goto LONE_SINGLE_CHARACTER;
3360     }
3361    
3362 nigel 77 #ifdef SUPPORT_UTF8
3363     if (utf8)
3364     { /* Braces are required because the */
3365     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3366     }
3367     else
3368     #endif
3369     d = *ptr; /* Not UTF-8 mode */
3370    
3371     /* The second part of a range can be a single-character escape, but
3372     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3373     in such circumstances. */
3374    
3375 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3376 nigel 77 {
3377 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3378     if (*errorcodeptr != 0) goto FAILED;
3379 nigel 77
3380 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3381 nigel 93 special means the '-' was literal */
3382 nigel 77
3383     if (d < 0)
3384     {
3385 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3386     else if (d == -ESC_X) d = CHAR_X;
3387     else if (d == -ESC_R) d = CHAR_R; else
3388 nigel 77 {
3389 nigel 93 ptr = oldptr;
3390 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3391     }
3392     }
3393     }
3394    
3395 nigel 93 /* Check that the two values are in the correct order. Optimize
3396     one-character ranges */
3397 nigel 77
3398 nigel 93 if (d < c)
3399     {
3400     *errorcodeptr = ERR8;
3401     goto FAILED;
3402     }
3403    
3404 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3405    
3406 ph10 230 /* Remember \r or \n */
3407 ph10 231
3408 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3409 ph10 231
3410 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3411     matching, we have to use an XCLASS with extra data items. Caseless
3412     matching for characters > 127 is available only if UCP support is
3413     available. */
3414    
3415     #ifdef SUPPORT_UTF8
3416     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3417     {
3418     class_utf8 = TRUE;
3419    
3420     /* With UCP support, we can find the other case equivalents of
3421     the relevant characters. There may be several ranges. Optimize how
3422     they fit with the basic range. */
3423    
3424     #ifdef SUPPORT_UCP
3425     if ((options & PCRE_CASELESS) != 0)
3426     {
3427 nigel 93 unsigned int occ, ocd;
3428     unsigned int cc = c;
3429     unsigned int origd = d;
3430 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3431     {
3432 ph10 180 if (occ >= (unsigned int)c &&
3433     ocd <= (unsigned int)d)
3434 ph10 176 continue; /* Skip embedded ranges */
3435 nigel 77
3436 ph10 180 if (occ < (unsigned int)c &&
3437 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3438 nigel 77 { /* if there is overlap, */
3439     c = occ; /* noting that if occ < c */
3440     continue; /* we can't have ocd > d */
3441     } /* because a subrange is */
3442 ph10 180 if (ocd > (unsigned int)d &&
3443 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3444 nigel 77 { /* the basic range. */
3445     d = ocd;
3446     continue;
3447     }
3448    
3449     if (occ == ocd)
3450     {
3451     *class_utf8data++ = XCL_SINGLE;
3452     }
3453     else
3454     {
3455     *class_utf8data++ = XCL_RANGE;
3456     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3457     }
3458     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3459     }
3460     }
3461     #endif /* SUPPORT_UCP */
3462    
3463     /* Now record the original range, possibly modified for UCP caseless
3464     overlapping ranges. */
3465    
3466     *class_utf8data++ = XCL_RANGE;
3467     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3468     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3469    
3470     /* With UCP support, we are done. Without UCP support, there is no
3471     caseless matching for UTF-8 characters > 127; we can use the bit map
3472     for the smaller ones. */
3473    
3474     #ifdef SUPPORT_UCP
3475     continue; /* With next character in the class */
3476     #else
3477     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3478    
3479     /* Adjust upper limit and fall through to set up the map */
3480    
3481     d = 127;
3482    
3483     #endif /* SUPPORT_UCP */
3484     }
3485     #endif /* SUPPORT_UTF8 */
3486    
3487     /* We use the bit map for all cases when not in UTF-8 mode; else
3488     ranges that lie entirely within 0-127 when there is UCP support; else
3489     for partial ranges without UCP support. */
3490    
3491 nigel 93 class_charcount += d - c + 1;
3492     class_lastchar = d;
3493    
3494     /* We can save a bit of time by skipping this in the pre-compile. */
3495    
3496     if (lengthptr == NULL) for (; c <= d; c++)
3497 nigel 77 {
3498     classbits[c/8] |= (1 << (c&7));
3499     if ((options & PCRE_CASELESS) != 0)
3500     {
3501     int uc = cd->fcc[c]; /* flip case */
3502     classbits[uc/8] |= (1 << (uc&7));
3503     }
3504     }
3505    
3506     continue; /* Go get the next char in the class */
3507     }
3508    
3509     /* Handle a lone single character - we can get here for a normal
3510     non-escape char, or after \ that introduces a single character or for an
3511     apparent range that isn't. */
3512    
3513     LONE_SINGLE_CHARACTER:
3514 ph10 231
3515 nigel 77 /* Handle a character that cannot go in the bit map */
3516    
3517     #ifdef SUPPORT_UTF8
3518     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3519     {
3520     class_utf8 = TRUE;
3521     *class_utf8data++ = XCL_SINGLE;
3522     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3523    
3524     #ifdef SUPPORT_UCP
3525     if ((options & PCRE_CASELESS) != 0)
3526     {
3527 nigel 93 unsigned int othercase;
3528 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3529 nigel 77 {
3530     *class_utf8data++ = XCL_SINGLE;
3531     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3532     }
3533     }
3534     #endif /* SUPPORT_UCP */
3535    
3536     }
3537     else
3538     #endif /* SUPPORT_UTF8 */
3539    
3540     /* Handle a single-byte character */
3541     {
3542     classbits[c/8] |= (1 << (c&7));
3543     if ((options & PCRE_CASELESS) != 0)
3544     {
3545     c = cd->fcc[c]; /* flip case */
3546     classbits[c/8] |= (1 << (c&7));
3547     }
3548     class_charcount++;
3549     class_lastchar = c;
3550     }
3551     }
3552    
3553 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3554 nigel 77
3555 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3556 nigel 77
3557 nigel 93 if (c == 0) /* Missing terminating ']' */
3558     {
3559     *errorcodeptr = ERR6;
3560     goto FAILED;
3561     }
3562 ph10 231
3563    
3564 ph10 230 /* This code has been disabled because it would mean that \s counts as
3565     an explicit \r or \n reference, and that's not really what is wanted. Now
3566     we set the flag only if there is a literal "\r" or "\n" in the class. */
3567 ph10 227
3568 ph10 230 #if 0
3569 ph10 226 /* Remember whether \r or \n are in this class */
3570 ph10 227
3571 ph10 226 if (negate_class)
3572     {
3573 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3574 ph10 226 }
3575     else
3576     {
3577 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3578 ph10 227 }
3579 ph10 230 #endif
3580 ph10 227
3581 ph10 231
3582 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3583 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3584     use of \p or \P, in other words, no use of any XCLASS features, we can
3585     optimize.
3586    
3587 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3588     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3589     operate on single-bytes only. This is an historical hangover. Maybe one day
3590     we can tidy these opcodes to handle multi-byte characters.
3591 nigel 77
3592     The optimization throws away the bit map. We turn the item into a
3593     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3594     that OP_NOT does not support multibyte characters. In the positive case, it
3595     can cause firstbyte to be set. Otherwise, there can be no first char if
3596     this item is first, whatever repeat count may follow. In the case of
3597     reqbyte, save the previous value for reinstating. */
3598    
3599     #ifdef SUPPORT_UTF8
3600 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3601 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3602 nigel 77 #else
3603     if (class_charcount == 1)
3604     #endif
3605     {
3606     zeroreqbyte = reqbyte;
3607    
3608     /* The OP_NOT opcode works on one-byte characters only. */
3609    
3610     if (negate_class)
3611     {
3612     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3613     zerofirstbyte = firstbyte;
3614     *code++ = OP_NOT;
3615     *code++ = class_lastchar;
3616     break;
3617     }
3618    
3619     /* For a single, positive character, get the value into mcbuffer, and
3620     then we can handle this with the normal one-character code. */
3621    
3622     #ifdef SUPPORT_UTF8
3623     if (utf8 && class_lastchar > 127)
3624     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3625     else
3626     #endif
3627     {
3628     mcbuffer[0] = class_lastchar;
3629     mclength = 1;
3630     }
3631     goto ONE_CHAR;
3632     } /* End of 1-char optimization */
3633    
3634     /* The general case - not the one-char optimization. If this is the first
3635     thing in the branch, there can be no first char setting, whatever the
3636     repeat count. Any reqbyte setting must remain unchanged after any kind of
3637     repeat. */
3638    
3639     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3640     zerofirstbyte = firstbyte;
3641     zeroreqbyte = reqbyte;
3642    
3643     /* If there are characters with values > 255, we have to compile an
3644 ph10 286 extended class, with its own opcode, unless there was a negated special
3645     such as \S in the class, because in that case all characters > 255 are in
3646     the class, so any that were explicitly given as well can be ignored. If
3647 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3648     characters < 256, we can omit the bitmap in the actual compiled code. */
3649 nigel 77
3650     #ifdef SUPPORT_UTF8
3651 ph10 264 if (class_utf8 && !should_flip_negation)
3652 nigel 77 {
3653     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3654     *code++ = OP_XCLASS;
3655     code += LINK_SIZE;
3656     *code = negate_class? XCL_NOT : 0;
3657    
3658 nigel 93 /* If the map is required, move up the extra data to make room for it;
3659     otherwise just move the code pointer to the end of the extra data. */
3660 nigel 77
3661     if (class_charcount > 0)
3662     {
3663     *code++ |= XCL_MAP;
3664 nigel 93 memmove(code + 32, code, class_utf8data - code);
3665 nigel 77 memcpy(code, classbits, 32);
3666 nigel 93 code = class_utf8data + 32;
3667 nigel 77 }
3668 nigel 93 else code = class_utf8data;
3669 nigel 77
3670     /* Now fill in the complete length of the item */
3671    
3672     PUT(previous, 1, code - previous);
3673     break; /* End of class handling */
3674     }
3675     #endif
3676    
3677 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3678     OP_NCLASS, depending on whether the whole class was negated and whether
3679     there were negative specials such as \S in the class. Then copy the 32-byte
3680 ph10 264 map into the code vector, negating it if necessary. */
3681 ph10 286
3682 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3683 nigel 77 if (negate_class)
3684     {
3685 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3686     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3687 nigel 77 }
3688     else
3689     {
3690     memcpy(code, classbits, 32);
3691     }
3692     code += 32;
3693     break;
3694    
3695 nigel 93
3696     /* ===================================================================*/
3697 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3698     has been tested above. */
3699    
3700 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3701 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3702     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3703     if (*errorcodeptr != 0) goto FAILED;
3704     goto REPEAT;
3705    
3706 ph10 391 case CHAR_ASTERISK:
3707 nigel 77 repeat_min = 0;
3708     repeat_max = -1;
3709     goto REPEAT;
3710    
3711 ph10 391 case CHAR_PLUS:
3712 nigel 77 repeat_min = 1;
3713     repeat_max = -1;
3714     goto REPEAT;
3715    
3716 ph10 391 case CHAR_QUESTION_MARK:
3717 nigel 77 repeat_min = 0;
3718     repeat_max = 1;
3719    
3720     REPEAT:
3721     if (previous == NULL)
3722     {
3723     *errorcodeptr = ERR9;
3724     goto FAILED;
3725     }
3726    
3727     if (repeat_min == 0)
3728     {
3729     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3730     reqbyte = zeroreqbyte; /* Ditto */
3731     }
3732    
3733     /* Remember whether this is a variable length repeat */
3734    
3735     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3736    
3737     op_type = 0; /* Default single-char op codes */
3738     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3739    
3740     /* Save start of previous item, in case we have to move it up to make space
3741     for an inserted OP_ONCE for the additional '+' extension. */
3742    
3743     tempcode = previous;
3744    
3745     /* If the next character is '+', we have a possessive quantifier. This
3746     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3747     If the next character is '?' this is a minimizing repeat, by default,
3748     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3749     repeat type to the non-default. */
3750    
3751 ph10 391 if (ptr[1] == CHAR_PLUS)
3752 nigel 77 {
3753     repeat_type = 0; /* Force greedy */
3754     possessive_quantifier = TRUE;
3755     ptr++;
3756     }
3757 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3758 nigel 77 {
3759     repeat_type = greedy_non_default;
3760     ptr++;
3761     }
3762     else repeat_type = greedy_default;
3763    
3764     /* If previous was a character match, abolish the item and generate a
3765     repeat item instead. If a char item has a minimum of more than one, ensure
3766     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3767     the first thing in a branch because the x will have gone into firstbyte
3768     instead. */
3769    
3770     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3771     {
3772     /* Deal with UTF-8 characters that take up more than one byte. It's
3773     easier to write this out separately than try to macrify it. Use c to
3774     hold the length of the character in bytes, plus 0x80 to flag that it's a
3775     length rather than a small character. */
3776    
3777     #ifdef SUPPORT_UTF8
3778     if (utf8 && (code[-1] & 0x80) != 0)
3779     {
3780     uschar *lastchar = code - 1;
3781     while((*lastchar & 0xc0) == 0x80) lastchar--;
3782     c = code - lastchar; /* Length of UTF-8 character */
3783     memcpy(utf8_char, lastchar, c); /* Save the char */
3784     c |= 0x80; /* Flag c as a length */
3785     }
3786     else
3787     #endif
3788    
3789     /* Handle the case of a single byte - either with no UTF8 support, or
3790     with UTF-8 disabled, or for a UTF-8 character < 128. */
3791    
3792     {
3793     c = code[-1];
3794     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3795     }
3796    
3797 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3798     the line is something that cannot possibly match this character. If so,
3799     automatically possessifying this item gains some performance in the case
3800     where the match fails. */
3801    
3802     if (!possessive_quantifier &&
3803     repeat_max < 0 &&
3804     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3805     options, cd))
3806     {
3807     repeat_type = 0; /* Force greedy */
3808     possessive_quantifier = TRUE;
3809     }
3810    
3811 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3812     }
3813    
3814     /* If previous was a single negated character ([^a] or similar), we use
3815     one of the special opcodes, replacing it. The code is shared with single-
3816     character repeats by setting op_type to add a suitable offset into
3817 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3818     currently used only for single-byte chars. */
3819 nigel 77
3820     else if (*previous == OP_NOT)
3821     {
3822     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3823     c = previous[1];
3824 nigel 93 if (!possessive_quantifier &&
3825     repeat_max < 0 &&
3826     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3827     {
3828     repeat_type = 0; /* Force greedy */
3829     possessive_quantifier = TRUE;
3830     }
3831 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3832     }
3833    
3834     /* If previous was a character type match (\d or similar), abolish it and
3835     create a suitable repeat item. The code is shared with single-character
3836     repeats by setting op_type to add a suitable offset into repeat_type. Note
3837     that the Unicode property types will be present only when SUPPORT_UCP is
3838     defined, but we don't wrap the little bits of code here because it just
3839     makes it horribly messy. */
3840    
3841     else if (*previous < OP_EODN)
3842     {
3843     uschar *oldcode;
3844 nigel 87 int prop_type, prop_value;
3845 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3846     c = *previous;
3847    
3848 nigel 93 if (!possessive_quantifier &&
3849     repeat_max < 0 &&
3850     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3851     {
3852     repeat_type = 0; /* Force greedy */
3853     possessive_quantifier = TRUE;
3854     }
3855    
3856 nigel 77 OUTPUT_SINGLE_REPEAT:
3857 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3858     {
3859     prop_type = previous[1];
3860     prop_value = previous[2];
3861     }
3862     else prop_type = prop_value = -1;
3863 nigel 77
3864     oldcode = code;
3865     code = previous; /* Usually overwrite previous item */
3866    
3867     /* If the maximum is zero then the minimum must also be zero; Perl allows
3868     this case, so we do too - by simply omitting the item altogether. */
3869    
3870     if (repeat_max == 0) goto END_REPEAT;
3871    
3872     /* All real repeats make it impossible to handle partial matching (maybe
3873     one day we will be able to remove this restriction). */
3874    
3875 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3876 nigel 77
3877     /* Combine the op_type with the repeat_type */
3878    
3879     repeat_type += op_type;
3880    
3881     /* A minimum of zero is handled either as the special case * or ?, or as
3882     an UPTO, with the maximum given. */
3883    
3884     if (repeat_min == 0)
3885     {
3886     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3887     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3888     else
3889     {
3890     *code++ = OP_UPTO + repeat_type;
3891     PUT2INC(code, 0, repeat_max);
3892     }
3893     }
3894    
3895     /* A repeat minimum of 1 is optimized into some special cases. If the
3896 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3897 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3898     one less than the maximum. */
3899    
3900     else if (repeat_min == 1)
3901     {
3902     if (repeat_max == -1)
3903     *code++ = OP_PLUS + repeat_type;
3904     else
3905     {
3906     code = oldcode; /* leave previous item in place */
3907     if (repeat_max == 1) goto END_REPEAT;
3908     *code++ = OP_UPTO + repeat_type;
3909     PUT2INC(code, 0, repeat_max - 1);
3910     }
3911     }
3912    
3913     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3914     handled as an EXACT followed by an UPTO. */
3915    
3916     else
3917     {
3918     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3919     PUT2INC(code, 0, repeat_min);
3920    
3921     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3922     we have to insert the character for the previous code. For a repeated
3923 nigel 87 Unicode property match, there are two extra bytes that define the
3924 nigel 77 required property. In UTF-8 mode, long characters have their length in
3925     c, with the 0x80 bit as a flag. */
3926    
3927     if (repeat_max < 0)
3928     {
3929     #ifdef SUPPORT_UTF8
3930     if (utf8 && c >= 128)
3931     {
3932     memcpy(code, utf8_char, c & 7);
3933     code += c & 7;
3934     }
3935     else
3936     #endif
3937     {
3938     *code++ = c;
3939 nigel 87 if (prop_type >= 0)
3940     {
3941     *code++ = prop_type;
3942     *code++ = prop_value;
3943     }
3944 nigel 77 }
3945     *code++ = OP_STAR + repeat_type;
3946     }
3947    
3948     /* Else insert an UPTO if the max is greater than the min, again
3949 nigel 93 preceded by the character, for the previously inserted code. If the
3950     UPTO is just for 1 instance, we can use QUERY instead. */
3951 nigel 77
3952     else if (repeat_max != repeat_min)
3953     {
3954     #ifdef SUPPORT_UTF8
3955     if (utf8 && c >= 128)
3956     {
3957     memcpy(code, utf8_char, c & 7);
3958     code += c & 7;
3959     }
3960     else
3961     #endif
3962     *code++ = c;
3963 nigel 87 if (prop_type >= 0)
3964     {
3965     *code++ = prop_type;
3966     *code++ = prop_value;
3967     }
3968 nigel 77 repeat_max -= repeat_min;
3969 nigel 93
3970     if (repeat_max == 1)
3971     {
3972     *code++ = OP_QUERY + repeat_type;
3973     }
3974     else
3975     {
3976     *code++ = OP_UPTO + repeat_type;
3977     PUT2INC(code, 0, repeat_max);
3978     }
3979 nigel 77 }
3980     }
3981    
3982     /* The character or character type itself comes last in all cases. */
3983    
3984     #ifdef SUPPORT_UTF8
3985     if (utf8 && c >= 128)
3986     {
3987     memcpy(code, utf8_char, c & 7);
3988     code += c & 7;
3989     }
3990     else
3991     #endif
3992     *code++ = c;
3993    
3994 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3995     define the required property. */
3996 nigel 77
3997     #ifdef SUPPORT_UCP
3998 nigel 87 if (prop_type >= 0)
3999     {
4000     *code++ = prop_type;
4001     *code++ = prop_value;
4002     }
4003 nigel 77 #endif
4004     }
4005    
4006     /* If previous was a character class or a back reference, we put the repeat
4007     stuff after it, but just skip the item if the repeat was {0,0}. */
4008    
4009     else if (*previous == OP_CLASS ||
4010     *previous == OP_NCLASS ||
4011     #ifdef SUPPORT_UTF8
4012     *previous == OP_XCLASS ||
4013     #endif
4014     *previous == OP_REF)
4015     {
4016     if (repeat_max == 0)
4017     {
4018     code = previous;
4019     goto END_REPEAT;
4020     }
4021    
4022     /* All real repeats make it impossible to handle partial matching (maybe
4023     one day we will be able to remove this restriction). */
4024