/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 513 - (hide annotations) (download)
Mon May 3 11:13:37 2010 UTC (4 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 225150 byte(s)
Make \R and \X in a character class behave more like Perl

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte map a. The arguments and the whole expansion are parenthesized so that
an expression argument (for example c + 1) selects the intended byte and bit.
Note that b appears twice in the expansion, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
#define COMPILE_WORK_SIZE (4096)

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 392 0, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* One entry in the table of special "verbs". */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264    
/* STRING turns its argument into a string literal as written; XSTRING expands
the argument as a macro first and then turns the result into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
267    
268 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
269 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
270     they are documented. Always add a new error instead. Messages marked DEAD below
271 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
272     the number of relocations needed when a shared library is loaded dynamically,
273     it is now one long string. We cannot use a table of offsets, because the
274     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
275     simply count through to the one we want - this isn't a performance issue
276 ph10 507 because these strings are used only when there is a compilation error.
277 nigel 77
278 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
279     substring, so that the whole string ends with \0\0, which can be detected when
280 ph10 499 counting through. */
281    
282 ph10 240 static const char error_texts[] =
283     "no error\0"
284     "\\ at end of pattern\0"
285     "\\c at end of pattern\0"
286     "unrecognized character follows \\\0"
287     "numbers out of order in {} quantifier\0"
288 nigel 77 /* 5 */
289 ph10 240 "number too big in {} quantifier\0"
290     "missing terminating ] for character class\0"
291     "invalid escape sequence in character class\0"
292     "range out of order in character class\0"
293     "nothing to repeat\0"
294 nigel 77 /* 10 */
295 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
296     "internal error: unexpected repeat\0"
297 ph10 269 "unrecognized character after (? or (?-\0"
298 ph10 240 "POSIX named classes are supported only within a class\0"
299     "missing )\0"
300 nigel 77 /* 15 */
301 ph10 240 "reference to non-existent subpattern\0"
302     "erroffset passed as NULL\0"
303     "unknown option bit(s) set\0"
304     "missing ) after comment\0"
305     "parentheses nested too deeply\0" /** DEAD **/
306 nigel 77 /* 20 */
307 ph10 240 "regular expression is too large\0"
308     "failed to get memory\0"
309     "unmatched parentheses\0"
310     "internal error: code overflow\0"
311     "unrecognized character after (?<\0"
312 nigel 77 /* 25 */
313 ph10 240 "lookbehind assertion is not fixed length\0"
314     "malformed number or name after (?(\0"
315     "conditional group contains more than two branches\0"
316     "assertion expected after (?(\0"
317     "(?R or (?[+-]digits must be followed by )\0"
318 nigel 77 /* 30 */
319 ph10 240 "unknown POSIX class name\0"
320     "POSIX collating elements are not supported\0"
321     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
322     "spare error\0" /** DEAD **/
323     "character value in \\x{...} sequence is too large\0"
324 nigel 77 /* 35 */
325 ph10 240 "invalid condition (?(0)\0"
326     "\\C not allowed in lookbehind assertion\0"
327     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
328     "number after (?C is > 255\0"
329     "closing ) for (?C expected\0"
330 nigel 77 /* 40 */
331 ph10 240 "recursive call could loop indefinitely\0"
332     "unrecognized character after (?P\0"
333     "syntax error in subpattern name (missing terminator)\0"
334     "two named subpatterns have the same name\0"
335     "invalid UTF-8 string\0"
336 nigel 77 /* 45 */
337 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
338     "malformed \\P or \\p sequence\0"
339     "unknown property name after \\P or \\p\0"
340     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
341     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
342 nigel 91 /* 50 */
343 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
344     "octal value is greater than \\377 (not in UTF-8 mode)\0"
345     "internal error: overran compiling workspace\0"
346     "internal error: previously-checked referenced subpattern not found\0"
347     "DEFINE group contains more than one branch\0"
348 nigel 93 /* 55 */
349 ph10 240 "repeating a DEFINE group is not allowed\0"
350     "inconsistent NEWLINE options\0"
351 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
352     "a numbered reference must not be zero\0"
353 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
354 ph10 211 /* 60 */
355 ph10 240 "(*VERB) not recognized\0"
356 ph10 268 "number is too big\0"
357 ph10 272 "subpattern name expected\0"
358 ph10 336 "digit expected after (?+\0"
359 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
360     /* 65 */
361 ph10 510 "different names for subpatterns of the same number are not allowed\0"
362 ph10 512 "(*MARK) must have an argument\0"
363 ph10 510 ;
364 nigel 77
365     /* Table to identify digits and hex digits. This is used when compiling
366     patterns. Note that the tables in chartables are dependent on the locale, and
367     may mark arbitrary characters as digits - but the PCRE compiling code expects
368     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
369     a private table here. It costs 256 bytes, but it is a lot faster than doing
370     character value tests (at least in some simple cases I timed), and in some
371     applications one wants PCRE to compile efficiently as well as match
372     efficiently.
373    
374     For convenience, we use the same bit definitions as in chartables:
375    
376     0x04 decimal digit
377     0x08 hexadecimal digit
378    
379     Then we can use ctype_digit and ctype_xdigit in the code. */
380    
381 ph10 392 #ifndef EBCDIC
382 ph10 391
383 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
384 ph10 391 UTF-8 mode. */
385    
/* Digit table for ASCII/UTF-8: 0x04 marks a decimal digit and 0x08 a
hexadecimal digit, so 0x0c marks a character that is both. The comment on each
row gives the range of character codes the row covers. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  48- 55  0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63  8 - ? */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  64- 71  @ - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- 79  H - O */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  80- 87  P - W */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95  X - _ */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  96-103  ` - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104-111  h - o */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119  p - w */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120-127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
420    
421 ph10 392 #else
422 ph10 391
423     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
424    
/* Digit table for EBCDIC: 0x04 marks a decimal digit and 0x08 a hexadecimal
digit, so 0x0c marks a character that is both. The comment on each row gives
the range of character codes (in hexadecimal) the row covers. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 00-07 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 08-0F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 10-17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18-1F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 20-27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28-2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30-37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38-3F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40-47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48-4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50-57 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 58-5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60-67 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 68-6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70-77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78-7F */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 80-87  a - g */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88-8F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 90-97 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 98-9F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A0-A7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* A8-AF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0-B7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B8-BF */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* C0-C7  A - G */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* C8-CF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D0-D7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* D8-DF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E0-E7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* E8-EF */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* F0-F7  0 - 7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* F8-FF  8 - 9 */
459    
/* Partial duplicate of the chartables character-type data, for EBCDIC code
points (per the original "chartable partial dup" note). The comment on each row
gives the range of character codes (in hexadecimal) the row covers. */

static const unsigned char ebcdic_chartab[] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 00-07 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 08-0F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 10-17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 18-1F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 20-27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 28-2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 30-37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 38-3F */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40-47 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 48-4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 50-57 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 58-5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 60-67 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 68-6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 70-77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 78-7F */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 80-87 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 88-8F */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 90-97 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 98-9F */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* A0-A7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* A8-AF */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* B0-B7 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* B8-BF */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* C0-C7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* C8-CF */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* D0-D7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* D8-DF */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* E0-E7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* E8-EF */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* F0-F7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* F8-FF */
493     #endif
494    
495    
496     /* Definition to allow mutual recursion */
497    
498     static BOOL
499 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
500 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
501 nigel 77
502    
503    
504     /*************************************************
505 ph10 240 * Find an error text *
506     *************************************************/
507    
508 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
509     some of the text is of unknown length, we can't use a table of offsets.
510     Instead, just count through the strings. This is not a performance issue
511 ph10 240 because it happens only when there has been a compilation error.
512    
513     Argument: the error number
514     Returns: pointer to the error string
515     */
516    
517     static const char *
518     find_error_text(int n)
519     {
520     const char *s = error_texts;
521 ph10 507 for (; n > 0; n--)
522 ph10 499 {
523     while (*s++ != 0) {};
524     if (*s == 0) return "Error text not found (please report)";
525 ph10 507 }
526 ph10 240 return s;
527     }
528    
529    
530     /*************************************************
531 nigel 77 * Handle escapes *
532     *************************************************/
533    
534     /* This function is called when a \ has been encountered. It either returns a
535     positive value for a simple escape such as \n, or a negative value which
536 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
537     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
538     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
539     ptr is pointing at the \. On exit, it is on the final character of the escape
540     sequence.
541 nigel 77
542     Arguments:
543     ptrptr points to the pattern position pointer
544     errorcodeptr points to the errorcode variable
545     bracount number of previous extracting brackets
546     options the options bits
547     isclass TRUE if inside a character class
548    
549     Returns: zero or positive => a data character
550     negative => a special escape sequence
551 ph10 213 on error, errorcodeptr is set
552 nigel 77 */
553    
554     static int
555     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
556     int options, BOOL isclass)
557     {
558 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
559     const uschar *ptr = *ptrptr + 1;
560 nigel 77 int c, i;
561    
562 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
563     ptr--; /* Set pointer back to the last byte */
564    
565 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
566    
567     if (c == 0) *errorcodeptr = ERR1;
568    
569 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
570     in a table. A non-zero result is something that can be returned immediately.
571 nigel 77 Otherwise further processing may be required. */
572    
573 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
574     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
575     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
576 nigel 77
577 ph10 97 #else /* EBCDIC coding */
578 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
579 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
580     #endif
581    
582     /* Escapes that need further processing, or are illegal. */
583    
584     else
585     {
586     const uschar *oldptr;
587 nigel 93 BOOL braced, negated;
588    
589 nigel 77 switch (c)
590     {
591     /* A number of Perl escapes are not handled by PCRE. We give an explicit
592     error. */
593    
594 ph10 391 case CHAR_l:
595     case CHAR_L:
596     case CHAR_N:
597     case CHAR_u:
598     case CHAR_U:
599 nigel 77 *errorcodeptr = ERR37;
600     break;
601    
602 ph10 333 /* \g must be followed by one of a number of specific things:
603 ph10 345
604 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
605     backreference. If negative, it is a relative backreference. This is a Perl
606     5.10 feature.
607 ph10 345
608 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
609     is part of Perl's movement towards a unified syntax for back references. As
610     this is synonymous with \k{name}, we fudge it up by pretending it really
611     was \k.
612 ph10 345
613     (3) For Oniguruma compatibility we also support \g followed by a name or a
614     number either in angle brackets or in single quotes. However, these are
615     (possibly recursive) subroutine calls, _not_ backreferences. Just return
616 ph10 333 the -ESC_g code (cf \k). */
617 nigel 93
618 ph10 391 case CHAR_g:
619     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
620 ph10 333 {
621     c = -ESC_g;
622 ph10 345 break;
623     }
624 ph10 333
625     /* Handle the Perl-compatible cases */
626 ph10 345
627 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
628 nigel 93 {
629 ph10 171 const uschar *p;
630 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
631     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
632     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
633 ph10 171 {
634     c = -ESC_k;
635     break;
636 ph10 172 }
637 nigel 93 braced = TRUE;
638     ptr++;
639     }
640     else braced = FALSE;
641    
642 ph10 391 if (ptr[1] == CHAR_MINUS)
643 nigel 93 {
644     negated = TRUE;
645     ptr++;
646     }
647     else negated = FALSE;
648    
649     c = 0;
650     while ((digitab[ptr[1]] & ctype_digit) != 0)
651 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
652 ph10 220
653 ph10 333 if (c < 0) /* Integer overflow */
654 ph10 213 {
655     *errorcodeptr = ERR61;
656     break;
657 ph10 220 }
658 ph10 345
659 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
660 nigel 93 {
661     *errorcodeptr = ERR57;
662 ph10 213 break;
663 nigel 93 }
664 ph10 345
665 ph10 333 if (c == 0)
666     {
667     *errorcodeptr = ERR58;
668     break;
669 ph10 345 }
670 nigel 93
671     if (negated)
672     {
673     if (c > bracount)
674     {
675     *errorcodeptr = ERR15;
676 ph10 213 break;
677 nigel 93 }
678     c = bracount - (c - 1);
679     }
680    
681     c = -(ESC_REF + c);
682     break;
683    
684 nigel 77 /* The handling of escape sequences consisting of a string of digits
685     starting with one that is not zero is not straightforward. By experiment,
686     the way Perl works seems to be as follows:
687    
688     Outside a character class, the digits are read as a decimal number. If the
689     number is less than 10, or if there are that many previous extracting
690     left brackets, then it is a back reference. Otherwise, up to three octal
691     digits are read to form an escaped byte. Thus \123 is likely to be octal
692     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
693     value is greater than 377, the least significant 8 bits are taken. Inside a
694     character class, \ followed by a digit is always an octal number. */
695    
696 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
697     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
698 nigel 77
699     if (!isclass)
700     {
701     oldptr = ptr;
702 ph10 391 c -= CHAR_0;
703 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
704 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
705 ph10 333 if (c < 0) /* Integer overflow */
706 ph10 213 {
707     *errorcodeptr = ERR61;
708 ph10 220 break;
709     }
710 nigel 77 if (c < 10 || c <= bracount)
711     {
712     c = -(ESC_REF + c);
713     break;
714     }
715     ptr = oldptr; /* Put the pointer back and fall through */
716     }
717    
718     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
719     generates a binary zero byte and treats the digit as a following literal.
720     Thus we have to pull back the pointer by one. */
721    
722 ph10 391 if ((c = *ptr) >= CHAR_8)
723 nigel 77 {
724     ptr--;
725     c = 0;
726     break;
727     }
728    
729     /* \0 always starts an octal number, but we may drop through to here with a
730 nigel 91 larger first octal digit. The original code used just to take the least
731     significant 8 bits of octal numbers (I think this is what early Perls used
732     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
733     than 3 octal digits. */
734 nigel 77
735 ph10 391 case CHAR_0:
736     c -= CHAR_0;
737     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
738     c = c * 8 + *(++ptr) - CHAR_0;
739 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
740 nigel 77 break;
741    
742 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
743     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
744     treated as a data character. */
745 nigel 77
746 ph10 391 case CHAR_x:
747     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
748 nigel 77 {
749     const uschar *pt = ptr + 2;
750 nigel 87 int count = 0;
751    
752 nigel 77 c = 0;
753     while ((digitab[*pt] & ctype_xdigit) != 0)
754     {
755 nigel 87 register int cc = *pt++;
756 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
757 nigel 77 count++;
758 nigel 87
759 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
760     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
761     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
762 ph10 97 #else /* EBCDIC coding */
763 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
764     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
765 nigel 77 #endif
766     }
767 nigel 87
768 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
769 nigel 77 {
770 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
771 nigel 77 ptr = pt;
772     break;
773     }
774 nigel 87
775 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
776     recognize this construct; fall through to the normal \x handling. */
777     }
778    
779 nigel 87 /* Read just a single-byte hex-defined char */
780 nigel 77
781     c = 0;
782     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
783     {
784 ph10 391 int cc; /* Some compilers don't like */
785     cc = *(++ptr); /* ++ in initializers */
786     #ifndef EBCDIC /* ASCII/UTF-8 coding */
787     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
788     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
789 ph10 97 #else /* EBCDIC coding */
790 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
791     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
792 nigel 77 #endif
793     }
794     break;
795    
796 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
797     This coding is ASCII-specific, but then the whole concept of \cx is
798     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
799 nigel 77
800 ph10 391 case CHAR_c:
801 nigel 77 c = *(++ptr);
802     if (c == 0)
803     {
804     *errorcodeptr = ERR2;
805 ph10 213 break;
806 nigel 77 }
807    
808 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
809     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
810 nigel 77 c ^= 0x40;
811 ph10 97 #else /* EBCDIC coding */
812 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
813 nigel 77 c ^= 0xC0;
814     #endif
815     break;
816    
817     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
818 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
819     otherwise, for Perl compatibility, it is a literal. This code looks a bit
820     odd, but there used to be some cases other than the default, and there may
821     be again in future, so I haven't "optimized" it. */
822 nigel 77
823     default:
824     if ((options & PCRE_EXTRA) != 0) switch(c)
825     {
826     default:
827     *errorcodeptr = ERR3;
828     break;
829     }
830     break;
831     }
832     }
833    
834     *ptrptr = ptr;
835     return c;
836     }
837    
838    
839    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of Unicode property names. This is called only when PCRE is compiled with
support for Unicode properties. On entry, *ptrptr points at the P or p; on
exit it points at the last character of the escape sequence that was consumed.

The name is either a single character, or a string enclosed in braces that may
be preceded by a circumflex to indicate negation. A braced name that does not
fit in the local buffer, or that is not terminated by a closing brace, is
rejected.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
                 (with *errorcodeptr set to ERR46 for a malformed sequence or
                 ERR47 for an unknown property name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple form: exactly one character names the property */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: an optional ^ for negation, then the name itself. One
byte of the buffer is reserved for the terminating zero. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  /* Leaving the loop without seeing '}' means the name was too long */

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the property name table */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* The syntax was valid, but the name is not a known property */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The escape sequence itself was malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
929    
930    
931    
932    
933     /*************************************************
934     * Check for counted repeat *
935     *************************************************/
936    
937     /* This function is called when a '{' is encountered in a place where it might
938     start a quantifier. It looks ahead to see if it really is a quantifier or not.
939     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
940     where the ddds are digits.
941    
942     Arguments:
943     p pointer to the first char after '{'
944    
945     Returns: TRUE or FALSE
946     */
947    
948     static BOOL
949     is_counted_repeat(const uschar *p)
950     {
951     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
952     while ((digitab[*p] & ctype_digit) != 0) p++;
953 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
954 nigel 77
955 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
956     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
957 nigel 77
958     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
959     while ((digitab[*p] & ctype_digit) != 0) p++;
960    
961 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
962 nigel 77 }
963    
964    
965    
966     /*************************************************
967     * Read repeat counts *
968     *************************************************/
969    
970     /* Read an item of the form {n,m} and return the values. This is called only
971     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
972     so the syntax is guaranteed to be correct, but we need to check the values.
973    
974     Arguments:
975     p pointer to first char after '{'
976     minp pointer to int for min
977     maxp pointer to int for max
978     returned as -1 if no max
979     errorcodeptr points to error code variable
980    
981     Returns: pointer to '}' on success;
982     current ptr on error, with errorcodeptr set non-zero
983     */
984    
985     static const uschar *
986     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
987     {
988     int min = 0;
989     int max = -1;
990    
991 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
992     an integer overflow. */
993    
994 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
995 nigel 81 if (min < 0 || min > 65535)
996     {
997     *errorcodeptr = ERR5;
998     return p;
999     }
1000 nigel 77
1001 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1002     Also, max must not be less than min. */
1003    
1004 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1005 nigel 77 {
1006 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1007 nigel 77 {
1008     max = 0;
1009 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1010 nigel 81 if (max < 0 || max > 65535)
1011     {
1012     *errorcodeptr = ERR5;
1013     return p;
1014     }
1015 nigel 77 if (max < min)
1016     {
1017     *errorcodeptr = ERR4;
1018     return p;
1019     }
1020     }
1021     }
1022    
1023 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1024     '}'. */
1025 nigel 77
1026 nigel 81 *minp = min;
1027     *maxp = max;
1028 nigel 77 return p;
1029     }
1030    
1031    
1032    
1033     /*************************************************
1034 ph10 408 * Subroutine for finding forward reference *
1035 nigel 91 *************************************************/
1036    
1037 ph10 408 /* This recursive function is called only from find_parens() below. The
1038     top-level call starts at the beginning of the pattern. All other calls must
1039     start at a parenthesis. It scans along a pattern's text looking for capturing
1040 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1041     name it is given, it returns its number. Alternatively, if the name is NULL, it
1042 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1043     encountered, the name will be terminated by '>' because that is checked in the
1044 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1045 ph10 408 capturing group numbers - the (?| feature.
1046 nigel 91
1047     Arguments:
1048 ph10 408 ptrptr address of the current character pointer (updated)
1049 ph10 345 cd compile background data
1050 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1051     lorn name length, or subpattern number if name is NULL
1052     xmode TRUE if we are in /x mode
1053 ph10 411 count pointer to the current capturing subpattern number (updated)
1054 nigel 91
1055     Returns: the number of the named subpattern, or -1 if not found
1056     */
1057    
1058     static int
1059 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1060     BOOL xmode, int *count)
1061 nigel 91 {
1062 ph10 408 uschar *ptr = *ptrptr;
1063     int start_count = *count;
1064     int hwm_count = start_count;
1065     BOOL dup_parens = FALSE;
1066 nigel 93
1067 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1068 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1069    
1070     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1071     {
1072     if (ptr[1] == CHAR_QUESTION_MARK &&
1073 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1074 ph10 408 {
1075     ptr += 3;
1076 ph10 411 dup_parens = TRUE;
1077     }
1078 ph10 408
1079     /* Handle a normal, unnamed capturing parenthesis */
1080 ph10 411
1081 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1082     {
1083     *count += 1;
1084     if (name == NULL && *count == lorn) return *count;
1085 ph10 411 ptr++;
1086 ph10 408 }
1087    
1088     /* Handle a condition. If it is an assertion, just carry on so that it
1089     is processed as normal. If not, skip to the closing parenthesis of the
1090 ph10 411 condition (there can't be any nested parens. */
1091    
1092 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1093     {
1094 ph10 411 ptr += 2;
1095 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1096     {
1097     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1098 ph10 411 if (*ptr != 0) ptr++;
1099 ph10 408 }
1100 ph10 411 }
1101    
1102 ph10 408 /* We have either (? or (* and not a condition */
1103    
1104     else
1105 ph10 411 {
1106 ph10 408 ptr += 2;
1107     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1108    
1109     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1110 ph10 411
1111 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1112     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1113     {
1114     int term;
1115     const uschar *thisname;
1116     *count += 1;
1117     if (name == NULL && *count == lorn) return *count;
1118     term = *ptr++;
1119     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1120     thisname = ptr;
1121     while (*ptr != term) ptr++;
1122     if (name != NULL && lorn == ptr - thisname &&
1123     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1124     return *count;
1125 ph10 461 term++;
1126 ph10 411 }
1127 ph10 408 }
1128 ph10 411 }
1129 ph10 408
1130 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1131 ph10 408 bars. */
1132    
1133 nigel 91 for (; *ptr != 0; ptr++)
1134     {
1135 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1136    
1137 ph10 391 if (*ptr == CHAR_BACKSLASH)
1138 nigel 93 {
1139 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1140 ph10 391 if (*ptr == CHAR_Q) for (;;)
1141 nigel 93 {
1142 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1143 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1144 ph10 391 if (*(++ptr) == CHAR_E) break;
1145 nigel 93 }
1146     continue;
1147     }
1148    
1149 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1150     are handled for real. If the first character is '^', skip it. Also, if the
1151     first few characters (either before or after ^) are \Q\E or \E we skip them
1152 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1153 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1154 nigel 93
1155 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1156 nigel 93 {
1157 ph10 340 BOOL negate_class = FALSE;
1158     for (;;)
1159     {
1160 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1161 ph10 340 {
1162 ph10 438 if (ptr[2] == CHAR_E)
1163     ptr+= 2;
1164     else if (strncmp((const char *)ptr+2,
1165 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1166 ph10 438 ptr += 4;
1167 ph10 392 else
1168 ph10 391 break;
1169 ph10 340 }
1170 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1171 ph10 461 {
1172 ph10 340 negate_class = TRUE;
1173 ph10 438 ptr++;
1174 ph10 461 }
1175 ph10 340 else break;
1176     }
1177    
1178     /* If the next character is ']', it is a data character that must be
1179 ph10 341 skipped, except in JavaScript compatibility mode. */
1180 ph10 345
1181 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1182 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1183 ph10 345 ptr++;
1184    
1185 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1186 nigel 93 {
1187 ph10 220 if (*ptr == 0) return -1;
1188 ph10 391 if (*ptr == CHAR_BACKSLASH)
1189 nigel 93 {
1190 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1191 ph10 391 if (*ptr == CHAR_Q) for (;;)
1192 nigel 93 {
1193 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1194 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1195 ph10 391 if (*(++ptr) == CHAR_E) break;
1196 nigel 93 }
1197     continue;
1198     }
1199     }
1200     continue;
1201     }
1202    
1203     /* Skip comments in /x mode */
1204    
1205 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1206 nigel 93 {
1207 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1208 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1209 nigel 93 continue;
1210     }
1211    
1212 ph10 408 /* Check for the special metacharacters */
1213 ph10 411
1214 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1215 nigel 93 {
1216 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1217     if (rc > 0) return rc;
1218     if (*ptr == 0) goto FAIL_EXIT;
1219 nigel 93 }
1220 ph10 411
1221 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1222     {
1223     if (dup_parens && *count < hwm_count) *count = hwm_count;
1224     *ptrptr = ptr;
1225     return -1;
1226     }
1227 ph10 411
1228     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1229 ph10 408 {
1230     if (*count > hwm_count) hwm_count = *count;
1231     *count = start_count;
1232 ph10 411 }
1233 ph10 408 }
1234 nigel 93
1235 ph10 408 FAIL_EXIT:
1236     *ptrptr = ptr;
1237     return -1;
1238     }
1239 nigel 93
1240    
1241    
1242    
1243 ph10 408 /*************************************************
1244     * Find forward referenced subpattern *
1245     *************************************************/
1246 nigel 93
1247 ph10 408 /* This function scans along a pattern's text looking for capturing
1248     subpatterns, and counting them. If it finds a named pattern that matches the
1249     name it is given, it returns its number. Alternatively, if the name is NULL, it
1250     returns when it reaches a given numbered subpattern. This is used for forward
1251     references to subpatterns. We used to be able to start this scan from the
1252     current compiling point, using the current count value from cd->bracount, and
1253     do it all in a single loop, but the addition of the possibility of duplicate
1254     subpattern numbers means that we have to scan from the very start, in order to
1255     take account of such duplicates, and to use a recursive function to keep track
1256     of the different types of group.
1257    
1258     Arguments:
1259     cd compile background data
1260     name name to seek, or NULL if seeking a numbered subpattern
1261     lorn name length, or subpattern number if name is NULL
1262     xmode TRUE if we are in /x mode
1263    
1264     Returns: the number of the found subpattern, or -1 if not found
1265     */
1266    
1267     static int
1268     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1269     {
1270     uschar *ptr = (uschar *)cd->start_pattern;
1271     int count = 0;
1272     int rc;
1273    
1274     /* If the pattern does not start with an opening parenthesis, the first call
1275     to find_parens_sub() will scan right to the end (if necessary). However, if it
1276     does start with a parenthesis, find_parens_sub() will return when it hits the
1277     matching closing parens. That is why we have to have a loop. */
1278    
1279 ph10 411 for (;;)
1280     {
1281 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1282 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1283     }
1284    
1285 ph10 408 return rc;
1286 nigel 91 }
1287    
1288    
1289    
1290 ph10 408
1291 nigel 91 /*************************************************
1292 nigel 77 * Find first significant op code *
1293     *************************************************/
1294    
1295     /* This is called by several functions that scan a compiled expression looking
1296     for a fixed first character, or an anchoring op code etc. It skips over things
1297     that do not influence this. For some calls, a change of option is important.
1298     For some calls, it makes sense to skip negative forward and all backward
1299     assertions, and also the \b assertion; for others it does not.
1300    
1301     Arguments:
1302     code pointer to the start of the group
1303     options pointer to external options
1304     optbit the option bit whose changing is significant, or
1305     zero if none are
1306     skipassert TRUE if certain assertions are to be skipped
1307    
1308     Returns: pointer to the first significant opcode
1309     */
1310    
1311     static const uschar*
1312     first_significant_code(const uschar *code, int *options, int optbit,
1313     BOOL skipassert)
1314     {
1315     for (;;)
1316     {
1317     switch ((int)*code)
1318     {
1319     case OP_OPT:
1320     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1321     *options = (int)code[1];
1322     code += 2;
1323     break;
1324    
1325     case OP_ASSERT_NOT:
1326     case OP_ASSERTBACK:
1327     case OP_ASSERTBACK_NOT:
1328     if (!skipassert) return code;
1329     do code += GET(code, 1); while (*code == OP_ALT);
1330     code += _pcre_OP_lengths[*code];
1331     break;
1332    
1333     case OP_WORD_BOUNDARY:
1334     case OP_NOT_WORD_BOUNDARY:
1335     if (!skipassert) return code;
1336     /* Fall through */
1337    
1338     case OP_CALLOUT:
1339     case OP_CREF:
1340 ph10 459 case OP_NCREF:
1341 nigel 93 case OP_RREF:
1342 ph10 459 case OP_NRREF:
1343 nigel 93 case OP_DEF:
1344 nigel 77 code += _pcre_OP_lengths[*code];
1345     break;
1346    
1347     default:
1348     return code;
1349     }
1350     }
1351     /* Control never reaches here */
1352     }
1353    
1354    
1355    
1356    
1357     /*************************************************
1358 ph10 454 * Find the fixed length of a branch *
1359 nigel 77 *************************************************/
1360    
1361 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1362 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1363 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1364 ph10 454 temporarily terminated with OP_END when this function is called.
1365 nigel 77
1366 ph10 461 This function is called when a backward assertion is encountered, so that if it
1367     fails, the error message can point to the correct place in the pattern.
1368 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1369 ph10 461 because they can be forward references. We solve this by remembering this case
1370 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1371    
1372 nigel 77 Arguments:
1373     code points to the start of the pattern (the bracket)
1374     options the compiling options
1375 ph10 461 atend TRUE if called when the pattern is complete
1376     cd the "compile data" structure
1377 nigel 77
1378 ph10 461 Returns: the fixed length,
1379 ph10 454 or -1 if there is no fixed length,
1380 nigel 77 or -2 if \C was encountered
1381 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1382 nigel 77 */
1383    
1384     static int
1385 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1386 nigel 77 {
1387     int length = -1;
1388    
1389     register int branchlength = 0;
1390     register uschar *cc = code + 1 + LINK_SIZE;
1391    
1392     /* Scan along the opcodes for this branch. If we get to the end of the
1393     branch, check the length against that of the other branches. */
1394    
1395     for (;;)
1396     {
1397     int d;
1398 ph10 454 uschar *ce, *cs;
1399 nigel 77 register int op = *cc;
1400     switch (op)
1401     {
1402 nigel 93 case OP_CBRA:
1403 nigel 77 case OP_BRA:
1404     case OP_ONCE:
1405     case OP_COND:
1406 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1407 nigel 77 if (d < 0) return d;
1408     branchlength += d;
1409     do cc += GET(cc, 1); while (*cc == OP_ALT);
1410     cc += 1 + LINK_SIZE;
1411     break;
1412    
1413     /* Reached end of a branch; if it's a ket it is the end of a nested
1414     call. If it's ALT it is an alternation in a nested call. If it is
1415     END it's the end of the outer call. All can be handled by the same code. */
1416    
1417     case OP_ALT:
1418     case OP_KET:
1419     case OP_KETRMAX:
1420     case OP_KETRMIN:
1421     case OP_END:
1422     if (length < 0) length = branchlength;
1423     else if (length != branchlength) return -1;
1424     if (*cc != OP_ALT) return length;
1425     cc += 1 + LINK_SIZE;
1426     branchlength = 0;
1427     break;
1428 ph10 461
1429 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1430     be OK. If the subroutine is a forward reference, we can't deal with
1431     it until the end of the pattern, so return -3. */
1432 ph10 461
1433 ph10 454 case OP_RECURSE:
1434     if (!atend) return -3;
1435     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1436     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1437     if (cc > cs && cc < ce) return -1; /* Recursion */
1438     d = find_fixedlength(cs + 2, options, atend, cd);
1439 ph10 461 if (d < 0) return d;
1440 ph10 454 branchlength += d;
1441     cc += 1 + LINK_SIZE;
1442 ph10 461 break;
1443 nigel 77
1444     /* Skip over assertive subpatterns */
1445    
1446     case OP_ASSERT:
1447     case OP_ASSERT_NOT:
1448     case OP_ASSERTBACK:
1449     case OP_ASSERTBACK_NOT:
1450     do cc += GET(cc, 1); while (*cc == OP_ALT);
1451     /* Fall through */
1452    
1453     /* Skip over things that don't match chars */
1454    
1455     case OP_REVERSE:
1456     case OP_CREF:
1457 ph10 459 case OP_NCREF:
1458 nigel 93 case OP_RREF:
1459 ph10 459 case OP_NRREF:
1460 nigel 93 case OP_DEF:
1461 nigel 77 case OP_OPT:
1462     case OP_CALLOUT:
1463     case OP_SOD:
1464     case OP_SOM:
1465 ph10 500 case OP_SET_SOM:
1466 nigel 77 case OP_EOD:
1467     case OP_EODN:
1468     case OP_CIRC:
1469     case OP_DOLL:
1470     case OP_NOT_WORD_BOUNDARY:
1471     case OP_WORD_BOUNDARY:
1472     cc += _pcre_OP_lengths[*cc];
1473     break;
1474    
1475     /* Handle literal characters */
1476    
1477     case OP_CHAR:
1478     case OP_CHARNC:
1479 nigel 91 case OP_NOT:
1480 nigel 77 branchlength++;
1481     cc += 2;
1482     #ifdef SUPPORT_UTF8
1483 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1484 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1485 nigel 77 #endif
1486     break;
1487    
1488     /* Handle exact repetitions. The count is already in characters, but we
1489     need to skip over a multibyte character in UTF8 mode. */
1490    
1491     case OP_EXACT:
1492     branchlength += GET2(cc,1);
1493     cc += 4;
1494     #ifdef SUPPORT_UTF8
1495 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1496 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1497 nigel 77 #endif
1498     break;
1499    
1500     case OP_TYPEEXACT:
1501     branchlength += GET2(cc,1);
1502 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1503 nigel 77 cc += 4;
1504     break;
1505    
1506     /* Handle single-char matchers */
1507    
1508     case OP_PROP:
1509     case OP_NOTPROP:
1510 nigel 87 cc += 2;
1511 nigel 77 /* Fall through */
1512    
1513     case OP_NOT_DIGIT:
1514     case OP_DIGIT:
1515     case OP_NOT_WHITESPACE:
1516     case OP_WHITESPACE:
1517     case OP_NOT_WORDCHAR:
1518     case OP_WORDCHAR:
1519     case OP_ANY:
1520 ph10 342 case OP_ALLANY:
1521 nigel 77 branchlength++;
1522     cc++;
1523     break;
1524    
1525     /* The single-byte matcher isn't allowed */
1526    
1527     case OP_ANYBYTE:
1528     return -2;
1529    
1530     /* Check a class for variable quantification */
1531    
1532     #ifdef SUPPORT_UTF8
1533     case OP_XCLASS:
1534     cc += GET(cc, 1) - 33;
1535     /* Fall through */
1536     #endif
1537    
1538     case OP_CLASS:
1539     case OP_NCLASS:
1540     cc += 33;
1541    
1542     switch (*cc)
1543     {
1544     case OP_CRSTAR:
1545     case OP_CRMINSTAR:
1546     case OP_CRQUERY:
1547     case OP_CRMINQUERY:
1548     return -1;
1549    
1550     case OP_CRRANGE:
1551     case OP_CRMINRANGE:
1552     if (GET2(cc,1) != GET2(cc,3)) return -1;
1553     branchlength += GET2(cc,1);
1554     cc += 5;
1555     break;
1556    
1557     default:
1558     branchlength++;
1559     }
1560     break;
1561    
1562     /* Anything else is variable length */
1563    
1564     default:
1565     return -1;
1566     }
1567     }
1568     /* Control never gets here */
1569     }
1570    
1571    
1572    
1573    
1574     /*************************************************
1575 ph10 454 * Scan compiled regex for specific bracket *
1576 nigel 77 *************************************************/
1577    
1578     /* This little function scans through a compiled pattern until it finds a
1579 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1580 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1581     so that it can be called from pcre_study() when finding the minimum matching
1582 ph10 455 length.
1583 nigel 77
1584     Arguments:
1585     code points to start of expression
1586     utf8 TRUE in UTF-8 mode
1587 ph10 454 number the required bracket number or negative to find a lookbehind
1588 nigel 77
1589     Returns: pointer to the opcode for the bracket, or NULL if not found
1590     */
1591    
1592 ph10 455 const uschar *
1593     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1594 nigel 77 {
1595     for (;;)
1596     {
1597     register int c = *code;
1598     if (c == OP_END) return NULL;
1599 nigel 91
1600     /* XCLASS is used for classes that cannot be represented just by a bit
1601     map. This includes negated single high-valued characters. The length in
1602     the table is zero; the actual length is stored in the compiled code. */
1603    
1604     if (c == OP_XCLASS) code += GET(code, 1);
1605 ph10 461
1606 ph10 454 /* Handle recursion */
1607 ph10 461
1608 ph10 454 else if (c == OP_REVERSE)
1609     {
1610 ph10 461 if (number < 0) return (uschar *)code;
1611 ph10 454 code += _pcre_OP_lengths[c];
1612     }
1613 nigel 91
1614 nigel 93 /* Handle capturing bracket */
1615 nigel 91
1616 nigel 93 else if (c == OP_CBRA)
1617 nigel 77 {
1618 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1619 nigel 77 if (n == number) return (uschar *)code;
1620 nigel 93 code += _pcre_OP_lengths[c];
1621 nigel 77 }
1622 nigel 91
1623 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1624     repeated character types, we have to test for \p and \P, which have an extra
1625 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1626 ph10 510 must add in its length. */
1627 nigel 91
1628 nigel 77 else
1629     {
1630 ph10 218 switch(c)
1631     {
1632     case OP_TYPESTAR:
1633     case OP_TYPEMINSTAR:
1634     case OP_TYPEPLUS:
1635     case OP_TYPEMINPLUS:
1636     case OP_TYPEQUERY:
1637     case OP_TYPEMINQUERY:
1638     case OP_TYPEPOSSTAR:
1639     case OP_TYPEPOSPLUS:
1640     case OP_TYPEPOSQUERY:
1641     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1642 ph10 220 break;
1643 ph10 221
1644     case OP_TYPEUPTO:
1645     case OP_TYPEMINUPTO:
1646     case OP_TYPEEXACT:
1647     case OP_TYPEPOSUPTO:
1648     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1649     break;
1650 ph10 512
1651 ph10 510 case OP_MARK:
1652     case OP_PRUNE_ARG:
1653     case OP_SKIP_ARG:
1654     case OP_THEN_ARG:
1655     code += code[1];
1656 ph10 512 break;
1657 ph10 220 }
1658    
1659 ph10 218 /* Add in the fixed length from the table */
1660 ph10 220
1661 nigel 77 code += _pcre_OP_lengths[c];
1662 ph10 220
1663 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1664     a multi-byte character. The length in the table is a minimum, so we have to
1665     arrange to skip the extra bytes. */
1666 ph10 220
1667 ph10 107 #ifdef SUPPORT_UTF8
1668 nigel 77 if (utf8) switch(c)
1669     {
1670     case OP_CHAR:
1671     case OP_CHARNC:
1672     case OP_EXACT:
1673     case OP_UPTO:
1674     case OP_MINUPTO:
1675 nigel 93 case OP_POSUPTO:
1676 nigel 77 case OP_STAR:
1677     case OP_MINSTAR:
1678 nigel 93 case OP_POSSTAR:
1679 nigel 77 case OP_PLUS:
1680     case OP_MINPLUS:
1681 nigel 93 case OP_POSPLUS:
1682 nigel 77 case OP_QUERY:
1683     case OP_MINQUERY:
1684 nigel 93 case OP_POSQUERY:
1685     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1686 nigel 77 break;
1687     }
1688 ph10 369 #else
1689     (void)(utf8); /* Keep compiler happy by referencing function argument */
1690 ph10 111 #endif
1691 nigel 77 }
1692     }
1693     }
1694    
1695    
1696    
1697     /*************************************************
1698     * Scan compiled regex for recursion reference *
1699     *************************************************/
1700    
1701     /* This little function scans through a compiled pattern until it finds an
1702     instance of OP_RECURSE.
1703    
1704     Arguments:
1705     code points to start of expression
1706     utf8 TRUE in UTF-8 mode
1707    
1708     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1709     */
1710    
1711     static const uschar *
1712     find_recurse(const uschar *code, BOOL utf8)
1713     {
1714     for (;;)
1715     {
1716     register int c = *code;
1717     if (c == OP_END) return NULL;
1718 nigel 91 if (c == OP_RECURSE) return code;
1719 ph10 220
1720 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1721     map. This includes negated single high-valued characters. The length in
1722     the table is zero; the actual length is stored in the compiled code. */
1723    
1724     if (c == OP_XCLASS) code += GET(code, 1);
1725    
1726 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1727     repeated character types, we have to test for \p and \P, which have an extra
1728 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1729 ph10 510 must add in its length. */
1730 nigel 91
1731 nigel 77 else
1732     {
1733 ph10 218 switch(c)
1734     {
1735     case OP_TYPESTAR:
1736     case OP_TYPEMINSTAR:
1737     case OP_TYPEPLUS:
1738     case OP_TYPEMINPLUS:
1739     case OP_TYPEQUERY:
1740     case OP_TYPEMINQUERY:
1741     case OP_TYPEPOSSTAR:
1742     case OP_TYPEPOSPLUS:
1743     case OP_TYPEPOSQUERY:
1744     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1745 ph10 220 break;
1746 ph10 221
1747     case OP_TYPEPOSUPTO:
1748     case OP_TYPEUPTO:
1749     case OP_TYPEMINUPTO:
1750     case OP_TYPEEXACT:
1751     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1752     break;
1753 ph10 512
1754 ph10 510 case OP_MARK:
1755     case OP_PRUNE_ARG:
1756     case OP_SKIP_ARG:
1757     case OP_THEN_ARG:
1758     code += code[1];
1759 ph10 512 break;
1760 ph10 220 }
1761    
1762 ph10 218 /* Add in the fixed length from the table */
1763    
1764 nigel 77 code += _pcre_OP_lengths[c];
1765 ph10 220
1766 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1767     by a multi-byte character. The length in the table is a minimum, so we have
1768     to arrange to skip the extra bytes. */
1769 ph10 220
1770 ph10 107 #ifdef SUPPORT_UTF8
1771 nigel 77 if (utf8) switch(c)
1772     {
1773     case OP_CHAR:
1774     case OP_CHARNC:
1775     case OP_EXACT:
1776     case OP_UPTO:
1777     case OP_MINUPTO:
1778 nigel 93 case OP_POSUPTO:
1779 nigel 77 case OP_STAR:
1780     case OP_MINSTAR:
1781 nigel 93 case OP_POSSTAR:
1782 nigel 77 case OP_PLUS:
1783     case OP_MINPLUS:
1784 nigel 93 case OP_POSPLUS:
1785 nigel 77 case OP_QUERY:
1786     case OP_MINQUERY:
1787 nigel 93 case OP_POSQUERY:
1788     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1789 nigel 77 break;
1790     }
1791 ph10 369 #else
1792     (void)(utf8); /* Keep compiler happy by referencing function argument */
1793 ph10 111 #endif
1794 nigel 77 }
1795     }
1796     }
1797    
1798    
1799    
1800     /*************************************************
1801     * Scan compiled branch for non-emptiness *
1802     *************************************************/
1803    
1804     /* This function scans through a branch of a compiled pattern to see whether it
1805 nigel 93 can match the empty string or not. It is called from could_be_empty()
1806     below and from compile_branch() when checking for an unlimited repeat of a
1807     group that can match nothing. Note that first_significant_code() skips over
1808 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1809     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1810     bracket whose current branch will already have been scanned.
1811 nigel 77
1812     Arguments:
1813     code points to start of search
1814     endcode points to where to stop
1815     utf8 TRUE if in UTF8 mode
1816 ph10 503 cd contains pointers to tables etc.
1817 nigel 77
1818     Returns: TRUE if what is matched could be empty
1819     */
1820    
1821     static BOOL
1822 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1823     compile_data *cd)
1824 nigel 77 {
1825     register int c;
1826 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1827 nigel 77 code < endcode;
1828     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1829     {
1830     const uschar *ccode;
1831    
1832     c = *code;
1833 ph10 507
1834 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1835 ph10 282 first_significant_code() with a TRUE final argument. */
1836 ph10 286
1837 ph10 282 if (c == OP_ASSERT)
1838 ph10 286 {
1839 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1840     c = *code;
1841     continue;
1842 ph10 286 }
1843 ph10 172
1844 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1845 nigel 77
1846 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1847 ph10 170 {
1848 ph10 172 code += _pcre_OP_lengths[c];
1849 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1850     c = *code;
1851     continue;
1852     }
1853 ph10 507
1854 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1855     implies a subroutine call, we can scan it. */
1856 ph10 507
1857 ph10 503 if (c == OP_RECURSE)
1858     {
1859 ph10 507 BOOL empty_branch = FALSE;
1860 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1861     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1862     do
1863     {
1864 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1865     {
1866     empty_branch = TRUE;
1867 ph10 507 break;
1868     }
1869 ph10 503 scode += GET(scode, 1);
1870     }
1871     while (*scode == OP_ALT);
1872 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1873 ph10 503 continue;
1874 ph10 507 }
1875 ph10 170
1876     /* For other groups, scan the branches. */
1877 ph10 172
1878 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1879 nigel 77 {
1880     BOOL empty_branch;
1881     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1882 ph10 406
1883     /* If a conditional group has only one branch, there is a second, implied,
1884 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1885     Otherwise, scan the individual branches of the group. */
1886 ph10 406
1887 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1888 nigel 77 code += GET(code, 1);
1889 ph10 395 else
1890 ph10 406 {
1891 ph10 395 empty_branch = FALSE;
1892     do
1893     {
1894 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1895 ph10 395 empty_branch = TRUE;
1896     code += GET(code, 1);
1897     }
1898     while (*code == OP_ALT);
1899     if (!empty_branch) return FALSE; /* All branches are non-empty */
1900 nigel 77 }
1901 ph10 406
1902 ph10 172 c = *code;
1903 nigel 93 continue;
1904 nigel 77 }
1905    
1906 nigel 93 /* Handle the other opcodes */
1907    
1908     switch (c)
1909 nigel 77 {
1910 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1911     cannot be represented just by a bit map. This includes negated single
1912     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1913 ph10 220 actual length is stored in the compiled code, so we must update "code"
1914 ph10 216 here. */
1915 nigel 77
1916     #ifdef SUPPORT_UTF8
1917     case OP_XCLASS:
1918 ph10 216 ccode = code += GET(code, 1);
1919 nigel 77 goto CHECK_CLASS_REPEAT;
1920     #endif
1921    
1922     case OP_CLASS:
1923     case OP_NCLASS:
1924     ccode = code + 33;
1925    
1926     #ifdef SUPPORT_UTF8
1927     CHECK_CLASS_REPEAT:
1928     #endif
1929    
1930     switch (*ccode)
1931     {
1932     case OP_CRSTAR: /* These could be empty; continue */
1933     case OP_CRMINSTAR:
1934     case OP_CRQUERY:
1935     case OP_CRMINQUERY:
1936     break;
1937    
1938     default: /* Non-repeat => class must match */
1939     case OP_CRPLUS: /* These repeats aren't empty */
1940     case OP_CRMINPLUS:
1941     return FALSE;
1942    
1943     case OP_CRRANGE:
1944     case OP_CRMINRANGE:
1945     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1946     break;
1947     }
1948     break;
1949    
1950     /* Opcodes that must match a character */
1951    
1952     case OP_PROP:
1953     case OP_NOTPROP:
1954     case OP_EXTUNI:
1955     case OP_NOT_DIGIT:
1956     case OP_DIGIT:
1957     case OP_NOT_WHITESPACE:
1958     case OP_WHITESPACE:
1959     case OP_NOT_WORDCHAR:
1960     case OP_WORDCHAR:
1961     case OP_ANY:
1962 ph10 345 case OP_ALLANY:
1963 nigel 77 case OP_ANYBYTE:
1964     case OP_CHAR:
1965     case OP_CHARNC:
1966     case OP_NOT:
1967     case OP_PLUS:
1968     case OP_MINPLUS:
1969 nigel 93 case OP_POSPLUS:
1970 nigel 77 case OP_EXACT:
1971     case OP_NOTPLUS:
1972     case OP_NOTMINPLUS:
1973 nigel 93 case OP_NOTPOSPLUS:
1974 nigel 77 case OP_NOTEXACT:
1975     case OP_TYPEPLUS:
1976     case OP_TYPEMINPLUS:
1977 nigel 93 case OP_TYPEPOSPLUS:
1978 nigel 77 case OP_TYPEEXACT:
1979     return FALSE;
1980 ph10 227
1981     /* These are going to continue, as they may be empty, but we have to
1982     fudge the length for the \p and \P cases. */
1983    
1984 ph10 224 case OP_TYPESTAR:
1985     case OP_TYPEMINSTAR:
1986     case OP_TYPEPOSSTAR:
1987     case OP_TYPEQUERY:
1988     case OP_TYPEMINQUERY:
1989     case OP_TYPEPOSQUERY:
1990     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1991 ph10 227 break;
1992    
1993 ph10 224 /* Same for these */
1994 ph10 227
1995 ph10 224 case OP_TYPEUPTO:
1996     case OP_TYPEMINUPTO:
1997     case OP_TYPEPOSUPTO:
1998     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1999     break;
2000 nigel 77
2001     /* End of branch */
2002    
2003     case OP_KET:
2004     case OP_KETRMAX:
2005     case OP_KETRMIN:
2006     case OP_ALT:
2007     return TRUE;
2008    
2009 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2010     MINUPTO, and POSUPTO may be followed by a multibyte character */
2011 nigel 77
2012     #ifdef SUPPORT_UTF8
2013     case OP_STAR:
2014     case OP_MINSTAR:
2015 nigel 93 case OP_POSSTAR:
2016 nigel 77 case OP_QUERY:
2017     case OP_MINQUERY:
2018 nigel 93 case OP_POSQUERY:
2019 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2020     break;
2021 ph10 461
2022 nigel 77 case OP_UPTO:
2023     case OP_MINUPTO:
2024 nigel 93 case OP_POSUPTO:
2025 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2026 nigel 77 break;
2027     #endif
2028 ph10 503
2029 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2030     string. */
2031    
2032     case OP_MARK:
2033     case OP_PRUNE_ARG:
2034     case OP_SKIP_ARG:
2035     case OP_THEN_ARG:
2036     code += code[1];
2037 ph10 512 break;
2038 ph10 510
2039 ph10 503 /* None of the remaining opcodes are required to match a character. */
2040 ph10 507
2041 ph10 503 default:
2042 ph10 507 break;
2043 nigel 77 }
2044     }
2045    
2046     return TRUE;
2047     }
2048    
2049    
2050    
2051     /*************************************************
2052     * Scan compiled regex for non-emptiness *
2053     *************************************************/
2054    
2055     /* This function is called to check for left recursive calls. We want to check
2056     the current branch of the current pattern to see if it could match the empty
2057     string. If it could, we must look outwards for branches at other levels,
2058     stopping when we pass beyond the bracket which is the subject of the recursion.
2059    
2060     Arguments:
2061     code points to start of the recursion
2062     endcode points to where to stop (current RECURSE item)
2063     bcptr points to the chain of current (unclosed) branch starts
2064     utf8 TRUE if in UTF-8 mode
2065 ph10 507 cd pointers to tables etc
2066 nigel 77
2067     Returns: TRUE if what is matched could be empty
2068     */
2069    
2070     static BOOL
2071     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2072 ph10 503 BOOL utf8, compile_data *cd)
2073 nigel 77 {
2074 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2075 nigel 77 {
2076 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2077 ph10 475 return FALSE;
2078 nigel 77 bcptr = bcptr->outer;
2079     }
2080     return TRUE;
2081     }
2082    
2083    
2084    
2085     /*************************************************
2086     * Check for POSIX class syntax *
2087     *************************************************/
2088    
2089     /* This function is called when the sequence "[:" or "[." or "[=" is
2090 ph10 295 encountered in a character class. It checks whether this is followed by a
2091 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2092 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2093 nigel 77
2094 ph10 298 Originally, this function only recognized a sequence of letters between the
2095     terminators, but it seems that Perl recognizes any sequence of characters,
2096     though of course unknown POSIX names are subsequently rejected. Perl gives an
2097     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2098     didn't consider this to be a POSIX class. Likewise for [:1234:].
2099 ph10 295
2100 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2101     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2102     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2103     below handles the special case of \], but does not try to do any other escape
2104     processing. This makes it different from Perl for cases such as [:l\ower:]
2105 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2106 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2107 ph10 295 I think.
2108    
2109     Arguments:
2110 nigel 77 ptr pointer to the initial [
2111     endptr where to return the end pointer
2112    
2113     Returns: TRUE or FALSE
2114     */
2115    
2116     static BOOL
2117 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2118 nigel 77 {
2119     int terminator; /* Don't combine these lines; the Solaris cc */
2120     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2121 ph10 295 for (++ptr; *ptr != 0; ptr++)
2122 nigel 77 {
2123 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2124 ph10 298 {
2125 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2126     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2127 ph10 295 {
2128     *endptr = ptr;
2129     return TRUE;
2130 ph10 298 }
2131     }
2132     }
2133 nigel 77 return FALSE;
2134     }
2135    
2136    
2137    
2138    
2139     /*************************************************
2140     * Check POSIX class name *
2141     *************************************************/
2142    
2143     /* This function is called to check the name given in a POSIX-style class entry
2144     such as [:alnum:].
2145    
2146     Arguments:
2147     ptr points to the first letter
2148     len the length of the name
2149    
2150     Returns: a value representing the name, or -1 if unknown
2151     */
2152    
2153     static int
2154     check_posix_name(const uschar *ptr, int len)
2155     {
2156 ph10 240 const char *pn = posix_names;
2157 nigel 77 register int yield = 0;
2158     while (posix_name_lengths[yield] != 0)
2159     {
2160     if (len == posix_name_lengths[yield] &&
2161 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2162 ph10 243 pn += posix_name_lengths[yield] + 1;
2163 nigel 77 yield++;
2164     }
2165     return -1;
2166     }
2167    
2168    
2169     /*************************************************
2170     * Adjust OP_RECURSE items in repeated group *
2171     *************************************************/
2172    
2173     /* OP_RECURSE items contain an offset from the start of the regex to the group
2174     that is referenced. This means that groups can be replicated for fixed
2175     repetition simply by copying (because the recursion is allowed to refer to
2176     earlier groups that are outside the current group). However, when a group is
2177 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2178     inserted before it, after it has been compiled. This means that any OP_RECURSE
2179     items within it that refer to the group itself or any contained groups have to
2180     have their offsets adjusted. That is one of the jobs of this function. Before it
2181     is called, the partially compiled regex must be temporarily terminated with
2182     OP_END.
2183 nigel 77
2184 nigel 93 This function has been extended with the possibility of forward references for
2185     recursions and subroutine calls. It must also check the list of such references
2186     for the group we are dealing with. If it finds that one of the recursions in
2187     the current group is on this list, it adjusts the offset in the list, not the
2188     value in the reference (which is a group number).
2189    
2190 nigel 77 Arguments:
2191     group points to the start of the group
2192     adjust the amount by which the group is to be moved
2193     utf8 TRUE in UTF-8 mode
2194     cd contains pointers to tables etc.
2195 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2196 nigel 77
2197     Returns: nothing
2198     */
2199    
2200     static void
2201 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2202     uschar *save_hwm)
2203 nigel 77 {
2204     uschar *ptr = group;
2205 ph10 224
2206 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2207     {
2208 nigel 93 int offset;
2209     uschar *hc;
2210    
2211     /* See if this recursion is on the forward reference list. If so, adjust the
2212     reference. */
2213 ph10 345
2214 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2215     {
2216     offset = GET(hc, 0);
2217     if (cd->start_code + offset == ptr + 1)
2218     {
2219     PUT(hc, 0, offset + adjust);
2220     break;
2221     }
2222     }
2223    
2224     /* Otherwise, adjust the recursion offset if it's after the start of this
2225     group. */
2226    
2227     if (hc >= cd->hwm)
2228     {
2229     offset = GET(ptr, 1);
2230     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2231     }
2232    
2233 nigel 77 ptr += 1 + LINK_SIZE;
2234     }
2235     }
2236    
2237    
2238    
2239     /*************************************************
2240     * Insert an automatic callout point *
2241     *************************************************/
2242    
2243     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2244     callout points before each pattern item.
2245    
2246     Arguments:
2247     code current code pointer
2248     ptr current pattern pointer
2249     cd pointers to tables etc
2250    
2251     Returns: new code pointer
2252     */
2253    
2254     static uschar *
2255     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2256     {
2257     *code++ = OP_CALLOUT;
2258     *code++ = 255;
2259     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2260     PUT(code, LINK_SIZE, 0); /* Default length */
2261     return code + 2*LINK_SIZE;
2262     }
2263    
2264    
2265    
2266     /*************************************************
2267     * Complete a callout item *
2268     *************************************************/
2269    
2270     /* A callout item contains the length of the next item in the pattern, which
2271     we can't fill in till after we have reached the relevant point. This is used
2272     for both automatic and manual callouts.
2273    
2274     Arguments:
2275     previous_callout points to previous callout item
2276     ptr current pattern pointer
2277     cd pointers to tables etc
2278    
2279     Returns: nothing
2280     */
2281    
2282     static void
2283     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2284     {
2285     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2286     PUT(previous_callout, 2 + LINK_SIZE, length);
2287     }
2288    
2289    
2290    
2291     #ifdef SUPPORT_UCP
2292     /*************************************************
2293     * Get othercase range *
2294     *************************************************/
2295    
2296     /* This function is passed the start and end of a class range, in UTF-8 mode
2297     with UCP support. It searches up the characters, looking for internal ranges of
2298     characters in the "other" case. Each call returns the next one, updating the
2299     start address.
2300    
2301     Arguments:
2302     cptr points to starting character value; updated
2303     d end value
2304     ocptr where to put start of othercase range
2305     odptr where to put end of othercase range
2306    
2307     Yield: TRUE when range returned; FALSE when no more
2308     */
2309    
2310     static BOOL
2311 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2312     unsigned int *odptr)
2313 nigel 77 {
2314 nigel 93 unsigned int c, othercase, next;
2315 nigel 77
2316     for (c = *cptr; c <= d; c++)
2317 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2318 nigel 77
2319     if (c > d) return FALSE;
2320    
2321     *ocptr = othercase;
2322     next = othercase + 1;
2323    
2324     for (++c; c <= d; c++)
2325     {
2326 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2327 nigel 77 next++;
2328     }
2329    
2330     *odptr = next - 1;
2331     *cptr = c;
2332    
2333     return TRUE;
2334     }
2335     #endif /* SUPPORT_UCP */
2336    
2337    
2338 nigel 93
2339 nigel 77 /*************************************************
2340 nigel 93 * Check if auto-possessifying is possible *
2341     *************************************************/
2342    
2343     /* This function is called for unlimited repeats of certain items, to see
2344     whether the next thing could possibly match the repeated item. If not, it makes
2345     sense to automatically possessify the repeated item.
2346    
2347     Arguments:
2348     op_code the repeated op code
2349     item data for this item, depends on the opcode
2350     utf8 TRUE in UTF-8 mode
2351     utf8_char used for utf8 character bytes, NULL if not relevant
2352     ptr next character in pattern
2353     options options bits
2354     cd contains pointers to tables etc.
2355    
2356     Returns: TRUE if possessifying is wanted
2357     */
2358    
2359     static BOOL
2360     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2361     const uschar *ptr, int options, compile_data *cd)
2362     {
2363     int next;
2364    
2365     /* Skip whitespace and comments in extended mode */
2366    
2367     if ((options & PCRE_EXTENDED) != 0)
2368     {
2369     for (;;)
2370     {
2371     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2372 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2373 nigel 93 {
2374     while (*(++ptr) != 0)
2375     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2376     }
2377     else break;
2378     }
2379     }
2380    
2381     /* If the next item is one that we can handle, get its value. A non-negative
2382     value is a character, a negative value is an escape value. */
2383    
2384 ph10 391 if (*ptr == CHAR_BACKSLASH)
2385 nigel 93 {
2386     int temperrorcode = 0;
2387     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2388     if (temperrorcode != 0) return FALSE;
2389     ptr++; /* Point after the escape sequence */
2390     }
2391    
2392     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2393     {
2394     #ifdef SUPPORT_UTF8
2395     if (utf8) { GETCHARINC(next, ptr); } else
2396     #endif
2397     next = *ptr++;
2398     }
2399    
2400     else return FALSE;
2401    
2402     /* Skip whitespace and comments in extended mode */
2403    
2404     if ((options & PCRE_EXTENDED) != 0)
2405     {
2406     for (;;)
2407     {
2408     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2409 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2410 nigel 93 {
2411     while (*(++ptr) != 0)
2412     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2413     }
2414     else break;
2415     }
2416     }
2417    
2418     /* If the next thing is itself optional, we have to give up. */
2419    
2420 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2421 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2422     return FALSE;
2423 nigel 93
2424     /* Now compare the next item with the previous opcode. If the previous is a
2425     positive single character match, "item" either contains the character or, if
2426     "item" is greater than 127 in utf8 mode, the character's bytes are in
2427     utf8_char. */
2428    
2429    
2430     /* Handle cases when the next item is a character. */
2431    
2432     if (next >= 0) switch(op_code)
2433     {
2434     case OP_CHAR:
2435     #ifdef SUPPORT_UTF8
2436     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2437 ph10 369 #else
2438     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2439 nigel 93 #endif
2440     return item != next;
2441    
2442     /* For CHARNC (caseless character) we must check the other case. If we have
2443     Unicode property support, we can use it to test the other case of
2444     high-valued characters. */
2445    
2446     case OP_CHARNC:
2447     #ifdef SUPPORT_UTF8
2448     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2449     #endif
2450     if (item == next) return FALSE;
2451     #ifdef SUPPORT_UTF8
2452     if (utf8)
2453     {
2454     unsigned int othercase;
2455     if (next < 128) othercase = cd->fcc[next]; else
2456     #ifdef SUPPORT_UCP
2457 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2458 nigel 93 #else
2459     othercase = NOTACHAR;
2460     #endif
2461     return (unsigned int)item != othercase;
2462     }
2463     else
2464     #endif /* SUPPORT_UTF8 */
2465     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2466    
2467     /* For OP_NOT, "item" must be a single-byte character. */
2468    
2469     case OP_NOT:
2470     if (item == next) return TRUE;
2471     if ((options & PCRE_CASELESS) == 0) return FALSE;
2472     #ifdef SUPPORT_UTF8
2473     if (utf8)
2474     {
2475     unsigned int othercase;
2476     if (next < 128) othercase = cd->fcc[next]; else
2477     #ifdef SUPPORT_UCP
2478 ph10 349 othercase = UCD_OTHERCASE(next);
2479 nigel 93 #else
2480     othercase = NOTACHAR;
2481     #endif
2482     return (unsigned int)item == othercase;
2483     }
2484     else
2485     #endif /* SUPPORT_UTF8 */
2486     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2487    
2488     case OP_DIGIT:
2489     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2490    
2491     case OP_NOT_DIGIT:
2492     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2493    
2494     case OP_WHITESPACE:
2495     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2496    
2497     case OP_NOT_WHITESPACE:
2498     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2499    
2500     case OP_WORDCHAR:
2501     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2502    
2503     case OP_NOT_WORDCHAR:
2504     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2505    
2506 ph10 180 case OP_HSPACE:
2507     case OP_NOT_HSPACE:
2508     switch(next)
2509     {
2510     case 0x09:
2511     case 0x20:
2512     case 0xa0:
2513     case 0x1680:
2514     case 0x180e:
2515     case 0x2000:
2516     case 0x2001:
2517     case 0x2002:
2518     case 0x2003:
2519     case 0x2004:
2520     case 0x2005:
2521     case 0x2006:
2522     case 0x2007:
2523     case 0x2008:
2524     case 0x2009:
2525     case 0x200A:
2526     case 0x202f:
2527     case 0x205f:
2528     case 0x3000:
2529     return op_code != OP_HSPACE;
2530     default:
2531     return op_code == OP_HSPACE;
2532     }
2533    
2534     case OP_VSPACE:
2535     case OP_NOT_VSPACE:
2536     switch(next)
2537     {
2538     case 0x0a:
2539     case 0x0b:
2540     case 0x0c:
2541     case 0x0d:
2542     case 0x85:
2543     case 0x2028:
2544     case 0x2029:
2545     return op_code != OP_VSPACE;
2546     default:
2547     return op_code == OP_VSPACE;
2548     }
2549    
2550 nigel 93 default:
2551     return FALSE;
2552     }
2553    
2554    
2555     /* Handle the case when the next item is \d, \s, etc. */
2556    
2557     switch(op_code)
2558     {
2559     case OP_CHAR:
2560     case OP_CHARNC:
2561     #ifdef SUPPORT_UTF8
2562     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2563     #endif
2564     switch(-next)
2565     {
2566     case ESC_d:
2567     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2568    
2569     case ESC_D:
2570     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2571    
2572     case ESC_s:
2573     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2574    
2575     case ESC_S:
2576     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2577    
2578     case ESC_w:
2579     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2580    
2581     case ESC_W:
2582     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2583 ph10 182
2584 ph10 180 case ESC_h:
2585     case ESC_H:
2586     switch(item)
2587     {
2588     case 0x09:
2589     case 0x20:
2590     case 0xa0:
2591     case 0x1680:
2592     case 0x180e:
2593     case 0x2000:
2594     case 0x2001:
2595     case 0x2002:
2596     case 0x2003:
2597     case 0x2004:
2598     case 0x2005:
2599     case 0x2006:
2600     case 0x2007:
2601     case 0x2008:
2602     case 0x2009:
2603     case 0x200A:
2604     case 0x202f:
2605     case 0x205f:
2606     case 0x3000:
2607     return -next != ESC_h;
2608     default:
2609     return -next == ESC_h;
2610 ph10 182 }
2611    
2612 ph10 180 case ESC_v:
2613     case ESC_V:
2614     switch(item)
2615     {
2616     case 0x0a:
2617     case 0x0b:
2618     case 0x0c:
2619     case 0x0d:
2620     case 0x85:
2621     case 0x2028:
2622     case 0x2029:
2623     return -next != ESC_v;
2624     default:
2625     return -next == ESC_v;
2626 ph10 182 }
2627 nigel 93
2628     default:
2629     return FALSE;
2630     }
2631    
2632     case OP_DIGIT:
2633 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2634     next == -ESC_h || next == -ESC_v;
2635 nigel 93
2636     case OP_NOT_DIGIT:
2637     return next == -ESC_d;
2638    
2639     case OP_WHITESPACE:
2640     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2641    
2642     case OP_NOT_WHITESPACE:
2643 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2644 nigel 93
2645 ph10 180 case OP_HSPACE:
2646     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2647    
2648     case OP_NOT_HSPACE:
2649     return next == -ESC_h;
2650 ph10 182
2651 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2652 ph10 182 case OP_VSPACE:
2653 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2654    
2655     case OP_NOT_VSPACE:
2656 ph10 182 return next == -ESC_v;
2657 ph10 180
2658 nigel 93 case OP_WORDCHAR:
2659 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2660 nigel 93
2661     case OP_NOT_WORDCHAR:
2662     return next == -ESC_w || next == -ESC_d;
2663 ph10 182
2664 nigel 93 default:
2665     return FALSE;
2666     }
2667    
2668     /* Control does not reach here */
2669     }
2670    
2671    
2672    
2673     /*************************************************
2674 nigel 77 * Compile one branch *
2675     *************************************************/
2676    
2677 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2678 nigel 77 changed during the branch, the pointer is used to change the external options
2679 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2680     to find out the amount of memory needed, as well as during the real compile
2681     phase. The value of lengthptr distinguishes the two phases.
2682 nigel 77
2683     Arguments:
2684     optionsptr pointer to the option bits
2685     codeptr points to the pointer to the current code point
2686     ptrptr points to the current pattern pointer
2687     errorcodeptr points to error code variable
2688     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2689     reqbyteptr set to the last literal character required, else < 0
2690     bcptr points to current branch chain
2691     cd contains pointers to tables etc.
2692 nigel 93 lengthptr NULL during the real compile phase
2693     points to length accumulator during pre-compile phase
2694 nigel 77
2695     Returns: TRUE on success
2696     FALSE, with *errorcodeptr set non-zero on error
2697     */
2698    
2699     static BOOL
2700 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2701     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2702     compile_data *cd, int *lengthptr)
2703 nigel 77 {
2704     int repeat_type, op_type;
2705     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2706     int bravalue = 0;
2707     int greedy_default, greedy_non_default;
2708     int firstbyte, reqbyte;
2709     int zeroreqbyte, zerofirstbyte;
2710     int req_caseopt, reqvary, tempreqvary;
2711     int options = *optionsptr;
2712     int after_manual_callout = 0;
2713 nigel 93 int length_prevgroup = 0;
2714 nigel 77 register int c;
2715     register uschar *code = *codeptr;
2716 nigel 93 uschar *last_code = code;
2717     uschar *orig_code = code;
2718 nigel 77 uschar *tempcode;
2719     BOOL inescq = FALSE;
2720     BOOL groupsetfirstbyte = FALSE;
2721     const uschar *ptr = *ptrptr;
2722     const uschar *tempptr;
2723     uschar *previous = NULL;
2724     uschar *previous_callout = NULL;
2725 nigel 93 uschar *save_hwm = NULL;
2726 nigel 77 uschar classbits[32];
2727    
2728     #ifdef SUPPORT_UTF8
2729     BOOL class_utf8;
2730     BOOL utf8 = (options & PCRE_UTF8) != 0;
2731     uschar *class_utf8data;
2732 ph10 300 uschar *class_utf8data_base;
2733 nigel 77 uschar utf8_char[6];
2734     #else
2735     BOOL utf8 = FALSE;
2736 nigel 93 uschar *utf8_char = NULL;
2737 nigel 77 #endif
2738    
2739 ph10 475 #ifdef PCRE_DEBUG
2740 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2741     #endif
2742    
2743 nigel 77 /* Set up the default and non-default settings for greediness */
2744    
2745     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2746     greedy_non_default = greedy_default ^ 1;
2747    
2748     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2749     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2750     matches a non-fixed char first char; reqbyte just remains unset if we never
2751     find one.
2752    
2753     When we hit a repeat whose minimum is zero, we may have to adjust these values
2754     to take the zero repeat into account. This is implemented by setting them to
2755     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2756     item types that can be repeated set these backoff variables appropriately. */
2757    
2758     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2759    
2760     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2761     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2762     value > 255. It is added into the firstbyte or reqbyte variables to record the
2763     case status of the value. This is used only for ASCII characters. */
2764    
2765     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2766    
2767     /* Switch on next character until the end of the branch */
2768    
2769     for (;; ptr++)
2770     {
2771     BOOL negate_class;
2772 ph10 286 BOOL should_flip_negation;
2773 nigel 77 BOOL possessive_quantifier;
2774     BOOL is_quantifier;
2775 nigel 93 BOOL is_recurse;
2776 ph10 180 BOOL reset_bracount;
2777 nigel 77 int class_charcount;
2778     int class_lastchar;
2779     int newoptions;
2780     int recno;
2781 ph10 172 int refsign;
2782 nigel 77 int skipbytes;
2783     int subreqbyte;
2784     int subfirstbyte;
2785 nigel 93 int terminator;
2786 nigel 77 int mclength;
2787     uschar mcbuffer[8];
2788    
2789 nigel 93 /* Get next byte in the pattern */
2790 nigel 77
2791     c = *ptr;
2792 ph10 345
2793 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2794     previous cycle of this loop. */
2795    
2796     if (lengthptr != NULL)
2797     {
2798 ph10 475 #ifdef PCRE_DEBUG
2799 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2800     #endif
2801 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2802 nigel 93 {
2803     *errorcodeptr = ERR52;
2804     goto FAILED;
2805     }
2806    
2807     /* There is at least one situation where code goes backwards: this is the
2808     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2809     the class is simply eliminated. However, it is created first, so we have to
2810     allow memory for it. Therefore, don't ever reduce the length at this point.
2811     */
2812    
2813     if (code < last_code) code = last_code;
2814 ph10 202
2815     /* Paranoid check for integer overflow */
2816    
2817     if (OFLOW_MAX - *lengthptr < code - last_code)
2818     {
2819     *errorcodeptr = ERR20;
2820     goto FAILED;
2821     }
2822    
2823 nigel 93 *lengthptr += code - last_code;
2824     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2825    
2826     /* If "previous" is set and it is not at the start of the work space, move
2827     it back to there, in order to avoid filling up the work space. Otherwise,
2828     if "previous" is NULL, reset the current code pointer to the start. */
2829    
2830     if (previous != NULL)
2831     {
2832     if (previous > orig_code)
2833     {
2834     memmove(orig_code, previous, code - previous);
2835     code -= previous - orig_code;
2836     previous = orig_code;
2837     }
2838     }
2839     else code = orig_code;
2840    
2841     /* Remember where this code item starts so we can pick up the length
2842     next time round. */
2843    
2844     last_code = code;
2845     }
2846    
2847     /* In the real compile phase, just check the workspace used by the forward
2848     reference list. */
2849    
2850 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2851 nigel 93 {
2852     *errorcodeptr = ERR52;
2853     goto FAILED;
2854     }
2855    
2856 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2857    
2858     if (inescq && c != 0)
2859     {
2860 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2861 nigel 77 {
2862     inescq = FALSE;
2863     ptr++;
2864     continue;
2865     }
2866     else
2867     {
2868     if (previous_callout != NULL)
2869     {
2870 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2871     complete_callout(previous_callout, ptr, cd);
2872 nigel 77 previous_callout = NULL;
2873     }
2874     if ((options & PCRE_AUTO_CALLOUT) != 0)
2875     {
2876     previous_callout = code;
2877     code = auto_callout(code, ptr, cd);
2878     }
2879     goto NORMAL_CHAR;
2880     }
2881     }
2882    
2883     /* Fill in length of a previous callout, except when the next thing is
2884     a quantifier. */
2885    
2886 ph10 392 is_quantifier =
2887 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2888     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2889 nigel 77
2890     if (!is_quantifier && previous_callout != NULL &&
2891     after_manual_callout-- <= 0)
2892     {
2893 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2894     complete_callout(previous_callout, ptr, cd);
2895 nigel 77 previous_callout = NULL;
2896     }
2897    
2898     /* In extended mode, skip white space and comments */
2899    
2900     if ((options & PCRE_EXTENDED) != 0)
2901     {
2902     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2903 ph10 391 if (c == CHAR_NUMBER_SIGN)
2904 nigel 77 {
2905 nigel 93 while (*(++ptr) != 0)
2906 nigel 91 {
2907 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2908 nigel 91 }
2909 nigel 93 if (*ptr != 0) continue;
2910    
2911 nigel 91 /* Else fall through to handle end of string */
2912     c = 0;
2913 nigel 77 }
2914     }
2915    
2916     /* No auto callout for quantifiers. */
2917    
2918     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2919     {
2920     previous_callout = code;
2921     code = auto_callout(code, ptr, cd);
2922     }
2923    
2924     switch(c)
2925     {
2926 nigel 93 /* ===================================================================*/
2927     case 0: /* The branch terminates at string end */
2928 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2929     case CHAR_RIGHT_PARENTHESIS:
2930 nigel 77 *firstbyteptr = firstbyte;
2931     *reqbyteptr = reqbyte;
2932     *codeptr = code;
2933     *ptrptr = ptr;
2934 nigel 93 if (lengthptr != NULL)
2935     {
2936 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2937     {
2938     *errorcodeptr = ERR20;
2939     goto FAILED;
2940     }
2941 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2942     DPRINTF((">> end branch\n"));
2943     }
2944 nigel 77 return TRUE;
2945    
2946 nigel 93
2947     /* ===================================================================*/
2948 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2949     the setting of any following char as a first character. */
2950    
2951 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2952 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2953     {
2954     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2955     }
2956     previous = NULL;
2957     *code++ = OP_CIRC;
2958     break;
2959    
2960 ph10 391 case CHAR_DOLLAR_SIGN:
2961 nigel 77 previous = NULL;
2962     *code++ = OP_DOLL;
2963     break;
2964    
2965     /* There can never be a first char if '.' is first, whatever happens about
2966     repeats. The value of reqbyte doesn't change either. */
2967    
2968 ph10 391 case CHAR_DOT:
2969 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2970     zerofirstbyte = firstbyte;
2971     zeroreqbyte = reqbyte;
2972     previous = code;
2973 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2974 nigel 77 break;
2975    
2976 nigel 93
2977     /* ===================================================================*/
2978 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2979     32-byte bitmap of the permitted characters, except in the special case
2980     where there is only one such character. For negated classes, we build the
2981     map as usual, then invert it at the end. However, we use a different opcode
2982     so that data characters > 255 can be handled correctly.
2983 nigel 77
2984     If the class contains characters outside the 0-255 range, a different
2985     opcode is compiled. It may optionally have a bit map for characters < 256,
2986     but those above are explicitly listed afterwards. A flag byte tells
2987     whether the bitmap is present, and whether this is a negated class or not.
2988 ph10 345
2989 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2990     default (Perl) mode, it is treated as a data character. */
2991 ph10 345
2992 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2993 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2994     {
2995     *errorcodeptr = ERR64;
2996 ph10 345 goto FAILED;
2997 ph10 336 }
2998 ph10 345 goto NORMAL_CHAR;
2999 nigel 77
3000 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3001 nigel 77 previous = code;
3002    
3003     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3004     they are encountered at the top level, so we'll do that too. */
3005    
3006 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3007 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3008 ph10 295 check_posix_syntax(ptr, &tempptr))
3009 nigel 77 {
3010 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3011 nigel 77 goto FAILED;
3012     }
3013    
3014 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3015 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3016 ph10 205 skip them too. This makes for compatibility with Perl. */
3017 ph10 208
3018 ph10 205 negate_class = FALSE;
3019     for (;;)
3020 nigel 77 {
3021     c = *(++ptr);
3022 ph10 391 if (c == CHAR_BACKSLASH)
3023 ph10 205 {
3024 ph10 392 if (ptr[1] == CHAR_E)
3025 ph10 391 ptr++;
3026 ph10 392 else if (strncmp((const char *)ptr+1,
3027     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3028 ph10 391 ptr += 3;
3029 ph10 392 else
3030 ph10 391 break;
3031 ph10 205 }
3032 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3033 ph10 205 negate_class = TRUE;
3034     else break;
3035 ph10 208 }
3036 ph10 345
3037     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3038     an initial ']' is taken as a data character -- the code below handles
3039 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3040     [^] must match any character, so generate OP_ALLANY. */
3041 ph10 345
3042 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3043 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3044 ph10 341 {
3045     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3046     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3047     zerofirstbyte = firstbyte;
3048     break;
3049 ph10 345 }
3050 nigel 77
3051 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3052     negation flag at the end, so that support for characters > 255 works
3053 ph10 264 correctly (they are all included in the class). */
3054    
3055     should_flip_negation = FALSE;
3056    
3057 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3058 nigel 93 of just a single character (as long as it's < 256). However, for higher
3059     valued UTF-8 characters, we don't yet do any optimization. */
3060 nigel 77
3061     class_charcount = 0;
3062     class_lastchar = -1;
3063    
3064 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3065     temporary bit of memory, in case the class contains only 1 character (less
3066     than 256), because in that case the compiled code doesn't use the bit map.
3067     */
3068    
3069     memset(classbits, 0, 32 * sizeof(uschar));
3070    
3071 nigel 77 #ifdef SUPPORT_UTF8
3072     class_utf8 = FALSE; /* No chars >= 256 */
3073 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3074 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3075 nigel 77 #endif
3076    
3077     /* Process characters until ] is reached. By writing this as a "do" it
3078 nigel 93 means that an initial ] is taken as a data character. At the start of the
3079     loop, c contains the first byte of the character. */
3080 nigel 77
3081 nigel 93 if (c != 0) do
3082 nigel 77 {
3083 nigel 93 const uschar *oldptr;
3084    
3085 nigel 77 #ifdef SUPPORT_UTF8
3086     if (utf8 && c > 127)
3087     { /* Braces are required because the */
3088     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3089     }
3090 ph10 309
3091 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3092 ph10 309 data and reset the pointer. This is so that very large classes that
3093 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3094 ph10 309 (which is on the stack). */
3095    
3096 ph10 300 if (lengthptr != NULL)
3097     {
3098     *lengthptr += class_utf8data - class_utf8data_base;
3099 ph10 309 class_utf8data = class_utf8data_base;
3100     }
3101    
3102 nigel 77 #endif
3103    
3104     /* Inside \Q...\E everything is literal except \E */
3105    
3106     if (inescq)
3107     {
3108 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3109 nigel 77 {
3110 nigel 93 inescq = FALSE; /* Reset literal state */
3111     ptr++; /* Skip the 'E' */
3112     continue; /* Carry on with next */
3113 nigel 77 }
3114 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3115 nigel 77 }
3116    
3117     /* Handle POSIX class names. Perl allows a negation extension of the
3118     form [:^name:]. A square bracket that doesn't match the syntax is
3119     treated as a literal. We also recognize the POSIX constructions
3120     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3121     5.6 and 5.8 do. */
3122    
3123 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3124 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3125 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3126 nigel 77 {
3127     BOOL local_negate = FALSE;
3128 nigel 87 int posix_class, taboffset, tabopt;
3129 nigel 77 register const uschar *cbits = cd->cbits;
3130 nigel 87 uschar pbits[32];
3131 nigel 77
3132 ph10 391 if (ptr[1] != CHAR_COLON)
3133 nigel 77 {
3134     *errorcodeptr = ERR31;
3135     goto FAILED;
3136     }
3137    
3138     ptr += 2;
3139 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3140 nigel 77 {
3141     local_negate = TRUE;
3142 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3143 nigel 77 ptr++;
3144     }
3145    
3146     posix_class = check_posix_name(ptr, tempptr - ptr);
3147     if (posix_class < 0)
3148     {
3149     *errorcodeptr = ERR30;
3150     goto FAILED;
3151     }
3152    
3153     /* If matching is caseless, upper and lower are converted to
3154     alpha. This relies on the fact that the class table starts with
3155     alpha, lower, upper as the first 3 entries. */
3156    
3157     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3158     posix_class = 0;
3159    
3160 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3161     because we may be adding and subtracting from it, and we don't want to
3162     subtract bits that may be in the main map already. At the end we or the
3163     result into the bit map that is being built. */
3164 nigel 77
3165     posix_class *= 3;
3166 nigel 87
3167     /* Copy in the first table (always present) */
3168    
3169     memcpy(pbits, cbits + posix_class_maps[posix_class],
3170     32 * sizeof(uschar));
3171    
3172     /* If there is a second table, add or remove it as required. */
3173    
3174     taboffset = posix_class_maps[posix_class + 1];
3175     tabopt = posix_class_maps[posix_class + 2];
3176    
3177     if (taboffset >= 0)
3178 nigel 77 {
3179 nigel 87 if (tabopt >= 0)
3180     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3181 nigel 77 else
3182 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3183 nigel 77 }
3184    
3185 nigel 87 /* Now see if we need to remove any special characters. An option
3186     value of 1 removes vertical space and 2 removes underscore. */
3187    
3188     if (tabopt < 0) tabopt = -tabopt;
3189     if (tabopt == 1) pbits[1] &= ~0x3c;
3190     else if (tabopt == 2) pbits[11] &= 0x7f;
3191    
3192     /* Add the POSIX table or its complement into the main table that is
3193     being built and we are done. */
3194    
3195     if (local_negate)
3196     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3197     else
3198     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3199    
3200 nigel 77 ptr = tempptr + 1;
3201     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3202     continue; /* End of POSIX syntax handling */
3203     }
3204    
3205     /* Backslash may introduce a single character, or it may introduce one
3206 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3207 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3208     assume that other escapes have more than one character in them, so set
3209     class_charcount bigger than one. Unrecognized escapes fall through and
3210     are either treated as literal characters (by default), or are faulted if
3211     PCRE_EXTRA is set. */
3212 nigel 77
3213 ph10 391 if (c == CHAR_BACKSLASH)
3214 nigel 77 {
3215 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3216     if (*errorcodeptr != 0) goto FAILED;
3217 nigel 77
3218 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3219 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3220     {
3221 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3222 nigel 77 {
3223     ptr += 2; /* avoid empty string */
3224     }
3225     else inescq = TRUE;
3226     continue;
3227     }
3228 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3229 nigel 77
3230     if (c < 0)
3231     {
3232     register const uschar *cbits = cd->cbits;
3233     class_charcount += 2; /* Greater than 1 is what matters */
3234 nigel 93
3235     /* Save time by not doing this in the pre-compile phase. */
3236    
3237     if (lengthptr == NULL) switch (-c)
3238 nigel 77 {
3239     case ESC_d:
3240     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3241     continue;
3242    
3243     case ESC_D:
3244 ph10 286 should_flip_negation = TRUE;
3245 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3246     continue;
3247    
3248     case ESC_w:
3249     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3250     continue;
3251    
3252     case ESC_W:
3253 ph10 286 should_flip_negation = TRUE;
3254 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3255     continue;
3256    
3257     case ESC_s:
3258     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3259     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3260     continue;
3261    
3262     case ESC_S:
3263 ph10 286 should_flip_negation = TRUE;
3264 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3265     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3266     continue;
3267    
3268 nigel 93 default: /* Not recognized; fall through */
3269     break; /* Need "default" setting to stop compiler warning. */
3270     }
3271    
3272     /* In the pre-compile phase, just do the recognition. */
3273    
3274     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3275     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3276 ph10 180
3277 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3278     they use extra memory. */
3279 ph10 180
3280 ph10 178 if (-c == ESC_h)
3281     {
3282     SETBIT(classbits, 0x09); /* HT */
3283     SETBIT(classbits, 0x20); /* SPACE */
3284 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3285 ph10 178 #ifdef SUPPORT_UTF8
3286     if (utf8)
3287 ph10 180 {
3288 ph10 178 class_utf8 = TRUE;
3289     *class_utf8data++ = XCL_SINGLE;
3290 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3291 ph10 178 *class_utf8data++ = XCL_SINGLE;
3292 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3293     *class_utf8data++ = XCL_RANGE;
3294     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3295     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3296 ph10 178 *class_utf8data++ = XCL_SINGLE;
3297 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3298 ph10 178 *class_utf8data++ = XCL_SINGLE;
3299 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3300 ph10 178 *class_utf8data++ = XCL_SINGLE;
3301 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3302     }
3303     #endif
3304     continue;
3305     }
3306 nigel 93
3307 ph10 178 if (-c == ESC_H)
3308     {
3309     for (c = 0; c < 32; c++)
3310     {
3311     int x = 0xff;
3312     switch (c)
3313 ph10 180 {
3314 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3315     case 0x20/8: x ^= 1 << (0x20%8); break;
3316     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3317     default: break;
3318     }
3319     classbits[c] |= x;
3320 ph10 180 }
3321    
3322 ph10 178 #ifdef SUPPORT_UTF8
3323     if (utf8)
3324 ph10 180 {
3325 ph10 178 class_utf8 = TRUE;
3326 ph10 180 *class_utf8data++ = XCL_RANGE;
3327     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3328     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3329     *class_utf8data++ = XCL_RANGE;
3330     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3331     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3332     *class_utf8data++ = XCL_RANGE;
3333     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3334     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3335     *class_utf8data++ = XCL_RANGE;
3336     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3337     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3338     *class_utf8data++ = XCL_RANGE;
3339     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3340     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3341     *class_utf8data++ = XCL_RANGE;
3342     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3343     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3344     *class_utf8data++ = XCL_RANGE;
3345     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3346     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3347     }
3348     #endif
3349     continue;
3350     }
3351 ph10 178
3352     if (-c == ESC_v)
3353     {
3354     SETBIT(classbits, 0x0a); /* LF */
3355     SETBIT(classbits, 0x0b); /* VT */
3356 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3357     SETBIT(classbits, 0x0d); /* CR */
3358     SETBIT(classbits, 0x85); /* NEL */
3359 ph10 178 #ifdef SUPPORT_UTF8
3360     if (utf8)
3361 ph10 180 {
3362 ph10 178 class_utf8 = TRUE;
3363 ph10 180 *class_utf8data++ = XCL_RANGE;
3364     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3365     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3366     }
3367     #endif
3368     continue;
3369     }
3370 ph10 178
3371     if (-c == ESC_V)
3372     {
3373     for (c = 0; c < 32; c++)
3374     {
3375     int x = 0xff;
3376     switch (c)
3377 ph10 180 {
3378 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3379     x ^= 1 << (0x0b%8);
3380     x ^= 1 << (0x0c%8);
3381 ph10 180 x ^= 1 << (0x0d%8);
3382 ph10 178 break;
3383     case 0x85/8: x ^= 1 << (0x85%8); break;
3384     default: break;
3385     }
3386     classbits[c] |= x;
3387 ph10 180 }
3388    
3389 ph10 178 #ifdef SUPPORT_UTF8
3390     if (utf8)
3391 ph10 180 {
3392 ph10 178 class_utf8 = TRUE;
3393 ph10 180 *class_utf8data++ = XCL_RANGE;
3394     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3395     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3396     *class_utf8data++ = XCL_RANGE;
3397     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3398     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3399     }
3400     #endif
3401     continue;
3402     }
3403 ph10 178
3404 nigel 93 /* We need to deal with \P and \p in both phases. */
3405    
3406 nigel 77 #ifdef SUPPORT_UCP
3407 nigel 93 if (-c == ESC_p || -c == ESC_P)
3408     {
3409     BOOL negated;
3410     int pdata;
3411     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3412     if (ptype < 0) goto FAILED;
3413     class_utf8 = TRUE;
3414     *class_utf8data++ = ((-c == ESC_p) != negated)?
3415     XCL_PROP : XCL_NOTPROP;
3416     *class_utf8data++ = ptype;
3417     *class_utf8data++ = pdata;
3418     class_charcount -= 2; /* Not a < 256 character */
3419 nigel 77 continue;
3420 nigel 93 }
3421 nigel 77 #endif
3422 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3423     strict mode. By default, for compatibility with Perl, they are
3424     treated as literals. */
3425 nigel 77
3426 nigel 93 if ((options & PCRE_EXTRA) != 0)
3427     {
3428     *errorcodeptr = ERR7;
3429     goto FAILED;
3430     }
3431 nigel 77
3432 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3433     c = *ptr; /* Get the final character and fall through */
3434 nigel 77 }
3435    
3436     /* Fall through if we have a single character (c >= 0). This may be
3437 nigel 93 greater than 256 in UTF-8 mode. */
3438 nigel 77
3439     } /* End of backslash handling */
3440    
3441     /* A single character may be followed by '-' to form a range. However,
3442     Perl does not permit ']' to be the end of the range. A '-' character
3443 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3444     entirely. The code for handling \Q and \E is messy. */
3445 nigel 77
3446 nigel 93 CHECK_RANGE:
3447 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3448 nigel 77 {
3449 nigel 93 inescq = FALSE;
3450     ptr += 2;
3451     }
3452    
3453     oldptr = ptr;
3454 ph10 231
3455 ph10 230 /* Remember \r or \n */
3456 ph10 231
3457 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3458 ph10 231
3459 ph10 230 /* Check for range */
3460 nigel 93
3461 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3462 nigel 93 {
3463 nigel 77 int d;
3464     ptr += 2;
3465 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3466 nigel 77
3467 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3468     mode. */
3469    
3470 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3471 nigel 93 {
3472     ptr += 2;
3473 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3474 ph10 391 { ptr += 2; continue; }
3475 nigel 93 inescq = TRUE;
3476     break;
3477     }
3478    
3479 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3480 nigel 93 {
3481     ptr = oldptr;
3482     goto LONE_SINGLE_CHARACTER;
3483     }
3484    
3485 nigel 77 #ifdef SUPPORT_UTF8
3486     if (utf8)
3487     { /* Braces are required because the */
3488     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3489     }
3490     else
3491     #endif
3492     d = *ptr; /* Not UTF-8 mode */
3493    
3494     /* The second part of a range can be a single-character escape, but
3495     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3496     in such circumstances. */
3497    
3498 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3499 nigel 77 {
3500 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3501     if (*errorcodeptr != 0) goto FAILED;
3502 nigel 77
3503 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3504 nigel 93 special means the '-' was literal */
3505 nigel 77
3506     if (d < 0)
3507     {
3508 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3509     else if (d == -ESC_X) d = CHAR_X;
3510     else if (d == -ESC_R) d = CHAR_R; else
3511 nigel 77 {
3512 nigel 93 ptr = oldptr;
3513 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3514     }
3515     }
3516     }
3517    
3518 nigel 93 /* Check that the two values are in the correct order. Optimize
3519     one-character ranges */
3520 nigel 77
3521 nigel 93 if (d < c)
3522     {
3523     *errorcodeptr = ERR8;
3524     goto FAILED;
3525     }
3526    
3527 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3528    
3529 ph10 230 /* Remember \r or \n */
3530 ph10 231
3531 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3532 ph10 231
3533 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3534     matching, we have to use an XCLASS with extra data items. Caseless
3535     matching for characters > 127 is available only if UCP support is
3536     available. */
3537    
3538     #ifdef SUPPORT_UTF8
3539     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3540     {
3541     class_utf8 = TRUE;
3542    
3543     /* With UCP support, we can find the other case equivalents of
3544     the relevant characters. There may be several ranges. Optimize how
3545     they fit with the basic range. */
3546    
3547     #ifdef SUPPORT_UCP
3548     if ((options & PCRE_CASELESS) != 0)
3549     {
3550 nigel 93 unsigned int occ, ocd;
3551     unsigned int cc = c;
3552     unsigned int origd = d;
3553 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3554     {
3555 ph10 180 if (occ >= (unsigned int)c &&
3556     ocd <= (unsigned int)d)
3557 ph10 176 continue; /* Skip embedded ranges */
3558 nigel 77
3559 ph10 180 if (occ < (unsigned int)c &&
3560 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3561 nigel 77 { /* if there is overlap, */
3562     c = occ; /* noting that if occ < c */
3563     continue; /* we can't have ocd > d */
3564     } /* because a subrange is */
3565 ph10 180 if (ocd > (unsigned int)d &&
3566 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3567 nigel 77 { /* the basic range. */
3568     d = ocd;
3569     continue;
3570     }
3571    
3572     if (occ == ocd)
3573     {
3574     *class_utf8data++ = XCL_SINGLE;
3575     }
3576     else
3577     {
3578     *class_utf8data++ = XCL_RANGE;
3579     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3580     }
3581     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3582     }
3583     }
3584     #endif /* SUPPORT_UCP */
3585    
3586     /* Now record the original range, possibly modified for UCP caseless
3587     overlapping ranges. */
3588    
3589     *class_utf8data++ = XCL_RANGE;
3590     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3591     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3592    
3593     /* With UCP support, we are done. Without UCP support, there is no
3594     caseless matching for UTF-8 characters > 127; we can use the bit map
3595     for the smaller ones. */
3596    
3597     #ifdef SUPPORT_UCP
3598     continue; /* With next character in the class */
3599     #else
3600     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3601    
3602     /* Adjust upper limit and fall through to set up the map */
3603    
3604     d = 127;
3605    
3606     #endif /* SUPPORT_UCP */
3607     }
3608     #endif /* SUPPORT_UTF8 */
3609    
3610     /* We use the bit map for all cases when not in UTF-8 mode; else
3611     ranges that lie entirely within 0-127 when there is UCP support; else
3612     for partial ranges without UCP support. */
3613    
3614 nigel 93 class_charcount += d - c + 1;
3615     class_lastchar = d;
3616    
3617     /* We can save a bit of time by skipping this in the pre-compile. */
3618    
3619     if (lengthptr == NULL) for (; c <= d; c++)
3620 nigel 77 {
3621     classbits[c/8] |= (1 << (c&7));
3622     if ((options & PCRE_CASELESS) != 0)
3623     {
3624     int uc = cd->fcc[c]; /* flip case */
3625     classbits[uc/8] |= (1 << (uc&7));
3626     }
3627     }
3628    
3629     continue; /* Go get the next char in the class */
3630     }
3631    
3632     /* Handle a lone single character - we can get here for a normal
3633     non-escape char, or after \ that introduces a single character or for an
3634     apparent range that isn't. */
3635    
3636     LONE_SINGLE_CHARACTER:
3637 ph10 231
3638 nigel 77 /* Handle a character that cannot go in the bit map */
3639    
3640     #ifdef SUPPORT_UTF8
3641     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3642     {
3643     class_utf8 = TRUE;
3644     *class_utf8data++ = XCL_SINGLE;
3645     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3646    
3647     #ifdef SUPPORT_UCP
3648     if ((options & PCRE_CASELESS) != 0)
3649     {
3650 nigel 93 unsigned int othercase;
3651 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3652 nigel 77 {
3653     *class_utf8data++ = XCL_SINGLE;
3654     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3655     }
3656     }
3657     #endif /* SUPPORT_UCP */
3658    
3659     }
3660     else
3661     #endif /* SUPPORT_UTF8 */
3662    
3663     /* Handle a single-byte character */
3664     {
3665     classbits[c/8] |= (1 << (c&7));
3666     if ((options & PCRE_CASELESS) != 0)
3667     {
3668     c = cd->fcc[c]; /* flip case */
3669     classbits[c/8] |= (1 << (c&7));
3670     }
3671     class_charcount++;
3672     class_lastchar = c;
3673     }
3674     }
3675    
3676 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3677 nigel 77
3678 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3679 nigel 77
3680 nigel 93 if (c == 0) /* Missing terminating ']' */
3681     {
3682     *errorcodeptr = ERR6;
3683     goto FAILED;
3684     }
3685 ph10 231
3686    
3687 ph10 230 /* This code has been disabled because it would mean that \s counts as
3688     an explicit \r or \n reference, and that's not really what is wanted. Now
3689     we set the flag only if there is a literal "\r" or "\n" in the class. */
3690 ph10 227
3691 ph10 230 #if 0
3692 ph10 226 /* Remember whether \r or \n are in this class */
3693 ph10 227
3694 ph10 226 if (negate_class)
3695     {
3696 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3697 ph10 226 }
3698     else
3699     {
3700 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3701 ph10 227 }
3702 ph10 230 #endif
3703 ph10 227
3704 ph10 231
3705 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3706 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3707     use of \p or \P, in other words, no use of any XCLASS features, we can
3708     optimize.
3709    
3710 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3711     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3712     operate on single-bytes only. This is an historical hangover. Maybe one day
3713     we can tidy these opcodes to handle multi-byte characters.
3714 nigel 77
3715     The optimization throws away the bit map. We turn the item into a
3716     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3717     that OP_NOT does not support multibyte characters. In the positive case, it
3718     can cause firstbyte to be set. Otherwise, there can be no first char if
3719     this item is first, whatever repeat count may follow. In the case of
3720     reqbyte, save the previous value for reinstating. */
3721    
3722     #ifdef SUPPORT_UTF8
3723 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3724 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3725 nigel 77 #else
3726     if (class_charcount == 1)
3727     #endif
3728     {
3729     zeroreqbyte = reqbyte;
3730    
3731     /* The OP_NOT opcode works on one-byte characters only. */
3732    
3733     if (negate_class)
3734     {
3735     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3736     zerofirstbyte = firstbyte;
3737     *code++ = OP_NOT;
3738     *code++ = class_lastchar;
3739     break;
3740     }
3741    
3742     /* For a single, positive character, get the value into mcbuffer, and
3743     then we can handle this with the normal one-character code. */
3744    
3745     #ifdef SUPPORT_UTF8
3746     if (utf8 && class_lastchar > 127)
3747     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3748     else
3749     #endif
3750     {
3751     mcbuffer[0] = class_lastchar;
3752     mclength = 1;
3753     }
3754     goto ONE_CHAR;
3755     } /* End of 1-char optimization */
3756    
3757     /* The general case - not the one-char optimization. If this is the first
3758     thing in the branch, there can be no first char setting, whatever the
3759     repeat count. Any reqbyte setting must remain unchanged after any kind of
3760     repeat. */
3761    
3762     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3763     zerofirstbyte = firstbyte;
3764     zeroreqbyte = reqbyte;
3765    
3766     /* If there are characters with values > 255, we have to compile an
3767 ph10 286 extended class, with its own opcode, unless there was a negated special
3768     such as \S in the class, because in that case all characters > 255 are in
3769     the class, so any that were explicitly given as well can be ignored. If
3770 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3771     characters < 256, we can omit the bitmap in the actual compiled code. */
3772 nigel 77
3773     #ifdef SUPPORT_UTF8
3774 ph10 264 if (class_utf8 && !should_flip_negation)
3775 nigel 77 {
3776     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3777     *code++ = OP_XCLASS;
3778     code += LINK_SIZE;
3779     *code = negate_class? XCL_NOT : 0;
3780    
3781 nigel 93 /* If the map is required, move up the extra data to make room for it;
3782     otherwise just move the code pointer to the end of the extra data. */
3783 nigel 77
3784     if (class_charcount > 0)
3785     {
3786     *code++ |= XCL_MAP;
3787 nigel 93 memmove(code + 32, code, class_utf8data - code);
3788 nigel 77 memcpy(code, classbits, 32);
3789 nigel 93 code = class_utf8data + 32;
3790 nigel 77 }
3791 nigel 93 else code = class_utf8data;
3792 nigel 77
3793     /* Now fill in the complete length of the item */
3794    
3795     PUT(previous, 1, code - previous);
3796     break; /* End of class handling */
3797     }
3798     #endif
3799    
3800 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3801     OP_NCLASS, depending on whether the whole class was negated and whether
3802     there were negative specials such as \S in the class. Then copy the 32-byte
3803 ph10 264 map into the code vector, negating it if necessary. */
3804 ph10 286
3805 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3806 nigel 77 if (negate_class)
3807     {
3808 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3809     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3810 nigel 77 }
3811     else
3812     {
3813     memcpy(code, classbits, 32);
3814     }
3815     code += 32;
3816     break;
3817    
3818 nigel 93
3819     /* ===================================================================*/
3820 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3821     has been tested above. */
3822    
3823 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3824 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3825     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3826     if (*errorcodeptr != 0) goto FAILED;
3827     goto REPEAT;
3828    
3829 ph10 391 case CHAR_ASTERISK:
3830 nigel 77 repeat_min = 0;
3831     repeat_max = -1;
3832     goto REPEAT;
3833    
3834 ph10 391 case CHAR_PLUS:
3835 nigel 77 repeat_min = 1;
3836     repeat_max = -1;
3837     goto REPEAT;
3838    
3839 ph10 391 case CHAR_QUESTION_MARK:
3840 nigel 77 repeat_min = 0;
3841     repeat_max = 1;
3842    
3843     REPEAT:
3844     if (previous == NULL)
3845     {
3846     *errorcodeptr = ERR9;
3847     goto FAILED;
3848     }
3849    
3850     if (repeat_min == 0)
3851     {
3852     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3853     reqbyte = zeroreqbyte; /* Ditto */
3854     }
3855    
3856     /* Remember whether this is a variable length repeat */
3857    
3858     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3859    
3860     op_type = 0; /* Default single-char op codes */
3861     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3862    
3863     /* Save start of previous item, in case we have to move it up to make space
3864     for an inserted OP_ONCE for the additional '+' extension. */
3865    
3866     tempcode = previous;
3867    
3868     /* If the next character is '+', we have a possessive quantifier. This
3869     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3870     If the next character is '?' this is a minimizing repeat, by default,
3871     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3872     repeat type to the non-default. */
3873    
3874 ph10 391 if (ptr[1] == CHAR_PLUS)
3875 nigel 77 {
3876     repeat_type = 0; /* Force greedy */
3877     possessive_quantifier = TRUE;
3878     ptr++;
3879     }
3880 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3881 nigel 77 {
3882     repeat_type = greedy_non_default;
3883     ptr++;
3884     }
3885     else repeat_type = greedy_default;
3886    
3887     /* If previous was a character match, abolish the item and generate a
3888     repeat item instead. If a char item has a minimum of more than one, ensure
3889     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3890     the first thing in a branch because the x will have gone into firstbyte
3891     instead. */
3892    
3893     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3894     {
3895     /* Deal with UTF-8 characters that take up more than one byte. It's
3896     easier to write this out separately than try to macrify it. Use c to
3897     hold the length of the character in bytes, plus 0x80 to flag that it's a
3898     length rather than a small character. */
3899    
3900     #ifdef SUPPORT_UTF8
3901     if (utf8 && (code[-1] & 0x80) != 0)
3902     {
3903     uschar *lastchar = code - 1;
3904     while((*lastchar & 0xc0) == 0x80) lastchar--;
3905     c = code - lastchar; /* Length of UTF-8 character */
3906     memcpy(utf8_char, lastchar, c); /* Save the char */
3907     c |= 0x80; /* Flag c as a length */
3908     }
3909     else
3910     #endif
3911    
3912     /* Handle the case of a single byte - either with no UTF8 support, or
3913     with UTF-8 disabled, or for a UTF-8 character < 128. */
3914    
3915     {
3916     c = code[-1];
3917     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3918     }
3919    
3920 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3921     the line is something that cannot possibly match this character. If so,
3922     automatically possessifying this item gains some performance in the case
3923     where the match fails. */
3924    
3925     if (!possessive_quantifier &&
3926     repeat_max < 0 &&
3927     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3928     options, cd))
3929     {
3930     repeat_type = 0; /* Force greedy */
3931     possessive_quantifier = TRUE;
3932     }
3933    
3934 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3935     }
3936    
3937     /* If previous was a single negated character ([^a] or similar), we use
3938     one of the special opcodes, replacing it. The code is shared with single-
3939     character repeats by setting op_type to add a suitable offset into
3940 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3941     currently used only for single-byte chars. */
3942 nigel 77
3943     else if (*previous == OP_NOT)
3944     {
3945     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3946     c = previous[1];
3947 nigel 93 if (!possessive_quantifier &&
3948     repeat_max < 0 &&
3949     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3950     {
3951     repeat_type = 0; /* Force greedy */
3952     possessive_quantifier = TRUE;
3953     }
3954 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3955     }
3956    
3957     /* If previous was a character type match (\d or similar), abolish it and
3958     create a suitable repeat item. The code is shared with single-character
3959     repeats by setting op_type to add a suitable offset into repeat_type. Note
3960     that the Unicode property types will be present only when SUPPORT_UCP is
3961     defined, but we don't wrap the little bits of code here because it just
3962     makes it horribly messy. */
3963    
3964     else if (*previous < OP_EODN)
3965     {
3966     uschar *oldcode;
3967 nigel 87 int prop_type, prop_value;
3968 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3969     c = *previous;
3970    
3971 nigel 93 if (!possessive_quantifier &&
3972     repeat_max < 0 &&
3973     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3974     {
3975     repeat_type = 0; /* Force greedy */
3976     possessive_quantifier = TRUE;
3977     }
3978    
3979 nigel 77 OUTPUT_SINGLE_REPEAT:
3980 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3981     {
3982     prop_type = previous[1];
3983     prop_value = previous[2];
3984     }
3985     else prop_type = prop_value = -1;
3986 nigel 77
3987     oldcode = code;
3988     code = previous; /* Usually overwrite previous item */
3989    
3990     /* If the maximum is zero then the minimum must also be zero; Perl allows
3991     this case, so we do too - by simply omitting the item altogether. */
3992    
3993     if (repeat_max == 0) goto END_REPEAT;
3994    
3995 ph10 461