/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory | Revision Log


Revision 514 - (hide annotations) (download)
Mon May 3 12:54:22 2010 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 225209 byte(s)
Add support for \N.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting an individual bit in a class bitmap. "a" is the bitmap (an
array of bytes) and "b" is the number of the bit to set. Each argument, and the
expansion as a whole, is parenthesized so that an argument which is itself an
expression, as in SETBIT(map, c + 1), still selects the intended byte and bit.
Note that "b" is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
/* Upper bound used when checking that the integer holding the length of the
compiled pattern has not overflowed. It is deliberately a little below INT_MAX
so that the bytes that terminate groups can be added in without having to
repeat the check for each of them. */

#define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
/* COMPILE_WORK_SIZE is the size of the stack workspace used by the first,
pre-compile phase, whose job is to find out how much memory the compiled
pattern needs. The pattern is partly compiled into this workspace, and the
compiled parts are thrown away as early as possible, so an overrun is not
expected; the code nevertheless checks for one. The largest usage the author
has seen is 218, so the figure is very generous.

The second, real compile phase reuses the same workspace to remember forward
references to groups so that they can be filled in at the end. Each of those
entries takes LINK_SIZE bytes, so there is ample room even when LINK_SIZE
is 4. */

#define COMPILE_WORK_SIZE (4096)

/* The overrun checks compare against this slightly smaller figure, so that an
overrun is noticed before anything runs off the end of the data block. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* Description of one special verb: the length of its name and the opcodes to
use for it, depending on whether or not an argument was supplied. */

typedef struct verbitem {
  int len;      /* Number of characters in the verb's name */
  int op;       /* Opcode when there is no argument; -1 if an argument is mandatory */
  int op_arg;   /* Opcode when an argument is present; -1 if none is allowed */
} verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264    
/* STRING() turns its argument into a string literal exactly as written.
XSTRING() goes through one extra level of macro call so that an argument which
is itself a macro is expanded before being turned into a string. */

#define STRING(tok) #tok
#define XSTRING(tok) STRING(tok)
267    
268 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
269 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
270     they are documented. Always add a new error instead. Messages marked DEAD below
271 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
272     the number of relocations needed when a shared library is loaded dynamically,
273     it is now one long string. We cannot use a table of offsets, because the
274     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
275     simply count through to the one we want - this isn't a performance issue
276 ph10 507 because these strings are used only when there is a compilation error.
277 nigel 77
278 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
279     substring, so that the whole string ends with \0\0, which can be detected when
280 ph10 499 counting through. */
281    
282 ph10 240 static const char error_texts[] =
283     "no error\0"
284     "\\ at end of pattern\0"
285     "\\c at end of pattern\0"
286     "unrecognized character follows \\\0"
287     "numbers out of order in {} quantifier\0"
288 nigel 77 /* 5 */
289 ph10 240 "number too big in {} quantifier\0"
290     "missing terminating ] for character class\0"
291     "invalid escape sequence in character class\0"
292     "range out of order in character class\0"
293     "nothing to repeat\0"
294 nigel 77 /* 10 */
295 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
296     "internal error: unexpected repeat\0"
297 ph10 269 "unrecognized character after (? or (?-\0"
298 ph10 240 "POSIX named classes are supported only within a class\0"
299     "missing )\0"
300 nigel 77 /* 15 */
301 ph10 240 "reference to non-existent subpattern\0"
302     "erroffset passed as NULL\0"
303     "unknown option bit(s) set\0"
304     "missing ) after comment\0"
305     "parentheses nested too deeply\0" /** DEAD **/
306 nigel 77 /* 20 */
307 ph10 240 "regular expression is too large\0"
308     "failed to get memory\0"
309     "unmatched parentheses\0"
310     "internal error: code overflow\0"
311     "unrecognized character after (?<\0"
312 nigel 77 /* 25 */
313 ph10 240 "lookbehind assertion is not fixed length\0"
314     "malformed number or name after (?(\0"
315     "conditional group contains more than two branches\0"
316     "assertion expected after (?(\0"
317     "(?R or (?[+-]digits must be followed by )\0"
318 nigel 77 /* 30 */
319 ph10 240 "unknown POSIX class name\0"
320     "POSIX collating elements are not supported\0"
321     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
322     "spare error\0" /** DEAD **/
323     "character value in \\x{...} sequence is too large\0"
324 nigel 77 /* 35 */
325 ph10 240 "invalid condition (?(0)\0"
326     "\\C not allowed in lookbehind assertion\0"
327 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
328 ph10 240 "number after (?C is > 255\0"
329     "closing ) for (?C expected\0"
330 nigel 77 /* 40 */
331 ph10 240 "recursive call could loop indefinitely\0"
332     "unrecognized character after (?P\0"
333     "syntax error in subpattern name (missing terminator)\0"
334     "two named subpatterns have the same name\0"
335     "invalid UTF-8 string\0"
336 nigel 77 /* 45 */
337 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
338     "malformed \\P or \\p sequence\0"
339     "unknown property name after \\P or \\p\0"
340     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
341     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
342 nigel 91 /* 50 */
343 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
344     "octal value is greater than \\377 (not in UTF-8 mode)\0"
345     "internal error: overran compiling workspace\0"
346     "internal error: previously-checked referenced subpattern not found\0"
347     "DEFINE group contains more than one branch\0"
348 nigel 93 /* 55 */
349 ph10 240 "repeating a DEFINE group is not allowed\0"
350     "inconsistent NEWLINE options\0"
351 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
352     "a numbered reference must not be zero\0"
353 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
354 ph10 211 /* 60 */
355 ph10 240 "(*VERB) not recognized\0"
356 ph10 268 "number is too big\0"
357 ph10 272 "subpattern name expected\0"
358 ph10 336 "digit expected after (?+\0"
359 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
360     /* 65 */
361 ph10 510 "different names for subpatterns of the same number are not allowed\0"
362 ph10 512 "(*MARK) must have an argument\0"
363 ph10 510 ;
364 nigel 77
365     /* Table to identify digits and hex digits. This is used when compiling
366     patterns. Note that the tables in chartables are dependent on the locale, and
367     may mark arbitrary characters as digits - but the PCRE compiling code expects
368     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
369     a private table here. It costs 256 bytes, but it is a lot faster than doing
370     character value tests (at least in some simple cases I timed), and in some
371     applications one wants PCRE to compile efficiently as well as match
372     efficiently.
373    
374     For convenience, we use the same bit definitions as in chartables:
375    
376     0x04 decimal digit
377     0x08 hexadecimal digit
378    
379     Then we can use ctype_digit and ctype_xdigit in the code. */
380    
381 ph10 392 #ifndef EBCDIC
382 ph10 391
/* The "normal" digit table, for ASCII systems and for EBCDIC systems running
in UTF-8 mode. It has 256 entries, indexed by character code. The characters
0-9 have the value 0x0c (decimal digit + hexadecimal digit), A-F and a-f have
the value 0x08 (hexadecimal digit only), and everything else is zero. The
comment at the start of each row is the code of its first entry. */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* 0x38 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /* 8 - ? */
  /* 0x40 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* @ - G */
  /* 0x48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* ` - g */
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  };
420    
421 ph10 392 #else
422 ph10 391
/* The "abnormal" digit table, for EBCDIC systems that are not running in UTF-8
mode. It has 256 entries, indexed by character code. The entries at 0xF0-0xF9
(the digits) have the value 0x0c, those at 0x81-0x86 (a-f) and 0xC1-0xC6 (A-F)
have the value 0x08, and everything else is zero. The comment at the start of
each row is the code of its first entry. */

static const unsigned char digitab[] =
  {
  /* 0x00 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* a - f, g */
  /* 0x88 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x98 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* A - F, G */
  /* 0xC8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE8 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0 - 7 */
  /* 0xF8 */ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 8, 9 */
  };
459    
/* A partial duplicate of the character-type table, for EBCDIC systems that are
not running in UTF-8 mode. It has 256 entries, indexed by character code. The
comment at the start of each row is the code of its first entry. */

static const unsigned char ebcdic_chartab[] =
  {
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0x88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0x98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xA0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xA8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,
  /* 0xC8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xD0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xD8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xE0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,
  /* 0xE8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xF0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,
  /* 0xF8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00
  };
493     #endif
494    
495    
496     /* Definition to allow mutual recursion */
497    
498     static BOOL
499 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
500 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
501 nigel 77
502    
503    
504     /*************************************************
505 ph10 240 * Find an error text *
506     *************************************************/
507    
508 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
509     some of the text is of unknown length, we can't use a table of offsets.
510     Instead, just count through the strings. This is not a performance issue
511 ph10 240 because it happens only when there has been a compilation error.
512    
513     Argument: the error number
514     Returns: pointer to the error string
515     */
516    
517     static const char *
518     find_error_text(int n)
519     {
520     const char *s = error_texts;
521 ph10 507 for (; n > 0; n--)
522 ph10 499 {
523     while (*s++ != 0) {};
524     if (*s == 0) return "Error text not found (please report)";
525 ph10 507 }
526 ph10 240 return s;
527     }
528    
529    
530     /*************************************************
531 nigel 77 * Handle escapes *
532     *************************************************/
533    
534     /* This function is called when a \ has been encountered. It either returns a
535     positive value for a simple escape such as \n, or a negative value which
536 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
537     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
538     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
539     ptr is pointing at the \. On exit, it is on the final character of the escape
540     sequence.
541 nigel 77
542     Arguments:
543     ptrptr points to the pattern position pointer
544     errorcodeptr points to the errorcode variable
545     bracount number of previous extracting brackets
546     options the options bits
547     isclass TRUE if inside a character class
548    
549     Returns: zero or positive => a data character
550     negative => a special escape sequence
551 ph10 213 on error, errorcodeptr is set
552 nigel 77 */
553    
554     static int
555     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
556     int options, BOOL isclass)
557     {
558 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
559     const uschar *ptr = *ptrptr + 1;
560 nigel 77 int c, i;
561    
562 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
563     ptr--; /* Set pointer back to the last byte */
564    
565 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
566    
567     if (c == 0) *errorcodeptr = ERR1;
568    
569 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
570     in a table. A non-zero result is something that can be returned immediately.
571 nigel 77 Otherwise further processing may be required. */
572    
573 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
574     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
575     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
576 nigel 77
577 ph10 97 #else /* EBCDIC coding */
578 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
579 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
580     #endif
581    
582     /* Escapes that need further processing, or are illegal. */
583    
584     else
585     {
586     const uschar *oldptr;
587 nigel 93 BOOL braced, negated;
588    
589 nigel 77 switch (c)
590     {
591     /* A number of Perl escapes are not handled by PCRE. We give an explicit
592     error. */
593    
594 ph10 391 case CHAR_l:
595     case CHAR_L:
596     case CHAR_u:
597     case CHAR_U:
598 nigel 77 *errorcodeptr = ERR37;
599     break;
600    
601 ph10 333 /* \g must be followed by one of a number of specific things:
602 ph10 345
603 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
604     backreference. If negative, it is a relative backreference. This is a Perl
605     5.10 feature.
606 ph10 345
607 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
608     is part of Perl's movement towards a unified syntax for back references. As
609     this is synonymous with \k{name}, we fudge it up by pretending it really
610     was \k.
611 ph10 345
612     (3) For Oniguruma compatibility we also support \g followed by a name or a
613     number either in angle brackets or in single quotes. However, these are
614     (possibly recursive) subroutine calls, _not_ backreferences. Just return
615 ph10 333 the -ESC_g code (cf \k). */
616 nigel 93
617 ph10 391 case CHAR_g:
618     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
619 ph10 333 {
620     c = -ESC_g;
621 ph10 345 break;
622     }
623 ph10 333
624     /* Handle the Perl-compatible cases */
625 ph10 345
626 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
627 nigel 93 {
628 ph10 171 const uschar *p;
629 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
630     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
631     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
632 ph10 171 {
633     c = -ESC_k;
634     break;
635 ph10 172 }
636 nigel 93 braced = TRUE;
637     ptr++;
638     }
639     else braced = FALSE;
640    
641 ph10 391 if (ptr[1] == CHAR_MINUS)
642 nigel 93 {
643     negated = TRUE;
644     ptr++;
645     }
646     else negated = FALSE;
647    
648     c = 0;
649     while ((digitab[ptr[1]] & ctype_digit) != 0)
650 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
651 ph10 220
652 ph10 333 if (c < 0) /* Integer overflow */
653 ph10 213 {
654     *errorcodeptr = ERR61;
655     break;
656 ph10 220 }
657 ph10 345
658 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
659 nigel 93 {
660     *errorcodeptr = ERR57;
661 ph10 213 break;
662 nigel 93 }
663 ph10 345
664 ph10 333 if (c == 0)
665     {
666     *errorcodeptr = ERR58;
667     break;
668 ph10 345 }
669 nigel 93
670     if (negated)
671     {
672     if (c > bracount)
673     {
674     *errorcodeptr = ERR15;
675 ph10 213 break;
676 nigel 93 }
677     c = bracount - (c - 1);
678     }
679    
680     c = -(ESC_REF + c);
681     break;
682    
683 nigel 77 /* The handling of escape sequences consisting of a string of digits
684     starting with one that is not zero is not straightforward. By experiment,
685     the way Perl works seems to be as follows:
686    
687     Outside a character class, the digits are read as a decimal number. If the
688     number is less than 10, or if there are that many previous extracting
689     left brackets, then it is a back reference. Otherwise, up to three octal
690     digits are read to form an escaped byte. Thus \123 is likely to be octal
691     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
692     value is greater than 377, the least significant 8 bits are taken. Inside a
693     character class, \ followed by a digit is always an octal number. */
694    
695 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
696     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
697 nigel 77
698     if (!isclass)
699     {
700     oldptr = ptr;
701 ph10 391 c -= CHAR_0;
702 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
703 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
704 ph10 333 if (c < 0) /* Integer overflow */
705 ph10 213 {
706     *errorcodeptr = ERR61;
707 ph10 220 break;
708     }
709 nigel 77 if (c < 10 || c <= bracount)
710     {
711     c = -(ESC_REF + c);
712     break;
713     }
714     ptr = oldptr; /* Put the pointer back and fall through */
715     }
716    
717     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
718     generates a binary zero byte and treats the digit as a following literal.
719     Thus we have to pull back the pointer by one. */
720    
721 ph10 391 if ((c = *ptr) >= CHAR_8)
722 nigel 77 {
723     ptr--;
724     c = 0;
725     break;
726     }
727    
728     /* \0 always starts an octal number, but we may drop through to here with a
729 nigel 91 larger first octal digit. The original code used just to take the least
730     significant 8 bits of octal numbers (I think this is what early Perls used
731     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
732     than 3 octal digits. */
733 nigel 77
734 ph10 391 case CHAR_0:
735     c -= CHAR_0;
736     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
737     c = c * 8 + *(++ptr) - CHAR_0;
738 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
739 nigel 77 break;
740    
741 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
742     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
743     treated as a data character. */
744 nigel 77
745 ph10 391 case CHAR_x:
746     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
747 nigel 77 {
748     const uschar *pt = ptr + 2;
749 nigel 87 int count = 0;
750    
751 nigel 77 c = 0;
752     while ((digitab[*pt] & ctype_xdigit) != 0)
753     {
754 nigel 87 register int cc = *pt++;
755 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
756 nigel 77 count++;
757 nigel 87
758 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
759     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
760     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
761 ph10 97 #else /* EBCDIC coding */
762 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
763     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
764 nigel 77 #endif
765     }
766 nigel 87
767 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
768 nigel 77 {
769 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
770 nigel 77 ptr = pt;
771     break;
772     }
773 nigel 87
774 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
775     recognize this construct; fall through to the normal \x handling. */
776     }
777    
778 nigel 87 /* Read just a single-byte hex-defined char */
779 nigel 77
780     c = 0;
781     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
782     {
783 ph10 391 int cc; /* Some compilers don't like */
784     cc = *(++ptr); /* ++ in initializers */
785     #ifndef EBCDIC /* ASCII/UTF-8 coding */
786     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
787     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
788 ph10 97 #else /* EBCDIC coding */
789 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
790     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
791 nigel 77 #endif
792     }
793     break;
794    
795 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
796     This coding is ASCII-specific, but then the whole concept of \cx is
797     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
798 nigel 77
799 ph10 391 case CHAR_c:
800 nigel 77 c = *(++ptr);
801     if (c == 0)
802     {
803     *errorcodeptr = ERR2;
804 ph10 213 break;
805 nigel 77 }
806    
807 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
808     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
809 nigel 77 c ^= 0x40;
810 ph10 97 #else /* EBCDIC coding */
811 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
812 nigel 77 c ^= 0xC0;
813     #endif
814     break;
815    
816     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
817 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
818     otherwise, for Perl compatibility, it is a literal. This code looks a bit
819     odd, but there used to be some cases other than the default, and there may
820     be again in future, so I haven't "optimized" it. */
821 nigel 77
822     default:
823     if ((options & PCRE_EXTRA) != 0) switch(c)
824     {
825     default:
826     *errorcodeptr = ERR3;
827     break;
828     }
829     break;
830     }
831     }
832 ph10 514
833     /* Perl supports \N{name} for character names, as well as plain \N for "not
834     newline". PCRE does not support \N{name}. */
835 nigel 77
836 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
837     *errorcodeptr = ERR37;
838    
839 nigel 77 *ptrptr = ptr;
840     return c;
841     }
842    
843    
844    
845     #ifdef SUPPORT_UCP
846     /*************************************************
847     * Handle \P and \p *
848     *************************************************/
849    
850     /* This function is called after \P or \p has been encountered, provided that
851     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
852     pointing at the P or p. On exit, it is pointing at the final character of the
853     escape sequence.
854    
855     Argument:
856     ptrptr points to the pattern position pointer
857     negptr points to a boolean that is set TRUE for negation else FALSE
858 nigel 87 dptr points to an int that is set to the detailed property value
859 nigel 77 errorcodeptr points to the error code variable
860    
861 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
862 nigel 77 */
863    
864     static int
865 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
866 nigel 77 {
867     int c, i, bot, top;
868     const uschar *ptr = *ptrptr;
869 nigel 87 char name[32];
870 nigel 77
871     c = *(++ptr);
872     if (c == 0) goto ERROR_RETURN;
873    
874     *negptr = FALSE;
875    
876 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
877     negation. */
878 nigel 77
879 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
880 nigel 77 {
881 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
882 nigel 77 {
883     *negptr = TRUE;
884     ptr++;
885     }
886 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
887 nigel 77 {
888     c = *(++ptr);
889     if (c == 0) goto ERROR_RETURN;
890 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
891 nigel 77 name[i] = c;
892     }
893 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
894 nigel 77 name[i] = 0;
895     }
896    
897     /* Otherwise there is just one following character */
898    
899     else
900     {
901     name[0] = c;
902     name[1] = 0;
903     }
904    
905     *ptrptr = ptr;
906    
907     /* Search for a recognized property name using binary chop */
908    
909     bot = 0;
910     top = _pcre_utt_size;
911    
912     while (bot < top)
913     {
914 nigel 87 i = (bot + top) >> 1;
915 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
916 nigel 87 if (c == 0)
917     {
918     *dptr = _pcre_utt[i].value;
919     return _pcre_utt[i].type;
920     }
921 nigel 77 if (c > 0) bot = i + 1; else top = i;
922     }
923    
924     *errorcodeptr = ERR47;
925     *ptrptr = ptr;
926     return -1;
927    
928     ERROR_RETURN:
929     *errorcodeptr = ERR46;
930     *ptrptr = ptr;
931     return -1;
932     }
933     #endif
934    
935    
936    
937    
938     /*************************************************
939     * Check for counted repeat *
940     *************************************************/
941    
942     /* This function is called when a '{' is encountered in a place where it might
943     start a quantifier. It looks ahead to see if it really is a quantifier or not.
944     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
945     where the ddds are digits.
946    
947     Arguments:
948     p pointer to the first char after '{'
949    
950     Returns: TRUE or FALSE
951     */
952    
953     static BOOL
954     is_counted_repeat(const uschar *p)
955     {
956     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
957     while ((digitab[*p] & ctype_digit) != 0) p++;
958 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
959 nigel 77
960 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
961     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
962 nigel 77
963     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
964     while ((digitab[*p] & ctype_digit) != 0) p++;
965    
966 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
967 nigel 77 }
968    
969    
970    
971     /*************************************************
972     * Read repeat counts *
973     *************************************************/
974    
975     /* Read an item of the form {n,m} and return the values. This is called only
976     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
977     so the syntax is guaranteed to be correct, but we need to check the values.
978    
979     Arguments:
980     p pointer to first char after '{'
981     minp pointer to int for min
982     maxp pointer to int for max
983     returned as -1 if no max
984     errorcodeptr points to error code variable
985    
986     Returns: pointer to '}' on success;
987     current ptr on error, with errorcodeptr set non-zero
988     */
989    
990     static const uschar *
991     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
992     {
993     int min = 0;
994     int max = -1;
995    
996 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
997     an integer overflow. */
998    
999 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1000 nigel 81 if (min < 0 || min > 65535)
1001     {
1002     *errorcodeptr = ERR5;
1003     return p;
1004     }
1005 nigel 77
1006 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1007     Also, max must not be less than min. */
1008    
1009 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1010 nigel 77 {
1011 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1012 nigel 77 {
1013     max = 0;
1014 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1015 nigel 81 if (max < 0 || max > 65535)
1016     {
1017     *errorcodeptr = ERR5;
1018     return p;
1019     }
1020 nigel 77 if (max < min)
1021     {
1022     *errorcodeptr = ERR4;
1023     return p;
1024     }
1025     }
1026     }
1027    
1028 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1029     '}'. */
1030 nigel 77
1031 nigel 81 *minp = min;
1032     *maxp = max;
1033 nigel 77 return p;
1034     }
1035    
1036    
1037    
1038     /*************************************************
1039 ph10 408 * Subroutine for finding forward reference *
1040 nigel 91 *************************************************/
1041    
1042 ph10 408 /* This recursive function is called only from find_parens() below. The
1043     top-level call starts at the beginning of the pattern. All other calls must
1044     start at a parenthesis. It scans along a pattern's text looking for capturing
1045 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1046     name it is given, it returns its number. Alternatively, if the name is NULL, it
1047 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1048     encountered, the name will be terminated by '>' because that is checked in the
1049 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1050 ph10 408 capturing group numbers - the (?| feature.
1051 nigel 91
1052     Arguments:
1053 ph10 408 ptrptr address of the current character pointer (updated)
1054 ph10 345 cd compile background data
1055 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1056     lorn name length, or subpattern number if name is NULL
1057     xmode TRUE if we are in /x mode
1058 ph10 411 count pointer to the current capturing subpattern number (updated)
1059 nigel 91
1060     Returns: the number of the named subpattern, or -1 if not found
1061     */
1062    
1063     static int
1064 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1065     BOOL xmode, int *count)
1066 nigel 91 {
1067 ph10 408 uschar *ptr = *ptrptr;
1068     int start_count = *count;
1069     int hwm_count = start_count;
1070     BOOL dup_parens = FALSE;
1071 nigel 93
1072 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1073 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1074    
1075     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1076     {
1077     if (ptr[1] == CHAR_QUESTION_MARK &&
1078 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1079 ph10 408 {
1080     ptr += 3;
1081 ph10 411 dup_parens = TRUE;
1082     }
1083 ph10 408
1084     /* Handle a normal, unnamed capturing parenthesis */
1085 ph10 411
1086 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1087     {
1088     *count += 1;
1089     if (name == NULL && *count == lorn) return *count;
1090 ph10 411 ptr++;
1091 ph10 408 }
1092    
1093     /* Handle a condition. If it is an assertion, just carry on so that it
1094     is processed as normal. If not, skip to the closing parenthesis of the
1095 ph10 411 condition (there can't be any nested parens. */
1096    
1097 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1098     {
1099 ph10 411 ptr += 2;
1100 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1101     {
1102     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1103 ph10 411 if (*ptr != 0) ptr++;
1104 ph10 408 }
1105 ph10 411 }
1106    
1107 ph10 408 /* We have either (? or (* and not a condition */
1108    
1109     else
1110 ph10 411 {
1111 ph10 408 ptr += 2;
1112     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1113    
1114     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1115 ph10 411
1116 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1117     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1118     {
1119     int term;
1120     const uschar *thisname;
1121     *count += 1;
1122     if (name == NULL && *count == lorn) return *count;
1123     term = *ptr++;
1124     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1125     thisname = ptr;
1126     while (*ptr != term) ptr++;
1127     if (name != NULL && lorn == ptr - thisname &&
1128     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1129     return *count;
1130 ph10 461 term++;
1131 ph10 411 }
1132 ph10 408 }
1133 ph10 411 }
1134 ph10 408
1135 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1136 ph10 408 bars. */
1137    
1138 nigel 91 for (; *ptr != 0; ptr++)
1139     {
1140 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1141    
1142 ph10 391 if (*ptr == CHAR_BACKSLASH)
1143 nigel 93 {
1144 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1145 ph10 391 if (*ptr == CHAR_Q) for (;;)
1146 nigel 93 {
1147 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1148 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1149 ph10 391 if (*(++ptr) == CHAR_E) break;
1150 nigel 93 }
1151     continue;
1152     }
1153    
1154 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1155     are handled for real. If the first character is '^', skip it. Also, if the
1156     first few characters (either before or after ^) are \Q\E or \E we skip them
1157 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1158 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1159 nigel 93
1160 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1161 nigel 93 {
1162 ph10 340 BOOL negate_class = FALSE;
1163     for (;;)
1164     {
1165 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1166 ph10 340 {
1167 ph10 438 if (ptr[2] == CHAR_E)
1168     ptr+= 2;
1169     else if (strncmp((const char *)ptr+2,
1170 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1171 ph10 438 ptr += 4;
1172 ph10 392 else
1173 ph10 391 break;
1174 ph10 340 }
1175 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1176 ph10 461 {
1177 ph10 340 negate_class = TRUE;
1178 ph10 438 ptr++;
1179 ph10 461 }
1180 ph10 340 else break;
1181     }
1182    
1183     /* If the next character is ']', it is a data character that must be
1184 ph10 341 skipped, except in JavaScript compatibility mode. */
1185 ph10 345
1186 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1187 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1188 ph10 345 ptr++;
1189    
1190 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1191 nigel 93 {
1192 ph10 220 if (*ptr == 0) return -1;
1193 ph10 391 if (*ptr == CHAR_BACKSLASH)
1194 nigel 93 {
1195 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1196 ph10 391 if (*ptr == CHAR_Q) for (;;)
1197 nigel 93 {
1198 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1199 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1200 ph10 391 if (*(++ptr) == CHAR_E) break;
1201 nigel 93 }
1202     continue;
1203     }
1204     }
1205     continue;
1206     }
1207    
1208     /* Skip comments in /x mode */
1209    
1210 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1211 nigel 93 {
1212 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1213 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1214 nigel 93 continue;
1215     }
1216    
1217 ph10 408 /* Check for the special metacharacters */
1218 ph10 411
1219 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1220 nigel 93 {
1221 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1222     if (rc > 0) return rc;
1223     if (*ptr == 0) goto FAIL_EXIT;
1224 nigel 93 }
1225 ph10 411
1226 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1227     {
1228     if (dup_parens && *count < hwm_count) *count = hwm_count;
1229     *ptrptr = ptr;
1230     return -1;
1231     }
1232 ph10 411
1233     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1234 ph10 408 {
1235     if (*count > hwm_count) hwm_count = *count;
1236     *count = start_count;
1237 ph10 411 }
1238 ph10 408 }
1239 nigel 93
1240 ph10 408 FAIL_EXIT:
1241     *ptrptr = ptr;
1242     return -1;
1243     }
1244 nigel 93
1245    
1246    
1247    
1248 ph10 408 /*************************************************
1249     * Find forward referenced subpattern *
1250     *************************************************/
1251 nigel 93
1252 ph10 408 /* This function scans along a pattern's text looking for capturing
1253     subpatterns, and counting them. If it finds a named pattern that matches the
1254     name it is given, it returns its number. Alternatively, if the name is NULL, it
1255     returns when it reaches a given numbered subpattern. This is used for forward
1256     references to subpatterns. We used to be able to start this scan from the
1257     current compiling point, using the current count value from cd->bracount, and
1258     do it all in a single loop, but the addition of the possibility of duplicate
1259     subpattern numbers means that we have to scan from the very start, in order to
1260     take account of such duplicates, and to use a recursive function to keep track
1261     of the different types of group.
1262    
1263     Arguments:
1264     cd compile background data
1265     name name to seek, or NULL if seeking a numbered subpattern
1266     lorn name length, or subpattern number if name is NULL
1267     xmode TRUE if we are in /x mode
1268    
1269     Returns: the number of the found subpattern, or -1 if not found
1270     */
1271    
1272     static int
1273     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1274     {
1275     uschar *ptr = (uschar *)cd->start_pattern;
1276     int count = 0;
1277     int rc;
1278    
1279     /* If the pattern does not start with an opening parenthesis, the first call
1280     to find_parens_sub() will scan right to the end (if necessary). However, if it
1281     does start with a parenthesis, find_parens_sub() will return when it hits the
1282     matching closing parens. That is why we have to have a loop. */
1283    
1284 ph10 411 for (;;)
1285     {
1286 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1287 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1288     }
1289    
1290 ph10 408 return rc;
1291 nigel 91 }
1292    
1293    
1294    
1295 ph10 408
1296 nigel 91 /*************************************************
1297 nigel 77 * Find first significant op code *
1298     *************************************************/
1299    
1300     /* This is called by several functions that scan a compiled expression looking
1301     for a fixed first character, or an anchoring op code etc. It skips over things
1302     that do not influence this. For some calls, a change of option is important.
1303     For some calls, it makes sense to skip negative forward and all backward
1304     assertions, and also the \b assertion; for others it does not.
1305    
1306     Arguments:
1307     code pointer to the start of the group
1308     options pointer to external options
1309     optbit the option bit whose changing is significant, or
1310     zero if none are
1311     skipassert TRUE if certain assertions are to be skipped
1312    
1313     Returns: pointer to the first significant opcode
1314     */
1315    
1316     static const uschar*
1317     first_significant_code(const uschar *code, int *options, int optbit,
1318     BOOL skipassert)
1319     {
1320     for (;;)
1321     {
1322     switch ((int)*code)
1323     {
1324     case OP_OPT:
1325     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1326     *options = (int)code[1];
1327     code += 2;
1328     break;
1329    
1330     case OP_ASSERT_NOT:
1331     case OP_ASSERTBACK:
1332     case OP_ASSERTBACK_NOT:
1333     if (!skipassert) return code;
1334     do code += GET(code, 1); while (*code == OP_ALT);
1335     code += _pcre_OP_lengths[*code];
1336     break;
1337    
1338     case OP_WORD_BOUNDARY:
1339     case OP_NOT_WORD_BOUNDARY:
1340     if (!skipassert) return code;
1341     /* Fall through */
1342    
1343     case OP_CALLOUT:
1344     case OP_CREF:
1345 ph10 459 case OP_NCREF:
1346 nigel 93 case OP_RREF:
1347 ph10 459 case OP_NRREF:
1348 nigel 93 case OP_DEF:
1349 nigel 77 code += _pcre_OP_lengths[*code];
1350     break;
1351    
1352     default:
1353     return code;
1354     }
1355     }
1356     /* Control never reaches here */
1357     }
1358    
1359    
1360    
1361    
1362     /*************************************************
1363 ph10 454 * Find the fixed length of a branch *
1364 nigel 77 *************************************************/
1365    
1366 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1367 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1368 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1369 ph10 454 temporarily terminated with OP_END when this function is called.
1370 nigel 77
1371 ph10 461 This function is called when a backward assertion is encountered, so that if it
1372     fails, the error message can point to the correct place in the pattern.
1373 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1374 ph10 461 because they can be forward references. We solve this by remembering this case
1375 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1376    
1377 nigel 77 Arguments:
1378     code points to the start of the pattern (the bracket)
1379     options the compiling options
1380 ph10 461 atend TRUE if called when the pattern is complete
1381     cd the "compile data" structure
1382 nigel 77
1383 ph10 461 Returns: the fixed length,
1384 ph10 454 or -1 if there is no fixed length,
1385 nigel 77 or -2 if \C was encountered
1386 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1387 nigel 77 */
1388    
1389     static int
1390 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1391 nigel 77 {
1392     int length = -1;
1393    
1394     register int branchlength = 0;
1395     register uschar *cc = code + 1 + LINK_SIZE;
1396    
1397     /* Scan along the opcodes for this branch. If we get to the end of the
1398     branch, check the length against that of the other branches. */
1399    
1400     for (;;)
1401     {
1402     int d;
1403 ph10 454 uschar *ce, *cs;
1404 nigel 77 register int op = *cc;
1405     switch (op)
1406     {
1407 nigel 93 case OP_CBRA:
1408 nigel 77 case OP_BRA:
1409     case OP_ONCE:
1410     case OP_COND:
1411 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1412 nigel 77 if (d < 0) return d;
1413     branchlength += d;
1414     do cc += GET(cc, 1); while (*cc == OP_ALT);
1415     cc += 1 + LINK_SIZE;
1416     break;
1417    
1418     /* Reached end of a branch; if it's a ket it is the end of a nested
1419     call. If it's ALT it is an alternation in a nested call. If it is
1420     END it's the end of the outer call. All can be handled by the same code. */
1421    
1422     case OP_ALT:
1423     case OP_KET:
1424     case OP_KETRMAX:
1425     case OP_KETRMIN:
1426     case OP_END:
1427     if (length < 0) length = branchlength;
1428     else if (length != branchlength) return -1;
1429     if (*cc != OP_ALT) return length;
1430     cc += 1 + LINK_SIZE;
1431     branchlength = 0;
1432     break;
1433 ph10 461
1434 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1435     be OK. If the subroutine is a forward reference, we can't deal with
1436     it until the end of the pattern, so return -3. */
1437 ph10 461
1438 ph10 454 case OP_RECURSE:
1439     if (!atend) return -3;
1440     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1441     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1442     if (cc > cs && cc < ce) return -1; /* Recursion */
1443     d = find_fixedlength(cs + 2, options, atend, cd);
1444 ph10 461 if (d < 0) return d;
1445 ph10 454 branchlength += d;
1446     cc += 1 + LINK_SIZE;
1447 ph10 461 break;
1448 nigel 77
1449     /* Skip over assertive subpatterns */
1450    
1451     case OP_ASSERT:
1452     case OP_ASSERT_NOT:
1453     case OP_ASSERTBACK:
1454     case OP_ASSERTBACK_NOT:
1455     do cc += GET(cc, 1); while (*cc == OP_ALT);
1456     /* Fall through */
1457    
1458     /* Skip over things that don't match chars */
1459    
1460     case OP_REVERSE:
1461     case OP_CREF:
1462 ph10 459 case OP_NCREF:
1463 nigel 93 case OP_RREF:
1464 ph10 459 case OP_NRREF:
1465 nigel 93 case OP_DEF:
1466 nigel 77 case OP_OPT:
1467     case OP_CALLOUT:
1468     case OP_SOD:
1469     case OP_SOM:
1470 ph10 500 case OP_SET_SOM:
1471 nigel 77 case OP_EOD:
1472     case OP_EODN:
1473     case OP_CIRC:
1474     case OP_DOLL:
1475     case OP_NOT_WORD_BOUNDARY:
1476     case OP_WORD_BOUNDARY:
1477     cc += _pcre_OP_lengths[*cc];
1478     break;
1479    
1480     /* Handle literal characters */
1481    
1482     case OP_CHAR:
1483     case OP_CHARNC:
1484 nigel 91 case OP_NOT:
1485 nigel 77 branchlength++;
1486     cc += 2;
1487     #ifdef SUPPORT_UTF8
1488 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1489 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1490 nigel 77 #endif
1491     break;
1492    
1493     /* Handle exact repetitions. The count is already in characters, but we
1494     need to skip over a multibyte character in UTF8 mode. */
1495    
1496     case OP_EXACT:
1497     branchlength += GET2(cc,1);
1498     cc += 4;
1499     #ifdef SUPPORT_UTF8
1500 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1501 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1502 nigel 77 #endif
1503     break;
1504    
1505     case OP_TYPEEXACT:
1506     branchlength += GET2(cc,1);
1507 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1508 nigel 77 cc += 4;
1509     break;
1510    
1511     /* Handle single-char matchers */
1512    
1513     case OP_PROP:
1514     case OP_NOTPROP:
1515 nigel 87 cc += 2;
1516 nigel 77 /* Fall through */
1517    
1518     case OP_NOT_DIGIT:
1519     case OP_DIGIT:
1520     case OP_NOT_WHITESPACE:
1521     case OP_WHITESPACE:
1522     case OP_NOT_WORDCHAR:
1523     case OP_WORDCHAR:
1524     case OP_ANY:
1525 ph10 342 case OP_ALLANY:
1526 nigel 77 branchlength++;
1527     cc++;
1528     break;
1529    
1530     /* The single-byte matcher isn't allowed */
1531    
1532     case OP_ANYBYTE:
1533     return -2;
1534    
1535     /* Check a class for variable quantification */
1536    
1537     #ifdef SUPPORT_UTF8
1538     case OP_XCLASS:
1539     cc += GET(cc, 1) - 33;
1540     /* Fall through */
1541     #endif
1542    
1543     case OP_CLASS:
1544     case OP_NCLASS:
1545     cc += 33;
1546    
1547     switch (*cc)
1548     {
1549     case OP_CRSTAR:
1550     case OP_CRMINSTAR:
1551     case OP_CRQUERY:
1552     case OP_CRMINQUERY:
1553     return -1;
1554    
1555     case OP_CRRANGE:
1556     case OP_CRMINRANGE:
1557     if (GET2(cc,1) != GET2(cc,3)) return -1;
1558     branchlength += GET2(cc,1);
1559     cc += 5;
1560     break;
1561    
1562     default:
1563     branchlength++;
1564     }
1565     break;
1566    
1567     /* Anything else is variable length */
1568    
1569     default:
1570     return -1;
1571     }
1572     }
1573     /* Control never gets here */
1574     }
1575    
1576    
1577    
1578    
1579     /*************************************************
1580 ph10 454 * Scan compiled regex for specific bracket *
1581 nigel 77 *************************************************/
1582    
1583     /* This little function scans through a compiled pattern until it finds a
1584 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1585 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1586     so that it can be called from pcre_study() when finding the minimum matching
1587 ph10 455 length.
1588 nigel 77
1589     Arguments:
1590     code points to start of expression
1591     utf8 TRUE in UTF-8 mode
1592 ph10 454 number the required bracket number or negative to find a lookbehind
1593 nigel 77
1594     Returns: pointer to the opcode for the bracket, or NULL if not found
1595     */
1596    
1597 ph10 455 const uschar *
1598     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1599 nigel 77 {
1600     for (;;)
1601     {
1602     register int c = *code;
1603     if (c == OP_END) return NULL;
1604 nigel 91
1605     /* XCLASS is used for classes that cannot be represented just by a bit
1606     map. This includes negated single high-valued characters. The length in
1607     the table is zero; the actual length is stored in the compiled code. */
1608    
1609     if (c == OP_XCLASS) code += GET(code, 1);
1610 ph10 461
1611 ph10 454 /* Handle recursion */
1612 ph10 461
1613 ph10 454 else if (c == OP_REVERSE)
1614     {
1615 ph10 461 if (number < 0) return (uschar *)code;
1616 ph10 454 code += _pcre_OP_lengths[c];
1617     }
1618 nigel 91
1619 nigel 93 /* Handle capturing bracket */
1620 nigel 91
1621 nigel 93 else if (c == OP_CBRA)
1622 nigel 77 {
1623 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1624 nigel 77 if (n == number) return (uschar *)code;
1625 nigel 93 code += _pcre_OP_lengths[c];
1626 nigel 77 }
1627 nigel 91
1628 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1629     repeated character types, we have to test for \p and \P, which have an extra
1630 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1631 ph10 510 must add in its length. */
1632 nigel 91
1633 nigel 77 else
1634     {
1635 ph10 218 switch(c)
1636     {
1637     case OP_TYPESTAR:
1638     case OP_TYPEMINSTAR:
1639     case OP_TYPEPLUS:
1640     case OP_TYPEMINPLUS:
1641     case OP_TYPEQUERY:
1642     case OP_TYPEMINQUERY:
1643     case OP_TYPEPOSSTAR:
1644     case OP_TYPEPOSPLUS:
1645     case OP_TYPEPOSQUERY:
1646     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1647 ph10 220 break;
1648 ph10 221
1649     case OP_TYPEUPTO:
1650     case OP_TYPEMINUPTO:
1651     case OP_TYPEEXACT:
1652     case OP_TYPEPOSUPTO:
1653     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1654     break;
1655 ph10 512
1656 ph10 510 case OP_MARK:
1657     case OP_PRUNE_ARG:
1658     case OP_SKIP_ARG:
1659     case OP_THEN_ARG:
1660     code += code[1];
1661 ph10 512 break;
1662 ph10 220 }
1663    
1664 ph10 218 /* Add in the fixed length from the table */
1665 ph10 220
1666 nigel 77 code += _pcre_OP_lengths[c];
1667 ph10 220
1668 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1669     a multi-byte character. The length in the table is a minimum, so we have to
1670     arrange to skip the extra bytes. */
1671 ph10 220
1672 ph10 107 #ifdef SUPPORT_UTF8
1673 nigel 77 if (utf8) switch(c)
1674     {
1675     case OP_CHAR:
1676     case OP_CHARNC:
1677     case OP_EXACT:
1678     case OP_UPTO:
1679     case OP_MINUPTO:
1680 nigel 93 case OP_POSUPTO:
1681 nigel 77 case OP_STAR:
1682     case OP_MINSTAR:
1683 nigel 93 case OP_POSSTAR:
1684 nigel 77 case OP_PLUS:
1685     case OP_MINPLUS:
1686 nigel 93 case OP_POSPLUS:
1687 nigel 77 case OP_QUERY:
1688     case OP_MINQUERY:
1689 nigel 93 case OP_POSQUERY:
1690     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1691 nigel 77 break;
1692     }
1693 ph10 369 #else
1694     (void)(utf8); /* Keep compiler happy by referencing function argument */
1695 ph10 111 #endif
1696 nigel 77 }
1697     }
1698     }
1699    
1700    
1701    
1702     /*************************************************
1703     * Scan compiled regex for recursion reference *
1704     *************************************************/
1705    
1706     /* This little function scans through a compiled pattern until it finds an
1707     instance of OP_RECURSE.
1708    
1709     Arguments:
1710     code points to start of expression
1711     utf8 TRUE in UTF-8 mode
1712    
1713     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1714     */
1715    
1716     static const uschar *
1717     find_recurse(const uschar *code, BOOL utf8)
1718     {
1719     for (;;)
1720     {
1721     register int c = *code;
1722     if (c == OP_END) return NULL;
1723 nigel 91 if (c == OP_RECURSE) return code;
1724 ph10 220
1725 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1726     map. This includes negated single high-valued characters. The length in
1727     the table is zero; the actual length is stored in the compiled code. */
1728    
1729     if (c == OP_XCLASS) code += GET(code, 1);
1730    
1731 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1732     repeated character types, we have to test for \p and \P, which have an extra
1733 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1734 ph10 510 must add in its length. */
1735 nigel 91
1736 nigel 77 else
1737     {
1738 ph10 218 switch(c)
1739     {
1740     case OP_TYPESTAR:
1741     case OP_TYPEMINSTAR:
1742     case OP_TYPEPLUS:
1743     case OP_TYPEMINPLUS:
1744     case OP_TYPEQUERY:
1745     case OP_TYPEMINQUERY:
1746     case OP_TYPEPOSSTAR:
1747     case OP_TYPEPOSPLUS:
1748     case OP_TYPEPOSQUERY:
1749     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1750 ph10 220 break;
1751 ph10 221
1752     case OP_TYPEPOSUPTO:
1753     case OP_TYPEUPTO:
1754     case OP_TYPEMINUPTO:
1755     case OP_TYPEEXACT:
1756     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1757     break;
1758 ph10 512
1759 ph10 510 case OP_MARK:
1760     case OP_PRUNE_ARG:
1761     case OP_SKIP_ARG:
1762     case OP_THEN_ARG:
1763     code += code[1];
1764 ph10 512 break;
1765 ph10 220 }
1766    
1767 ph10 218 /* Add in the fixed length from the table */
1768    
1769 nigel 77 code += _pcre_OP_lengths[c];
1770 ph10 220
1771 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1772     by a multi-byte character. The length in the table is a minimum, so we have
1773     to arrange to skip the extra bytes. */
1774 ph10 220
1775 ph10 107 #ifdef SUPPORT_UTF8
1776 nigel 77 if (utf8) switch(c)
1777     {
1778     case OP_CHAR:
1779     case OP_CHARNC:
1780     case OP_EXACT:
1781     case OP_UPTO:
1782     case OP_MINUPTO:
1783 nigel 93 case OP_POSUPTO:
1784 nigel 77 case OP_STAR:
1785     case OP_MINSTAR:
1786 nigel 93 case OP_POSSTAR:
1787 nigel 77 case OP_PLUS:
1788     case OP_MINPLUS:
1789 nigel 93 case OP_POSPLUS:
1790 nigel 77 case OP_QUERY:
1791     case OP_MINQUERY:
1792 nigel 93 case OP_POSQUERY:
1793     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1794 nigel 77 break;
1795     }
1796 ph10 369 #else
1797     (void)(utf8); /* Keep compiler happy by referencing function argument */
1798 ph10 111 #endif
1799 nigel 77 }
1800     }
1801     }
1802    
1803    
1804    
1805     /*************************************************
1806     * Scan compiled branch for non-emptiness *
1807     *************************************************/
1808    
1809     /* This function scans through a branch of a compiled pattern to see whether it
1810 nigel 93 can match the empty string or not. It is called from could_be_empty()
1811     below and from compile_branch() when checking for an unlimited repeat of a
1812     group that can match nothing. Note that first_significant_code() skips over
1813 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1814     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1815     bracket whose current branch will already have been scanned.
1816 nigel 77
1817     Arguments:
1818     code points to start of search
1819     endcode points to where to stop
1820     utf8 TRUE if in UTF8 mode
1821 ph10 503 cd contains pointers to tables etc.
1822 nigel 77
1823     Returns: TRUE if what is matched could be empty
1824     */
1825    
1826     static BOOL
1827 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1828     compile_data *cd)
1829 nigel 77 {
1830     register int c;
1831 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1832 nigel 77 code < endcode;
1833     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1834     {
1835     const uschar *ccode;
1836    
1837     c = *code;
1838 ph10 507
1839 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1840 ph10 282 first_significant_code() with a TRUE final argument. */
1841 ph10 286
1842 ph10 282 if (c == OP_ASSERT)
1843 ph10 286 {
1844 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1845     c = *code;
1846     continue;
1847 ph10 286 }
1848 ph10 172
1849 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1850 nigel 77
1851 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1852 ph10 170 {
1853 ph10 172 code += _pcre_OP_lengths[c];
1854 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1855     c = *code;
1856     continue;
1857     }
1858 ph10 507
1859 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1860     implies a subroutine call, we can scan it. */
1861 ph10 507
1862 ph10 503 if (c == OP_RECURSE)
1863     {
1864 ph10 507 BOOL empty_branch = FALSE;
1865 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1866     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1867     do
1868     {
1869 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1870     {
1871     empty_branch = TRUE;
1872 ph10 507 break;
1873     }
1874 ph10 503 scode += GET(scode, 1);
1875     }
1876     while (*scode == OP_ALT);
1877 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1878 ph10 503 continue;
1879 ph10 507 }
1880 ph10 170
1881     /* For other groups, scan the branches. */
1882 ph10 172
1883 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1884 nigel 77 {
1885     BOOL empty_branch;
1886     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1887 ph10 406
1888     /* If a conditional group has only one branch, there is a second, implied,
1889 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1890     Otherwise, scan the individual branches of the group. */
1891 ph10 406
1892 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1893 nigel 77 code += GET(code, 1);
1894 ph10 395 else
1895 ph10 406 {
1896 ph10 395 empty_branch = FALSE;
1897     do
1898     {
1899 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1900 ph10 395 empty_branch = TRUE;
1901     code += GET(code, 1);
1902     }
1903     while (*code == OP_ALT);
1904     if (!empty_branch) return FALSE; /* All branches are non-empty */
1905 nigel 77 }
1906 ph10 406
1907 ph10 172 c = *code;
1908 nigel 93 continue;
1909 nigel 77 }
1910    
1911 nigel 93 /* Handle the other opcodes */
1912    
1913     switch (c)
1914 nigel 77 {
1915 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1916     cannot be represented just by a bit map. This includes negated single
1917     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1918 ph10 220 actual length is stored in the compiled code, so we must update "code"
1919 ph10 216 here. */
1920 nigel 77
1921     #ifdef SUPPORT_UTF8
1922     case OP_XCLASS:
1923 ph10 216 ccode = code += GET(code, 1);
1924 nigel 77 goto CHECK_CLASS_REPEAT;
1925     #endif
1926    
1927     case OP_CLASS:
1928     case OP_NCLASS:
1929     ccode = code + 33;
1930    
1931     #ifdef SUPPORT_UTF8
1932     CHECK_CLASS_REPEAT:
1933     #endif
1934    
1935     switch (*ccode)
1936     {
1937     case OP_CRSTAR: /* These could be empty; continue */
1938     case OP_CRMINSTAR:
1939     case OP_CRQUERY:
1940     case OP_CRMINQUERY:
1941     break;
1942    
1943     default: /* Non-repeat => class must match */
1944     case OP_CRPLUS: /* These repeats aren't empty */
1945     case OP_CRMINPLUS:
1946     return FALSE;
1947    
1948     case OP_CRRANGE:
1949     case OP_CRMINRANGE:
1950     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1951     break;
1952     }
1953     break;
1954    
1955     /* Opcodes that must match a character */
1956    
1957     case OP_PROP:
1958     case OP_NOTPROP:
1959     case OP_EXTUNI:
1960     case OP_NOT_DIGIT:
1961     case OP_DIGIT:
1962     case OP_NOT_WHITESPACE:
1963     case OP_WHITESPACE:
1964     case OP_NOT_WORDCHAR:
1965     case OP_WORDCHAR:
1966     case OP_ANY:
1967 ph10 345 case OP_ALLANY:
1968 nigel 77 case OP_ANYBYTE:
1969     case OP_CHAR:
1970     case OP_CHARNC:
1971     case OP_NOT:
1972     case OP_PLUS:
1973     case OP_MINPLUS:
1974 nigel 93 case OP_POSPLUS:
1975 nigel 77 case OP_EXACT:
1976     case OP_NOTPLUS:
1977     case OP_NOTMINPLUS:
1978 nigel 93 case OP_NOTPOSPLUS:
1979 nigel 77 case OP_NOTEXACT:
1980     case OP_TYPEPLUS:
1981     case OP_TYPEMINPLUS:
1982 nigel 93 case OP_TYPEPOSPLUS:
1983 nigel 77 case OP_TYPEEXACT:
1984     return FALSE;
1985 ph10 227
1986     /* These are going to continue, as they may be empty, but we have to
1987     fudge the length for the \p and \P cases. */
1988    
1989 ph10 224 case OP_TYPESTAR:
1990     case OP_TYPEMINSTAR:
1991     case OP_TYPEPOSSTAR:
1992     case OP_TYPEQUERY:
1993     case OP_TYPEMINQUERY:
1994     case OP_TYPEPOSQUERY:
1995     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1996 ph10 227 break;
1997    
1998 ph10 224 /* Same for these */
1999 ph10 227
2000 ph10 224 case OP_TYPEUPTO:
2001     case OP_TYPEMINUPTO:
2002     case OP_TYPEPOSUPTO:
2003     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2004     break;
2005 nigel 77
2006     /* End of branch */
2007    
2008     case OP_KET:
2009     case OP_KETRMAX:
2010     case OP_KETRMIN:
2011     case OP_ALT:
2012     return TRUE;
2013    
2014 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2015     MINUPTO, and POSUPTO may be followed by a multibyte character */
2016 nigel 77
2017     #ifdef SUPPORT_UTF8
2018     case OP_STAR:
2019     case OP_MINSTAR:
2020 nigel 93 case OP_POSSTAR:
2021 nigel 77 case OP_QUERY:
2022     case OP_MINQUERY:
2023 nigel 93 case OP_POSQUERY:
2024 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2025     break;
2026 ph10 461
2027 nigel 77 case OP_UPTO:
2028     case OP_MINUPTO:
2029 nigel 93 case OP_POSUPTO:
2030 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2031 nigel 77 break;
2032     #endif
2033 ph10 503
2034 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2035     string. */
2036    
2037     case OP_MARK:
2038     case OP_PRUNE_ARG:
2039     case OP_SKIP_ARG:
2040     case OP_THEN_ARG:
2041     code += code[1];
2042 ph10 512 break;
2043 ph10 510
2044 ph10 503 /* None of the remaining opcodes are required to match a character. */
2045 ph10 507
2046 ph10 503 default:
2047 ph10 507 break;
2048 nigel 77 }
2049     }
2050    
2051     return TRUE;
2052     }
2053    
2054    
2055    
2056     /*************************************************
2057     * Scan compiled regex for non-emptiness *
2058     *************************************************/
2059    
2060     /* This function is called to check for left recursive calls. We want to check
2061     the current branch of the current pattern to see if it could match the empty
2062     string. If it could, we must look outwards for branches at other levels,
2063     stopping when we pass beyond the bracket which is the subject of the recursion.
2064    
2065     Arguments:
2066     code points to start of the recursion
2067     endcode points to where to stop (current RECURSE item)
2068     bcptr points to the chain of current (unclosed) branch starts
2069     utf8 TRUE if in UTF-8 mode
2070 ph10 507 cd pointers to tables etc
2071 nigel 77
2072     Returns: TRUE if what is matched could be empty
2073     */
2074    
2075     static BOOL
2076     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2077 ph10 503 BOOL utf8, compile_data *cd)
2078 nigel 77 {
2079 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2080 nigel 77 {
2081 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2082 ph10 475 return FALSE;
2083 nigel 77 bcptr = bcptr->outer;
2084     }
2085     return TRUE;
2086     }
2087    
2088    
2089    
2090     /*************************************************
2091     * Check for POSIX class syntax *
2092     *************************************************/
2093    
2094     /* This function is called when the sequence "[:" or "[." or "[=" is
2095 ph10 295 encountered in a character class. It checks whether this is followed by a
2096 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2097 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2098 nigel 77
2099 ph10 298 Originally, this function only recognized a sequence of letters between the
2100     terminators, but it seems that Perl recognizes any sequence of characters,
2101     though of course unknown POSIX names are subsequently rejected. Perl gives an
2102     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2103     didn't consider this to be a POSIX class. Likewise for [:1234:].
2104 ph10 295
2105 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2106     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2107     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2108     below handles the special case of \], but does not try to do any other escape
2109     processing. This makes it different from Perl for cases such as [:l\ower:]
2110 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2111 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2112 ph10 295 I think.
2113    
2114     Arguments:
2115 nigel 77 ptr pointer to the initial [
2116     endptr where to return the end pointer
2117    
2118     Returns: TRUE or FALSE
2119     */
2120    
2121     static BOOL
2122 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2123 nigel 77 {
2124     int terminator; /* Don't combine these lines; the Solaris cc */
2125     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2126 ph10 295 for (++ptr; *ptr != 0; ptr++)
2127 nigel 77 {
2128 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2129 ph10 298 {
2130 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2131     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2132 ph10 295 {
2133     *endptr = ptr;
2134     return TRUE;
2135 ph10 298 }
2136     }
2137     }
2138 nigel 77 return FALSE;
2139     }
2140    
2141    
2142    
2143    
2144     /*************************************************
2145     * Check POSIX class name *
2146     *************************************************/
2147    
2148     /* This function is called to check the name given in a POSIX-style class entry
2149     such as [:alnum:].
2150    
2151     Arguments:
2152     ptr points to the first letter
2153     len the length of the name
2154    
2155     Returns: a value representing the name, or -1 if unknown
2156     */
2157    
2158     static int
2159     check_posix_name(const uschar *ptr, int len)
2160     {
2161 ph10 240 const char *pn = posix_names;
2162 nigel 77 register int yield = 0;
2163     while (posix_name_lengths[yield] != 0)
2164     {
2165     if (len == posix_name_lengths[yield] &&
2166 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2167 ph10 243 pn += posix_name_lengths[yield] + 1;
2168 nigel 77 yield++;
2169     }
2170     return -1;
2171     }
2172    
2173    
2174     /*************************************************
2175     * Adjust OP_RECURSE items in repeated group *
2176     *************************************************/
2177    
2178     /* OP_RECURSE items contain an offset from the start of the regex to the group
2179     that is referenced. This means that groups can be replicated for fixed
2180     repetition simply by copying (because the recursion is allowed to refer to
2181     earlier groups that are outside the current group). However, when a group is
2182 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2183     inserted before it, after it has been compiled. This means that any OP_RECURSE
2184     items within it that refer to the group itself or any contained groups have to
2185     have their offsets adjusted. That one of the jobs of this function. Before it
2186     is called, the partially compiled regex must be temporarily terminated with
2187     OP_END.
2188 nigel 77
2189 nigel 93 This function has been extended with the possibility of forward references for
2190     recursions and subroutine calls. It must also check the list of such references
2191     for the group we are dealing with. If it finds that one of the recursions in
2192     the current group is on this list, it adjusts the offset in the list, not the
2193     value in the reference (which is a group number).
2194    
2195 nigel 77 Arguments:
2196     group points to the start of the group
2197     adjust the amount by which the group is to be moved
2198     utf8 TRUE in UTF-8 mode
2199     cd contains pointers to tables etc.
2200 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2201 nigel 77
2202     Returns: nothing
2203     */
2204    
2205     static void
2206 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2207     uschar *save_hwm)
2208 nigel 77 {
2209     uschar *ptr = group;
2210 ph10 224
2211 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2212     {
2213 nigel 93 int offset;
2214     uschar *hc;
2215    
2216     /* See if this recursion is on the forward reference list. If so, adjust the
2217     reference. */
2218 ph10 345
2219 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2220     {
2221     offset = GET(hc, 0);
2222     if (cd->start_code + offset == ptr + 1)
2223     {
2224     PUT(hc, 0, offset + adjust);
2225     break;
2226     }
2227     }
2228    
2229     /* Otherwise, adjust the recursion offset if it's after the start of this
2230     group. */
2231    
2232     if (hc >= cd->hwm)
2233     {
2234     offset = GET(ptr, 1);
2235     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2236     }
2237    
2238 nigel 77 ptr += 1 + LINK_SIZE;
2239     }
2240     }
2241    
2242    
2243    
2244     /*************************************************
2245     * Insert an automatic callout point *
2246     *************************************************/
2247    
2248     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2249     callout points before each pattern item.
2250    
2251     Arguments:
2252     code current code pointer
2253     ptr current pattern pointer
2254     cd pointers to tables etc
2255    
2256     Returns: new code pointer
2257     */
2258    
2259     static uschar *
2260     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2261     {
2262     *code++ = OP_CALLOUT;
2263     *code++ = 255;
2264     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2265     PUT(code, LINK_SIZE, 0); /* Default length */
2266     return code + 2*LINK_SIZE;
2267     }
2268    
2269    
2270    
2271     /*************************************************
2272     * Complete a callout item *
2273     *************************************************/
2274    
2275     /* A callout item contains the length of the next item in the pattern, which
2276     we can't fill in till after we have reached the relevant point. This is used
2277     for both automatic and manual callouts.
2278    
2279     Arguments:
2280     previous_callout points to previous callout item
2281     ptr current pattern pointer
2282     cd pointers to tables etc
2283    
2284     Returns: nothing
2285     */
2286    
2287     static void
2288     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2289     {
2290     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2291     PUT(previous_callout, 2 + LINK_SIZE, length);
2292     }
2293    
2294    
2295    
2296     #ifdef SUPPORT_UCP
2297     /*************************************************
2298     * Get othercase range *
2299     *************************************************/
2300    
2301     /* This function is passed the start and end of a class range, in UTF-8 mode
2302     with UCP support. It searches up the characters, looking for internal ranges of
2303     characters in the "other" case. Each call returns the next one, updating the
2304     start address.
2305    
2306     Arguments:
2307     cptr points to starting character value; updated
2308     d end value
2309     ocptr where to put start of othercase range
2310     odptr where to put end of othercase range
2311    
2312     Yield: TRUE when range returned; FALSE when no more
2313     */
2314    
2315     static BOOL
2316 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2317     unsigned int *odptr)
2318 nigel 77 {
2319 nigel 93 unsigned int c, othercase, next;
2320 nigel 77
2321     for (c = *cptr; c <= d; c++)
2322 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2323 nigel 77
2324     if (c > d) return FALSE;
2325    
2326     *ocptr = othercase;
2327     next = othercase + 1;
2328    
2329     for (++c; c <= d; c++)
2330     {
2331 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2332 nigel 77 next++;
2333     }
2334    
2335     *odptr = next - 1;
2336     *cptr = c;
2337    
2338     return TRUE;
2339     }
2340     #endif /* SUPPORT_UCP */
2341    
2342    
2343 nigel 93
2344 nigel 77 /*************************************************
2345 nigel 93 * Check if auto-possessifying is possible *
2346     *************************************************/
2347    
2348     /* This function is called for unlimited repeats of certain items, to see
2349     whether the next thing could possibly match the repeated item. If not, it makes
2350     sense to automatically possessify the repeated item.
2351    
2352     Arguments:
2353     op_code the repeated op code
2354     this data for this item, depends on the opcode
2355     utf8 TRUE in UTF-8 mode
2356     utf8_char used for utf8 character bytes, NULL if not relevant
2357     ptr next character in pattern
2358     options options bits
2359     cd contains pointers to tables etc.
2360    
2361     Returns: TRUE if possessifying is wanted
2362     */
2363    
2364     static BOOL
2365     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2366     const uschar *ptr, int options, compile_data *cd)
2367     {
2368     int next;
2369    
2370     /* Skip whitespace and comments in extended mode */
2371    
2372     if ((options & PCRE_EXTENDED) != 0)
2373     {
2374     for (;;)
2375     {
2376     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2377 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2378 nigel 93 {
2379     while (*(++ptr) != 0)
2380     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2381     }
2382     else break;
2383     }
2384     }
2385    
2386     /* If the next item is one that we can handle, get its value. A non-negative
2387     value is a character, a negative value is an escape value. */
2388    
2389 ph10 391 if (*ptr == CHAR_BACKSLASH)
2390 nigel 93 {
2391     int temperrorcode = 0;
2392     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2393     if (temperrorcode != 0) return FALSE;
2394     ptr++; /* Point after the escape sequence */
2395     }
2396    
2397     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2398     {
2399     #ifdef SUPPORT_UTF8
2400     if (utf8) { GETCHARINC(next, ptr); } else
2401     #endif
2402     next = *ptr++;
2403     }
2404    
2405     else return FALSE;
2406    
2407     /* Skip whitespace and comments in extended mode */
2408    
2409     if ((options & PCRE_EXTENDED) != 0)
2410     {
2411     for (;;)
2412     {
2413     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2414 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2415 nigel 93 {
2416     while (*(++ptr) != 0)
2417     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2418     }
2419     else break;
2420     }
2421     }
2422    
2423     /* If the next thing is itself optional, we have to give up. */
2424    
2425 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2426 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2427     return FALSE;
2428 nigel 93
2429     /* Now compare the next item with the previous opcode. If the previous is a
2430     positive single character match, "item" either contains the character or, if
2431     "item" is greater than 127 in utf8 mode, the character's bytes are in
2432     utf8_char. */
2433    
2434    
2435     /* Handle cases when the next item is a character. */
2436    
2437     if (next >= 0) switch(op_code)
2438     {
2439     case OP_CHAR:
2440     #ifdef SUPPORT_UTF8
2441     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2442 ph10 369 #else
2443     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2444 nigel 93 #endif
2445     return item != next;
2446    
2447     /* For CHARNC (caseless character) we must check the other case. If we have
2448     Unicode property support, we can use it to test the other case of
2449     high-valued characters. */
2450    
2451     case OP_CHARNC:
2452     #ifdef SUPPORT_UTF8
2453     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2454     #endif
2455     if (item == next) return FALSE;
2456     #ifdef SUPPORT_UTF8
2457     if (utf8)
2458     {
2459     unsigned int othercase;
2460     if (next < 128) othercase = cd->fcc[next]; else
2461     #ifdef SUPPORT_UCP
2462 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2463 nigel 93 #else
2464     othercase = NOTACHAR;
2465     #endif
2466     return (unsigned int)item != othercase;
2467     }
2468     else
2469     #endif /* SUPPORT_UTF8 */
2470     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2471    
2472     /* For OP_NOT, "item" must be a single-byte character. */
2473    
2474     case OP_NOT:
2475     if (item == next) return TRUE;
2476     if ((options & PCRE_CASELESS) == 0) return FALSE;
2477     #ifdef SUPPORT_UTF8
2478     if (utf8)
2479     {
2480     unsigned int othercase;
2481     if (next < 128) othercase = cd->fcc[next]; else
2482     #ifdef SUPPORT_UCP
2483 ph10 349 othercase = UCD_OTHERCASE(next);
2484 nigel 93 #else
2485     othercase = NOTACHAR;
2486     #endif
2487     return (unsigned int)item == othercase;
2488     }
2489     else
2490     #endif /* SUPPORT_UTF8 */
2491     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2492    
2493     case OP_DIGIT:
2494     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2495    
2496     case OP_NOT_DIGIT:
2497     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2498    
2499     case OP_WHITESPACE:
2500     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2501    
2502     case OP_NOT_WHITESPACE:
2503     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2504    
2505     case OP_WORDCHAR:
2506     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2507    
2508     case OP_NOT_WORDCHAR:
2509     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2510    
2511 ph10 180 case OP_HSPACE:
2512     case OP_NOT_HSPACE:
2513     switch(next)
2514     {
2515     case 0x09:
2516     case 0x20:
2517     case 0xa0:
2518     case 0x1680:
2519     case 0x180e:
2520     case 0x2000:
2521     case 0x2001:
2522     case 0x2002:
2523     case 0x2003:
2524     case 0x2004:
2525     case 0x2005:
2526     case 0x2006:
2527     case 0x2007:
2528     case 0x2008:
2529     case 0x2009:
2530     case 0x200A:
2531     case 0x202f:
2532     case 0x205f:
2533     case 0x3000:
2534     return op_code != OP_HSPACE;
2535     default:
2536     return op_code == OP_HSPACE;
2537     }
2538    
2539     case OP_VSPACE:
2540     case OP_NOT_VSPACE:
2541     switch(next)
2542     {
2543     case 0x0a:
2544     case 0x0b:
2545     case 0x0c:
2546     case 0x0d:
2547     case 0x85:
2548     case 0x2028:
2549     case 0x2029:
2550     return op_code != OP_VSPACE;
2551     default:
2552     return op_code == OP_VSPACE;
2553     }
2554    
2555 nigel 93 default:
2556     return FALSE;
2557     }
2558    
2559    
2560     /* Handle the case when the next item is \d, \s, etc. */
2561    
2562     switch(op_code)
2563     {
2564     case OP_CHAR:
2565     case OP_CHARNC:
2566     #ifdef SUPPORT_UTF8
2567     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2568     #endif
2569     switch(-next)
2570     {
2571     case ESC_d:
2572     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2573    
2574     case ESC_D:
2575     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2576    
2577     case ESC_s:
2578     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2579    
2580     case ESC_S:
2581     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2582    
2583     case ESC_w:
2584     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2585    
2586     case ESC_W:
2587     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2588 ph10 182
2589 ph10 180 case ESC_h:
2590     case ESC_H:
2591     switch(item)
2592     {
2593     case 0x09:
2594     case 0x20:
2595     case 0xa0:
2596     case 0x1680:
2597     case 0x180e:
2598     case 0x2000:
2599     case 0x2001:
2600     case 0x2002:
2601     case 0x2003:
2602     case 0x2004:
2603     case 0x2005:
2604     case 0x2006:
2605     case 0x2007:
2606     case 0x2008:
2607     case 0x2009:
2608     case 0x200A:
2609     case 0x202f:
2610     case 0x205f:
2611     case 0x3000:
2612     return -next != ESC_h;
2613     default:
2614     return -next == ESC_h;
2615 ph10 182 }
2616    
2617 ph10 180 case ESC_v:
2618     case ESC_V:
2619     switch(item)
2620     {
2621     case 0x0a:
2622     case 0x0b:
2623     case 0x0c:
2624     case 0x0d:
2625     case 0x85:
2626     case 0x2028:
2627     case 0x2029:
2628     return -next != ESC_v;
2629     default:
2630     return -next == ESC_v;
2631 ph10 182 }
2632 nigel 93
2633     default:
2634     return FALSE;
2635     }
2636    
2637     case OP_DIGIT:
2638 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2639     next == -ESC_h || next == -ESC_v;
2640 nigel 93
2641     case OP_NOT_DIGIT:
2642     return next == -ESC_d;
2643    
2644     case OP_WHITESPACE:
2645     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2646    
2647     case OP_NOT_WHITESPACE:
2648 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2649 nigel 93
2650 ph10 180 case OP_HSPACE:
2651     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2652    
2653     case OP_NOT_HSPACE:
2654     return next == -ESC_h;
2655 ph10 182
2656 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2657 ph10 182 case OP_VSPACE:
2658 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2659    
2660     case OP_NOT_VSPACE:
2661 ph10 182 return next == -ESC_v;
2662 ph10 180
2663 nigel 93 case OP_WORDCHAR:
2664 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2665 nigel 93
2666     case OP_NOT_WORDCHAR:
2667     return next == -ESC_w || next == -ESC_d;
2668 ph10 182
2669 nigel 93 default:
2670     return FALSE;
2671     }
2672    
2673     /* Control does not reach here */
2674     }
2675    
2676    
2677    
2678     /*************************************************
2679 nigel 77 * Compile one branch *
2680     *************************************************/
2681    
2682 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2683 nigel 77 changed during the branch, the pointer is used to change the external options
2684 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2685     to find out the amount of memory needed, as well as during the real compile
2686     phase. The value of lengthptr distinguishes the two phases.
2687 nigel 77
2688     Arguments:
2689     optionsptr pointer to the option bits
2690     codeptr points to the pointer to the current code point
2691     ptrptr points to the current pattern pointer
2692     errorcodeptr points to error code variable
2693     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2694     reqbyteptr set to the last literal character required, else < 0
2695     bcptr points to current branch chain
2696     cd contains pointers to tables etc.
2697 nigel 93 lengthptr NULL during the real compile phase
2698     points to length accumulator during pre-compile phase
2699 nigel 77
2700     Returns: TRUE on success
2701     FALSE, with *errorcodeptr set non-zero on error
2702     */
2703    
2704     static BOOL
2705 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2706     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2707     compile_data *cd, int *lengthptr)
2708 nigel 77 {
2709     int repeat_type, op_type;
2710     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2711     int bravalue = 0;
2712     int greedy_default, greedy_non_default;
2713     int firstbyte, reqbyte;
2714     int zeroreqbyte, zerofirstbyte;
2715     int req_caseopt, reqvary, tempreqvary;
2716     int options = *optionsptr;
2717     int after_manual_callout = 0;
2718 nigel 93 int length_prevgroup = 0;
2719 nigel 77 register int c;
2720     register uschar *code = *codeptr;
2721 nigel 93 uschar *last_code = code;
2722     uschar *orig_code = code;
2723 nigel 77 uschar *tempcode;
2724     BOOL inescq = FALSE;
2725     BOOL groupsetfirstbyte = FALSE;
2726     const uschar *ptr = *ptrptr;
2727     const uschar *tempptr;
2728     uschar *previous = NULL;
2729     uschar *previous_callout = NULL;
2730 nigel 93 uschar *save_hwm = NULL;
2731 nigel 77 uschar classbits[32];
2732    
2733     #ifdef SUPPORT_UTF8
2734     BOOL class_utf8;
2735     BOOL utf8 = (options & PCRE_UTF8) != 0;
2736     uschar *class_utf8data;
2737 ph10 300 uschar *class_utf8data_base;
2738 nigel 77 uschar utf8_char[6];
2739     #else
2740     BOOL utf8 = FALSE;
2741 nigel 93 uschar *utf8_char = NULL;
2742 nigel 77 #endif
2743    
2744 ph10 475 #ifdef PCRE_DEBUG
2745 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2746     #endif
2747    
2748 nigel 77 /* Set up the default and non-default settings for greediness */
2749    
2750     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2751     greedy_non_default = greedy_default ^ 1;
2752    
2753     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2754     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2755     matches a non-fixed char first char; reqbyte just remains unset if we never
2756     find one.
2757    
2758     When we hit a repeat whose minimum is zero, we may have to adjust these values
2759     to take the zero repeat into account. This is implemented by setting them to
2760     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2761     item types that can be repeated set these backoff variables appropriately. */
2762    
2763     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2764    
2765     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2766     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2767     value > 255. It is added into the firstbyte or reqbyte variables to record the
2768     case status of the value. This is used only for ASCII characters. */
2769    
2770     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2771    
2772     /* Switch on next character until the end of the branch */
2773    
2774     for (;; ptr++)
2775     {
2776     BOOL negate_class;
2777 ph10 286 BOOL should_flip_negation;
2778 nigel 77 BOOL possessive_quantifier;
2779     BOOL is_quantifier;
2780 nigel 93 BOOL is_recurse;
2781 ph10 180 BOOL reset_bracount;
2782 nigel 77 int class_charcount;
2783     int class_lastchar;
2784     int newoptions;
2785     int recno;
2786 ph10 172 int refsign;
2787 nigel 77 int skipbytes;
2788     int subreqbyte;
2789     int subfirstbyte;
2790 nigel 93 int terminator;
2791 nigel 77 int mclength;
2792     uschar mcbuffer[8];
2793    
2794 nigel 93 /* Get next byte in the pattern */
2795 nigel 77
2796     c = *ptr;
2797 ph10 345
2798 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2799     previous cycle of this loop. */
2800    
2801     if (lengthptr != NULL)
2802     {
2803 ph10 475 #ifdef PCRE_DEBUG
2804 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2805     #endif
2806 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2807 nigel 93 {
2808     *errorcodeptr = ERR52;
2809     goto FAILED;
2810     }
2811    
2812     /* There is at least one situation where code goes backwards: this is the
2813     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2814     the class is simply eliminated. However, it is created first, so we have to
2815     allow memory for it. Therefore, don't ever reduce the length at this point.
2816     */
2817    
2818     if (code < last_code) code = last_code;
2819 ph10 202
2820     /* Paranoid check for integer overflow */
2821    
2822     if (OFLOW_MAX - *lengthptr < code - last_code)
2823     {
2824     *errorcodeptr = ERR20;
2825     goto FAILED;
2826     }
2827    
2828 nigel 93 *lengthptr += code - last_code;
2829     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2830    
2831     /* If "previous" is set and it is not at the start of the work space, move
2832     it back to there, in order to avoid filling up the work space. Otherwise,
2833     if "previous" is NULL, reset the current code pointer to the start. */
2834    
2835     if (previous != NULL)
2836     {
2837     if (previous > orig_code)
2838     {
2839     memmove(orig_code, previous, code - previous);
2840     code -= previous - orig_code;
2841     previous = orig_code;
2842     }
2843     }
2844     else code = orig_code;
2845    
2846     /* Remember where this code item starts so we can pick up the length
2847     next time round. */
2848    
2849     last_code = code;
2850     }
2851    
2852     /* In the real compile phase, just check the workspace used by the forward
2853     reference list. */
2854    
2855 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2856 nigel 93 {
2857     *errorcodeptr = ERR52;
2858     goto FAILED;
2859     }
2860    
2861 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2862    
2863     if (inescq && c != 0)
2864     {
2865 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2866 nigel 77 {
2867     inescq = FALSE;
2868     ptr++;
2869     continue;
2870     }
2871     else
2872     {
2873     if (previous_callout != NULL)
2874     {
2875 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2876     complete_callout(previous_callout, ptr, cd);
2877 nigel 77 previous_callout = NULL;
2878     }
2879     if ((options & PCRE_AUTO_CALLOUT) != 0)
2880     {
2881     previous_callout = code;
2882     code = auto_callout(code, ptr, cd);
2883     }
2884     goto NORMAL_CHAR;
2885     }
2886     }
2887    
2888     /* Fill in length of a previous callout, except when the next thing is
2889     a quantifier. */
2890    
2891 ph10 392 is_quantifier =
2892 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2893     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2894 nigel 77
2895     if (!is_quantifier && previous_callout != NULL &&
2896     after_manual_callout-- <= 0)
2897     {
2898 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2899     complete_callout(previous_callout, ptr, cd);
2900 nigel 77 previous_callout = NULL;
2901     }
2902    
2903     /* In extended mode, skip white space and comments */
2904    
2905     if ((options & PCRE_EXTENDED) != 0)
2906     {
2907     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2908 ph10 391 if (c == CHAR_NUMBER_SIGN)
2909 nigel 77 {
2910 nigel 93 while (*(++ptr) != 0)
2911 nigel 91 {
2912 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2913 nigel 91 }
2914 nigel 93 if (*ptr != 0) continue;
2915    
2916 nigel 91 /* Else fall through to handle end of string */
2917     c = 0;
2918 nigel 77 }
2919     }
2920    
2921     /* No auto callout for quantifiers. */
2922    
2923     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2924     {
2925     previous_callout = code;
2926     code = auto_callout(code, ptr, cd);
2927     }
2928    
2929     switch(c)
2930     {
2931 nigel 93 /* ===================================================================*/
2932     case 0: /* The branch terminates at string end */
2933 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2934     case CHAR_RIGHT_PARENTHESIS:
2935 nigel 77 *firstbyteptr = firstbyte;
2936     *reqbyteptr = reqbyte;
2937     *codeptr = code;
2938     *ptrptr = ptr;
2939 nigel 93 if (lengthptr != NULL)
2940     {
2941 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2942     {
2943     *errorcodeptr = ERR20;
2944     goto FAILED;
2945     }
2946 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2947     DPRINTF((">> end branch\n"));
2948     }
2949 nigel 77 return TRUE;
2950    
2951 nigel 93
2952     /* ===================================================================*/
2953 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2954     the setting of any following char as a first character. */
2955    
2956 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2957 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2958     {
2959     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2960     }
2961     previous = NULL;
2962     *code++ = OP_CIRC;
2963     break;
2964    
2965 ph10 391 case CHAR_DOLLAR_SIGN:
2966 nigel 77 previous = NULL;
2967     *code++ = OP_DOLL;
2968     break;
2969    
2970     /* There can never be a first char if '.' is first, whatever happens about
2971     repeats. The value of reqbyte doesn't change either. */
2972    
2973 ph10 391 case CHAR_DOT:
2974 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2975     zerofirstbyte = firstbyte;
2976     zeroreqbyte = reqbyte;
2977     previous = code;
2978 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2979 nigel 77 break;
2980    
2981 nigel 93
2982     /* ===================================================================*/
2983 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2984     32-byte bitmap of the permitted characters, except in the special case
2985     where there is only one such character. For negated classes, we build the
2986     map as usual, then invert it at the end. However, we use a different opcode
2987     so that data characters > 255 can be handled correctly.
2988 nigel 77
2989     If the class contains characters outside the 0-255 range, a different
2990     opcode is compiled. It may optionally have a bit map for characters < 256,
2991     but those above are explicitly listed afterwards. A flag byte tells
2992     whether the bitmap is present, and whether this is a negated class or not.
2993 ph10 345
2994 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2995     default (Perl) mode, it is treated as a data character. */
2996 ph10 345
2997 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2998 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2999     {
3000     *errorcodeptr = ERR64;
3001 ph10 345 goto FAILED;
3002 ph10 336 }
3003 ph10 345 goto NORMAL_CHAR;
3004 nigel 77
3005 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3006 nigel 77 previous = code;
3007    
3008     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3009     they are encountered at the top level, so we'll do that too. */
3010    
3011 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3012 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3013 ph10 295 check_posix_syntax(ptr, &tempptr))
3014 nigel 77 {
3015 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3016 nigel 77 goto FAILED;
3017     }
3018    
3019 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3020 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3021 ph10 205 skip them too. This makes for compatibility with Perl. */
3022 ph10 208
3023 ph10 205 negate_class = FALSE;
3024     for (;;)
3025 nigel 77 {
3026     c = *(++ptr);
3027 ph10 391 if (c == CHAR_BACKSLASH)
3028 ph10 205 {
3029 ph10 392 if (ptr[1] == CHAR_E)
3030 ph10 391 ptr++;
3031 ph10 392 else if (strncmp((const char *)ptr+1,
3032     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3033 ph10 391 ptr += 3;
3034 ph10 392 else
3035 ph10 391 break;
3036 ph10 205 }
3037 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3038 ph10 205 negate_class = TRUE;
3039     else break;
3040 ph10 208 }
3041 ph10 345
3042     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3043     an initial ']' is taken as a data character -- the code below handles
3044 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3045     [^] must match any character, so generate OP_ALLANY. */
3046 ph10 345
3047 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3048 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3049 ph10 341 {
3050     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3051     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3052     zerofirstbyte = firstbyte;
3053     break;
3054 ph10 345 }
3055 nigel 77
3056 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3057     negation flag at the end, so that support for characters > 255 works
3058 ph10 264 correctly (they are all included in the class). */
3059    
3060     should_flip_negation = FALSE;
3061    
3062 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3063 nigel 93 of just a single character (as long as it's < 256). However, for higher
3064     valued UTF-8 characters, we don't yet do any optimization. */
3065 nigel 77
3066     class_charcount = 0;
3067     class_lastchar = -1;
3068    
3069 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3070     temporary bit of memory, in case the class contains only 1 character (less
3071     than 256), because in that case the compiled code doesn't use the bit map.
3072     */
3073    
3074     memset(classbits, 0, 32 * sizeof(uschar));
3075    
3076 nigel 77 #ifdef SUPPORT_UTF8
3077     class_utf8 = FALSE; /* No chars >= 256 */
3078 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3079 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3080 nigel 77 #endif
3081    
3082     /* Process characters until ] is reached. By writing this as a "do" it
3083 nigel 93 means that an initial ] is taken as a data character. At the start of the
3084     loop, c contains the first byte of the character. */
3085 nigel 77
3086 nigel 93 if (c != 0) do
3087 nigel 77 {
3088 nigel 93 const uschar *oldptr;
3089    
3090 nigel 77 #ifdef SUPPORT_UTF8
3091     if (utf8 && c > 127)
3092     { /* Braces are required because the */
3093     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3094     }
3095 ph10 309
3096 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3097 ph10 309 data and reset the pointer. This is so that very large classes that
3098 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3099 ph10 309 (which is on the stack). */
3100    
3101 ph10 300 if (lengthptr != NULL)
3102     {
3103     *lengthptr += class_utf8data - class_utf8data_base;
3104 ph10 309 class_utf8data = class_utf8data_base;
3105     }
3106    
3107 nigel 77 #endif
3108    
3109     /* Inside \Q...\E everything is literal except \E */
3110    
3111     if (inescq)
3112     {
3113 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3114 nigel 77 {
3115 nigel 93 inescq = FALSE; /* Reset literal state */
3116     ptr++; /* Skip the 'E' */
3117     continue; /* Carry on with next */
3118 nigel 77 }
3119 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3120 nigel 77 }
3121    
3122     /* Handle POSIX class names. Perl allows a negation extension of the
3123     form [:^name:]. A square bracket that doesn't match the syntax is
3124     treated as a literal. We also recognize the POSIX constructions
3125     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3126     5.6 and 5.8 do. */
3127    
3128 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3129 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3130 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3131 nigel 77 {
3132     BOOL local_negate = FALSE;
3133 nigel 87 int posix_class, taboffset, tabopt;
3134 nigel 77 register const uschar *cbits = cd->cbits;
3135 nigel 87 uschar pbits[32];
3136 nigel 77
3137 ph10 391 if (ptr[1] != CHAR_COLON)
3138 nigel 77 {
3139     *errorcodeptr = ERR31;
3140     goto FAILED;
3141     }
3142    
3143     ptr += 2;
3144 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3145 nigel 77 {
3146     local_negate = TRUE;
3147 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3148 nigel 77 ptr++;
3149     }
3150    
3151     posix_class = check_posix_name(ptr, tempptr - ptr);
3152     if (posix_class < 0)
3153     {
3154     *errorcodeptr = ERR30;
3155     goto FAILED;
3156     }
3157    
3158     /* If matching is caseless, upper and lower are converted to
3159     alpha. This relies on the fact that the class table starts with
3160     alpha, lower, upper as the first 3 entries. */
3161    
3162     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3163     posix_class = 0;
3164    
3165 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3166     because we may be adding and subtracting from it, and we don't want to
3167     subtract bits that may be in the main map already. At the end we or the
3168     result into the bit map that is being built. */
3169 nigel 77
3170     posix_class *= 3;
3171 nigel 87
3172     /* Copy in the first table (always present) */
3173    
3174     memcpy(pbits, cbits + posix_class_maps[posix_class],
3175     32 * sizeof(uschar));
3176    
3177     /* If there is a second table, add or remove it as required. */
3178    
3179     taboffset = posix_class_maps[posix_class + 1];
3180     tabopt = posix_class_maps[posix_class + 2];
3181    
3182     if (taboffset >= 0)
3183 nigel 77 {
3184 nigel 87 if (tabopt >= 0)
3185     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3186 nigel 77 else
3187 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3188 nigel 77 }
3189    
3190 nigel 87 /* Now see if we need to remove any special characters. An option
3191     value of 1 removes vertical space and 2 removes underscore. */
3192    
3193     if (tabopt < 0) tabopt = -tabopt;
3194     if (tabopt == 1) pbits[1] &= ~0x3c;
3195     else if (tabopt == 2) pbits[11] &= 0x7f;
3196    
3197     /* Add the POSIX table or its complement into the main table that is
3198     being built and we are done. */
3199    
3200     if (local_negate)
3201     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3202     else
3203     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3204    
3205 nigel 77 ptr = tempptr + 1;
3206     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3207     continue; /* End of POSIX syntax handling */
3208     }
3209    
3210     /* Backslash may introduce a single character, or it may introduce one
3211 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3212 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3213     assume that other escapes have more than one character in them, so set
3214     class_charcount bigger than one. Unrecognized escapes fall through and
3215     are either treated as literal characters (by default), or are faulted if
3216     PCRE_EXTRA is set. */
3217 nigel 77
3218 ph10 391 if (c == CHAR_BACKSLASH)
3219 nigel 77 {
3220 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3221     if (*errorcodeptr != 0) goto FAILED;
3222 nigel 77
3223 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3224 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3225     {
3226 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3227 nigel 77 {
3228     ptr += 2; /* avoid empty string */
3229     }
3230     else inescq = TRUE;
3231     continue;
3232     }
3233 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3234 nigel 77
3235     if (c < 0)
3236     {
3237     register const uschar *cbits = cd->cbits;
3238     class_charcount += 2; /* Greater than 1 is what matters */
3239 nigel 93
3240     /* Save time by not doing this in the pre-compile phase. */
3241    
3242     if (lengthptr == NULL) switch (-c)
3243 nigel 77 {
3244     case ESC_d:
3245     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3246     continue;
3247    
3248     case ESC_D:
3249 ph10 286 should_flip_negation = TRUE;
3250 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3251     continue;
3252    
3253     case ESC_w:
3254     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3255     continue;
3256    
3257     case ESC_W:
3258 ph10 286 should_flip_negation = TRUE;
3259 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3260     continue;
3261    
3262     case ESC_s:
3263     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3264     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3265     continue;
3266    
3267     case ESC_S:
3268 ph10 286 should_flip_negation = TRUE;
3269 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3270     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3271     continue;
3272    
3273 nigel 93 default: /* Not recognized; fall through */
3274     break; /* Need "default" setting to stop compiler warning. */
3275     }
3276    
3277     /* In the pre-compile phase, just do the recognition. */
3278    
3279     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3280     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3281 ph10 180
3282 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3283     they use extra memory. */
3284 ph10 180
3285 ph10 178 if (-c == ESC_h)
3286     {
3287     SETBIT(classbits, 0x09); /* VT */
3288     SETBIT(classbits, 0x20); /* SPACE */
3289 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3290 ph10 178 #ifdef SUPPORT_UTF8
3291     if (utf8)
3292 ph10 180 {
3293 ph10 178 class_utf8 = TRUE;
3294     *class_utf8data++ = XCL_SINGLE;
3295 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3296 ph10 178 *class_utf8data++ = XCL_SINGLE;
3297 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3298     *class_utf8data++ = XCL_RANGE;
3299     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3300     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3301 ph10 178 *class_utf8data++ = XCL_SINGLE;
3302 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3303 ph10 178 *class_utf8data++ = XCL_SINGLE;
3304 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3305 ph10 178 *class_utf8data++ = XCL_SINGLE;
3306 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3307     }
3308     #endif
3309     continue;
3310     }
3311 nigel 93
3312 ph10 178 if (-c == ESC_H)
3313     {
3314     for (c = 0; c < 32; c++)
3315     {
3316     int x = 0xff;
3317     switch (c)
3318 ph10 180 {
3319 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3320     case 0x20/8: x ^= 1 << (0x20%8); break;
3321     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3322     default: break;
3323     }
3324     classbits[c] |= x;
3325 ph10 180 }
3326    
3327 ph10 178 #ifdef SUPPORT_UTF8
3328     if (utf8)
3329 ph10 180 {
3330 ph10 178 class_utf8 = TRUE;
3331 ph10 180 *class_utf8data++ = XCL_RANGE;
3332     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3333     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3334     *class_utf8data++ = XCL_RANGE;
3335     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3336     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3337     *class_utf8data++ = XCL_RANGE;
3338     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3339     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3340     *class_utf8data++ = XCL_RANGE;
3341     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3342     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3343     *class_utf8data++ = XCL_RANGE;
3344     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3345     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3346     *class_utf8data++ = XCL_RANGE;
3347     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3348     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3349     *class_utf8data++ = XCL_RANGE;
3350     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3351     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3352     }
3353     #endif
3354     continue;
3355     }
3356 ph10 178
3357     if (-c == ESC_v)
3358     {
3359     SETBIT(classbits, 0x0a); /* LF */
3360     SETBIT(classbits, 0x0b); /* VT */
3361 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3362     SETBIT(classbits, 0x0d); /* CR */
3363     SETBIT(classbits, 0x85); /* NEL */
3364 ph10 178 #ifdef SUPPORT_UTF8
3365     if (utf8)
3366 ph10 180 {
3367 ph10 178 class_utf8 = TRUE;
3368 ph10 180 *class_utf8data++ = XCL_RANGE;
3369     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3370     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3371     }
3372     #endif
3373     continue;
3374     }
3375 ph10 178
3376     if (-c == ESC_V)
3377     {
3378     for (c = 0; c < 32; c++)
3379     {
3380     int x = 0xff;
3381     switch (c)
3382 ph10 180 {
3383 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3384     x ^= 1 << (0x0b%8);
3385     x ^= 1 << (0x0c%8);
3386 ph10 180 x ^= 1 << (0x0d%8);
3387 ph10 178 break;
3388     case 0x85/8: x ^= 1 << (0x85%8); break;
3389     default: break;
3390     }
3391     classbits[c] |= x;
3392 ph10 180 }
3393    
3394 ph10 178 #ifdef SUPPORT_UTF8
3395     if (utf8)
3396 ph10 180 {
3397 ph10 178 class_utf8 = TRUE;
3398 ph10 180 *class_utf8data++ = XCL_RANGE;
3399     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3400     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3401     *class_utf8data++ = XCL_RANGE;
3402     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3403     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3404     }
3405     #endif
3406     continue;
3407     }
3408 ph10 178
3409 nigel 93 /* We need to deal with \P and \p in both phases. */
3410    
3411 nigel 77 #ifdef SUPPORT_UCP
3412 nigel 93 if (-c == ESC_p || -c == ESC_P)
3413     {
3414     BOOL negated;
3415     int pdata;
3416     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3417     if (ptype < 0) goto FAILED;
3418     class_utf8 = TRUE;
3419     *class_utf8data++ = ((-c == ESC_p) != negated)?
3420     XCL_PROP : XCL_NOTPROP;
3421     *class_utf8data++ = ptype;
3422     *class_utf8data++ = pdata;
3423     class_charcount -= 2; /* Not a < 256 character */
3424 nigel 77 continue;
3425 nigel 93 }
3426 nigel 77 #endif
3427 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3428     strict mode. By default, for compatibility with Perl, they are
3429     treated as literals. */
3430 nigel 77
3431 nigel 93 if ((options & PCRE_EXTRA) != 0)
3432     {
3433     *errorcodeptr = ERR7;
3434     goto FAILED;
3435     }
3436 nigel 77
3437 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3438     c = *ptr; /* Get the final character and fall through */
3439 nigel 77 }
3440    
3441     /* Fall through if we have a single character (c >= 0). This may be
3442 nigel 93 greater than 256 in UTF-8 mode. */
3443 nigel 77
3444     } /* End of backslash handling */
3445    
3446     /* A single character may be followed by '-' to form a range. However,
3447     Perl does not permit ']' to be the end of the range. A '-' character
3448 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3449     entirely. The code for handling \Q and \E is messy. */
3450 nigel 77
3451 nigel 93 CHECK_RANGE:
3452 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3453 nigel 77 {
3454 nigel 93 inescq = FALSE;
3455     ptr += 2;
3456     }
3457    
3458     oldptr = ptr;
3459 ph10 231
3460 ph10 230 /* Remember \r or \n */
3461 ph10 231
3462 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3463 ph10 231
3464 ph10 230 /* Check for range */
3465 nigel 93
3466 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3467 nigel 93 {
3468 nigel 77 int d;
3469     ptr += 2;
3470 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3471 nigel 77
3472 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3473     mode. */
3474    
3475 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3476 nigel 93 {
3477     ptr += 2;
3478 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3479 ph10 391 { ptr += 2; continue; }
3480 nigel 93 inescq = TRUE;
3481     break;
3482     }
3483    
3484 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3485 nigel 93 {
3486     ptr = oldptr;
3487     goto LONE_SINGLE_CHARACTER;
3488     }
3489    
3490 nigel 77 #ifdef SUPPORT_UTF8
3491     if (utf8)
3492     { /* Braces are required because the */
3493     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3494     }
3495     else
3496     #endif
3497     d = *ptr; /* Not UTF-8 mode */
3498    
3499     /* The second part of a range can be a single-character escape, but
3500     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3501     in such circumstances. */
3502    
3503 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3504 nigel 77 {
3505 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3506     if (*errorcodeptr != 0) goto FAILED;
3507 nigel 77
3508 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3509 nigel 77
3510     if (d < 0)
3511     {
3512 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3513 nigel 77 {
3514 nigel 93 ptr = oldptr;
3515 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3516     }
3517     }
3518     }
3519    
3520 nigel 93 /* Check that the two values are in the correct order. Optimize
3521     one-character ranges */
3522 nigel 77
3523 nigel 93 if (d < c)
3524     {
3525     *errorcodeptr = ERR8;
3526     goto FAILED;
3527     }
3528    
3529 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3530    
3531 ph10 230 /* Remember \r or \n */
3532 ph10 231
3533 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3534 ph10 231
3535 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3536     matching, we have to use an XCLASS with extra data items. Caseless
3537     matching for characters > 127 is available only if UCP support is
3538     available. */
3539    
3540     #ifdef SUPPORT_UTF8
3541     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3542     {
3543     class_utf8 = TRUE;
3544    
3545     /* With UCP support, we can find the other case equivalents of
3546     the relevant characters. There may be several ranges. Optimize how
3547     they fit with the basic range. */
3548    
3549     #ifdef SUPPORT_UCP
3550     if ((options & PCRE_CASELESS) != 0)
3551     {
3552 nigel 93 unsigned int occ, ocd;
3553     unsigned int cc = c;
3554     unsigned int origd = d;
3555 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3556     {
3557 ph10 180 if (occ >= (unsigned int)c &&
3558     ocd <= (unsigned int)d)
3559 ph10 176 continue; /* Skip embedded ranges */
3560 nigel 77
3561 ph10 180 if (occ < (unsigned int)c &&
3562 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3563 nigel 77 { /* if there is overlap, */
3564     c = occ; /* noting that if occ < c */
3565     continue; /* we can't have ocd > d */
3566     } /* because a subrange is */
3567 ph10 180 if (ocd > (unsigned int)d &&
3568 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3569 nigel 77 { /* the basic range. */
3570     d = ocd;
3571     continue;
3572     }
3573    
3574     if (occ == ocd)
3575     {
3576     *class_utf8data++ = XCL_SINGLE;
3577     }
3578     else
3579     {
3580     *class_utf8data++ = XCL_RANGE;
3581     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3582     }
3583     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3584     }
3585     }
3586     #endif /* SUPPORT_UCP */
3587    
3588     /* Now record the original range, possibly modified for UCP caseless
3589     overlapping ranges. */
3590    
3591     *class_utf8data++ = XCL_RANGE;
3592     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3593     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3594    
3595     /* With UCP support, we are done. Without UCP support, there is no
3596     caseless matching for UTF-8 characters > 127; we can use the bit map
3597     for the smaller ones. */
3598    
3599     #ifdef SUPPORT_UCP
3600     continue; /* With next character in the class */
3601     #else
3602     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3603    
3604     /* Adjust upper limit and fall through to set up the map */
3605    
3606     d = 127;
3607    
3608     #endif /* SUPPORT_UCP */
3609     }
3610     #endif /* SUPPORT_UTF8 */
3611    
3612     /* We use the bit map for all cases when not in UTF-8 mode; else
3613     ranges that lie entirely within 0-127 when there is UCP support; else
3614     for partial ranges without UCP support. */
3615    
3616 nigel 93 class_charcount += d - c + 1;
3617     class_lastchar = d;
3618    
3619     /* We can save a bit of time by skipping this in the pre-compile. */
3620    
3621     if (lengthptr == NULL) for (; c <= d; c++)
3622 nigel 77 {
3623     classbits[c/8] |= (1 << (c&7));
3624     if ((options & PCRE_CASELESS) != 0)
3625     {
3626     int uc = cd->fcc[c]; /* flip case */
3627     classbits[uc/8] |= (1 << (uc&7));
3628     }
3629     }
3630    
3631     continue; /* Go get the next char in the class */
3632     }
3633    
3634     /* Handle a lone single character - we can get here for a normal
3635     non-escape char, or after \ that introduces a single character or for an
3636     apparent range that isn't. */
3637    
3638     LONE_SINGLE_CHARACTER:
3639 ph10 231
3640 nigel 77 /* Handle a character that cannot go in the bit map */
3641    
3642     #ifdef SUPPORT_UTF8
3643     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3644     {
3645     class_utf8 = TRUE;
3646     *class_utf8data++ = XCL_SINGLE;
3647     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3648    
3649     #ifdef SUPPORT_UCP
3650     if ((options & PCRE_CASELESS) != 0)
3651     {
3652 nigel 93 unsigned int othercase;
3653 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3654 nigel 77 {
3655     *class_utf8data++ = XCL_SINGLE;
3656     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3657     }
3658     }
3659     #endif /* SUPPORT_UCP */
3660    
3661     }
3662     else
3663     #endif /* SUPPORT_UTF8 */
3664    
3665     /* Handle a single-byte character */
3666     {
3667     classbits[c/8] |= (1 << (c&7));
3668     if ((options & PCRE_CASELESS) != 0)
3669     {
3670     c = cd->fcc[c]; /* flip case */
3671     classbits[c/8] |= (1 << (c&7));
3672     }
3673     class_charcount++;
3674     class_lastchar = c;
3675     }
3676     }
3677    
3678 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3679 nigel 77
3680 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3681 nigel 77
3682 nigel 93 if (c == 0) /* Missing terminating ']' */
3683     {
3684     *errorcodeptr = ERR6;
3685     goto FAILED;
3686     }
3687 ph10 231
3688    
3689 ph10 230 /* This code has been disabled because it would mean that \s counts as
3690     an explicit \r or \n reference, and that's not really what is wanted. Now
3691     we set the flag only if there is a literal "\r" or "\n" in the class. */
3692 ph10 227
3693 ph10 230 #if 0
3694 ph10 226 /* Remember whether \r or \n are in this class */
3695 ph10 227
3696 ph10 226 if (negate_class)
3697     {
3698 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3699 ph10 226 }
3700     else
3701     {
3702 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3703 ph10 227 }
3704 ph10 230 #endif
3705 ph10 227
3706 ph10 231
3707 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3708 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3709     use of \p or \P, in other words, no use of any XCLASS features, we can
3710     optimize.
3711    
3712 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3713     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3714     operate on single-bytes only. This is an historical hangover. Maybe one day
3715     we can tidy these opcodes to handle multi-byte characters.
3716 nigel 77
3717     The optimization throws away the bit map. We turn the item into a
3718     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3719     that OP_NOT does not support multibyte characters. In the positive case, it
3720     can cause firstbyte to be set. Otherwise, there can be no first char if
3721     this item is first, whatever repeat count may follow. In the case of
3722     reqbyte, save the previous value for reinstating. */
3723    
3724     #ifdef SUPPORT_UTF8
3725 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3726 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3727 nigel 77 #else
3728     if (class_charcount == 1)
3729     #endif
3730     {
3731     zeroreqbyte = reqbyte;
3732    
3733     /* The OP_NOT opcode works on one-byte characters only. */
3734    
3735     if (negate_class)
3736     {
3737     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3738     zerofirstbyte = firstbyte;
3739     *code++ = OP_NOT;
3740     *code++ = class_lastchar;
3741     break;
3742     }
3743    
3744     /* For a single, positive character, get the value into mcbuffer, and
3745     then we can handle this with the normal one-character code. */
3746    
3747     #ifdef SUPPORT_UTF8
3748     if (utf8 && class_lastchar > 127)
3749     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3750     else
3751     #endif
3752     {
3753     mcbuffer[0] = class_lastchar;
3754     mclength = 1;
3755     }
3756     goto ONE_CHAR;
3757     } /* End of 1-char optimization */
3758    
3759     /* The general case - not the one-char optimization. If this is the first
3760     thing in the branch, there can be no first char setting, whatever the
3761     repeat count. Any reqbyte setting must remain unchanged after any kind of
3762     repeat. */
3763    
3764     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3765     zerofirstbyte = firstbyte;
3766     zeroreqbyte = reqbyte;
3767    
3768     /* If there are characters with values > 255, we have to compile an
3769 ph10 286 extended class, with its own opcode, unless there was a negated special
3770     such as \S in the class, because in that case all characters > 255 are in
3771     the class, so any that were explicitly given as well can be ignored. If
3772 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3773     characters < 256, we can omit the bitmap in the actual compiled code. */
3774 nigel 77
3775     #ifdef SUPPORT_UTF8
3776 ph10 264 if (class_utf8 && !should_flip_negation)
3777 nigel 77 {
3778     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3779     *code++ = OP_XCLASS;
3780     code += LINK_SIZE;
3781     *code = negate_class? XCL_NOT : 0;
3782    
3783 nigel 93 /* If the map is required, move up the extra data to make room for it;
3784     otherwise just move the code pointer to the end of the extra data. */
3785 nigel 77
3786     if (class_charcount > 0)
3787     {
3788     *code++ |= XCL_MAP;
3789 nigel 93 memmove(code + 32, code, class_utf8data - code);
3790 nigel 77 memcpy(code, classbits, 32);
3791 nigel 93 code = class_utf8data + 32;
3792 nigel 77 }
3793 nigel 93 else code = class_utf8data;
3794 nigel 77
3795     /* Now fill in the complete length of the item */
3796    
3797     PUT(previous, 1, code - previous);
3798     break; /* End of class handling */
3799     }
3800     #endif
3801    
3802 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3803     OP_NCLASS, depending on whether the whole class was negated and whether
3804     there were negative specials such as \S in the class. Then copy the 32-byte
3805 ph10 264 map into the code vector, negating it if necessary. */
3806 ph10 286
3807 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3808 nigel 77 if (negate_class)
3809     {
3810 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3811     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3812 nigel 77 }
3813     else
3814     {
3815     memcpy(code, classbits, 32);
3816     }
3817     code += 32;
3818     break;
3819    
3820 nigel 93
3821     /* ===================================================================*/
3822 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3823     has been tested above. */
3824    
3825 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3826 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3827     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3828     if (*errorcodeptr != 0) goto FAILED;
3829     goto REPEAT;
3830    
3831 ph10 391 case CHAR_ASTERISK:
3832 nigel 77 repeat_min = 0;
3833     repeat_max = -1;
3834     goto REPEAT;
3835    
3836 ph10 391 case CHAR_PLUS:
3837 nigel 77 repeat_min = 1;
3838     repeat_max = -1;
3839     goto REPEAT;
3840    
3841 ph10 391 case CHAR_QUESTION_MARK:
3842 nigel 77 repeat_min = 0;
3843     repeat_max = 1;
3844    
3845     REPEAT:
3846     if (previous == NULL)
3847     {
3848     *errorcodeptr = ERR9;
3849     goto FAILED;
3850     }
3851    
3852     if (repeat_min == 0)
3853     {
3854     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3855     reqbyte = zeroreqbyte; /* Ditto */
3856     }
3857    
3858     /* Remember whether this is a variable length repeat */
3859    
3860     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3861    
3862     op_type = 0; /* Default single-char op codes */
3863     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3864    
3865     /* Save start of previous item, in case we have to move it up to make space
3866     for an inserted OP_ONCE for the additional '+' extension. */
3867    
3868     tempcode = previous;
3869    
3870     /* If the next character is '+', we have a possessive quantifier. This
3871     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3872     If the next character is '?' this is a minimizing repeat, by default,
3873     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3874     repeat type to the non-default. */
3875    
3876 ph10 391 if (ptr[1] == CHAR_PLUS)
3877 nigel 77 {
3878     repeat_type = 0; /* Force greedy */
3879     possessive_quantifier = TRUE;
3880     ptr++;
3881     }
3882 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3883 nigel 77 {
3884     repeat_type = greedy_non_default;
3885     ptr++;
3886     }
3887     else repeat_type = greedy_default;
3888    
3889     /* If previous was a character match, abolish the item and generate a
3890     repeat item instead. If a char item has a minimum of more than one, ensure
3891     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3892     the first thing in a branch because the x will have gone into firstbyte
3893     instead. */
3894    
3895     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3896     {
3897     /* Deal with UTF-8 characters that take up more than one byte. It's
3898     easier to write this out separately than try to macrify it. Use c to
3899     hold the length of the character in bytes, plus 0x80 to flag that it's a
3900     length rather than a small character. */
3901    
3902     #ifdef SUPPORT_UTF8
3903     if (utf8 && (code[-1] & 0x80) != 0)
3904     {
3905     uschar *lastchar = code - 1;
3906     while((*lastchar & 0xc0) == 0x80) lastchar--;
3907     c = code - lastchar; /* Length of UTF-8 character */
3908     memcpy(utf8_char, lastchar, c); /* Save the char */
3909     c |= 0x80; /* Flag c as a length */
3910     }
3911     else
3912     #endif
3913    
3914     /* Handle the case of a single byte - either with no UTF8 support, or
3915     with UTF-8 disabled, or for a UTF-8 character < 128. */
3916    
3917     {
3918     c = code[-1];
3919     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3920     }
3921    
3922 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3923     the line is something that cannot possibly match this character. If so,
3924     automatically possessifying this item gains some performance in the case
3925     where the match fails. */
3926    
3927     if (!possessive_quantifier &&
3928     repeat_max < 0 &&
3929     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3930     options, cd))
3931     {
3932     repeat_type = 0; /* Force greedy */
3933     possessive_quantifier = TRUE;
3934     }
3935