/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 544 - (hide annotations) (download)
Tue Jun 15 17:20:55 2010 UTC (2 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 233022 byte(s)
Fix forward reference in the presence of (?#( (open parens in comment).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a. The arguments and the whole expansion are parenthesized so that
expression arguments (for example "base + 1") select the intended byte and bit.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
#define COMPILE_WORK_SIZE (4096)

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* One entry per special verb; the verb's name is held separately, in the
verbnames string. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
/* STRING() turns its argument into a string literal exactly as written.
XSTRING() adds a level of indirection so that a macro argument is expanded
first and the resulting value is what gets turned into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 510 ;
412 nigel 77
413     /* Table to identify digits and hex digits. This is used when compiling
414     patterns. Note that the tables in chartables are dependent on the locale, and
415     may mark arbitrary characters as digits - but the PCRE compiling code expects
416     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
417     a private table here. It costs 256 bytes, but it is a lot faster than doing
418     character value tests (at least in some simple cases I timed), and in some
419     applications one wants PCRE to compile efficiently as well as match
420     efficiently.
421    
422     For convenience, we use the same bit definitions as in chartables:
423    
424     0x04 decimal digit
425     0x08 hexadecimal digit
426    
427     Then we can use ctype_digit and ctype_xdigit in the code. */
428    
429 ph10 392 #ifndef EBCDIC
430 ph10 391
431 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
432 ph10 391 UTF-8 mode. */
433    
/* 256-entry table indexed by character code: 0x04 marks a decimal digit and
0x08 a hexadecimal digit, so 0-9 carry 0x0c and A-F / a-f carry 0x08. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
468    
469 ph10 392 #else
470 ph10 391
471     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
472    
/* 256-entry table indexed by character code, using the same bit values as the
ASCII version: 0x04 decimal digit, 0x08 hexadecimal digit. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
507    
/* 256-entry table indexed by character code. check_escape() tests entries
against the mask 0x0E to decide whether a character is alphanumeric. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
541     #endif
542    
543    
544     /* Definition to allow mutual recursion */
545    
546     static BOOL
547 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
548 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
549 nigel 77
550    
551    
552     /*************************************************
553 ph10 240 * Find an error text *
554     *************************************************/
555    
556 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
557     some of the text is of unknown length, we can't use a table of offsets.
558     Instead, just count through the strings. This is not a performance issue
559 ph10 240 because it happens only when there has been a compilation error.
560    
561     Argument: the error number
562     Returns: pointer to the error string
563     */
564    
565     static const char *
566     find_error_text(int n)
567     {
568     const char *s = error_texts;
569 ph10 507 for (; n > 0; n--)
570 ph10 499 {
571     while (*s++ != 0) {};
572     if (*s == 0) return "Error text not found (please report)";
573 ph10 507 }
574 ph10 240 return s;
575     }
576    
577    
578     /*************************************************
579 nigel 77 * Handle escapes *
580     *************************************************/
581    
582     /* This function is called when a \ has been encountered. It either returns a
583     positive value for a simple escape such as \n, or a negative value which
584 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
585     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
586     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
587     ptr is pointing at the \. On exit, it is on the final character of the escape
588     sequence.
589 nigel 77
590     Arguments:
591     ptrptr points to the pattern position pointer
592     errorcodeptr points to the errorcode variable
593     bracount number of previous extracting brackets
594     options the options bits
595     isclass TRUE if inside a character class
596    
597     Returns: zero or positive => a data character
598     negative => a special escape sequence
599 ph10 213 on error, errorcodeptr is set
600 nigel 77 */
601    
602     static int
603     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
604     int options, BOOL isclass)
605     {
606 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
607     const uschar *ptr = *ptrptr + 1;
608 nigel 77 int c, i;
609    
610 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
611     ptr--; /* Set pointer back to the last byte */
612    
613 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
614    
615     if (c == 0) *errorcodeptr = ERR1;
616    
617 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
618     in a table. A non-zero result is something that can be returned immediately.
619 nigel 77 Otherwise further processing may be required. */
620    
621 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
622     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
623     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
624 nigel 77
625 ph10 97 #else /* EBCDIC coding */
626 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
627 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
628     #endif
629    
630     /* Escapes that need further processing, or are illegal. */
631    
632     else
633     {
634     const uschar *oldptr;
635 nigel 93 BOOL braced, negated;
636    
637 nigel 77 switch (c)
638     {
639     /* A number of Perl escapes are not handled by PCRE. We give an explicit
640     error. */
641    
642 ph10 391 case CHAR_l:
643     case CHAR_L:
644     case CHAR_u:
645     case CHAR_U:
646 nigel 77 *errorcodeptr = ERR37;
647     break;
648    
649 ph10 333 /* \g must be followed by one of a number of specific things:
650 ph10 345
651 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
652     backreference. If negative, it is a relative backreference. This is a Perl
653     5.10 feature.
654 ph10 345
655 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
656     is part of Perl's movement towards a unified syntax for back references. As
657     this is synonymous with \k{name}, we fudge it up by pretending it really
658     was \k.
659 ph10 345
660     (3) For Oniguruma compatibility we also support \g followed by a name or a
661     number either in angle brackets or in single quotes. However, these are
662     (possibly recursive) subroutine calls, _not_ backreferences. Just return
663 ph10 333 the -ESC_g code (cf \k). */
664 nigel 93
665 ph10 391 case CHAR_g:
666     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
667 ph10 333 {
668     c = -ESC_g;
669 ph10 345 break;
670     }
671 ph10 333
672     /* Handle the Perl-compatible cases */
673 ph10 345
674 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
675 nigel 93 {
676 ph10 171 const uschar *p;
677 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
678     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
679     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
680 ph10 171 {
681     c = -ESC_k;
682     break;
683 ph10 172 }
684 nigel 93 braced = TRUE;
685     ptr++;
686     }
687     else braced = FALSE;
688    
689 ph10 391 if (ptr[1] == CHAR_MINUS)
690 nigel 93 {
691     negated = TRUE;
692     ptr++;
693     }
694     else negated = FALSE;
695    
696     c = 0;
697     while ((digitab[ptr[1]] & ctype_digit) != 0)
698 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
699 ph10 220
700 ph10 333 if (c < 0) /* Integer overflow */
701 ph10 213 {
702     *errorcodeptr = ERR61;
703     break;
704 ph10 220 }
705 ph10 345
706 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
707 nigel 93 {
708     *errorcodeptr = ERR57;
709 ph10 213 break;
710 nigel 93 }
711 ph10 345
712 ph10 333 if (c == 0)
713     {
714     *errorcodeptr = ERR58;
715     break;
716 ph10 345 }
717 nigel 93
718     if (negated)
719     {
720     if (c > bracount)
721     {
722     *errorcodeptr = ERR15;
723 ph10 213 break;
724 nigel 93 }
725     c = bracount - (c - 1);
726     }
727    
728     c = -(ESC_REF + c);
729     break;
730    
731 nigel 77 /* The handling of escape sequences consisting of a string of digits
732     starting with one that is not zero is not straightforward. By experiment,
733     the way Perl works seems to be as follows:
734    
735     Outside a character class, the digits are read as a decimal number. If the
736     number is less than 10, or if there are that many previous extracting
737     left brackets, then it is a back reference. Otherwise, up to three octal
738     digits are read to form an escaped byte. Thus \123 is likely to be octal
739     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
740     value is greater than 377, the least significant 8 bits are taken. Inside a
741     character class, \ followed by a digit is always an octal number. */
742    
743 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
744     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
745 nigel 77
746     if (!isclass)
747     {
748     oldptr = ptr;
749 ph10 391 c -= CHAR_0;
750 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
751 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
752 ph10 333 if (c < 0) /* Integer overflow */
753 ph10 213 {
754     *errorcodeptr = ERR61;
755 ph10 220 break;
756     }
757 nigel 77 if (c < 10 || c <= bracount)
758     {
759     c = -(ESC_REF + c);
760     break;
761     }
762     ptr = oldptr; /* Put the pointer back and fall through */
763     }
764    
765     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
766     generates a binary zero byte and treats the digit as a following literal.
767     Thus we have to pull back the pointer by one. */
768    
769 ph10 391 if ((c = *ptr) >= CHAR_8)
770 nigel 77 {
771     ptr--;
772     c = 0;
773     break;
774     }
775    
776     /* \0 always starts an octal number, but we may drop through to here with a
777 nigel 91 larger first octal digit. The original code used just to take the least
778     significant 8 bits of octal numbers (I think this is what early Perls used
779     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
780     than 3 octal digits. */
781 nigel 77
782 ph10 391 case CHAR_0:
783     c -= CHAR_0;
784     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
785     c = c * 8 + *(++ptr) - CHAR_0;
786 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
787 nigel 77 break;
788    
789 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
790     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
791     treated as a data character. */
792 nigel 77
793 ph10 391 case CHAR_x:
794     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
795 nigel 77 {
796     const uschar *pt = ptr + 2;
797 nigel 87 int count = 0;
798    
799 nigel 77 c = 0;
800     while ((digitab[*pt] & ctype_xdigit) != 0)
801     {
802 nigel 87 register int cc = *pt++;
803 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
804 nigel 77 count++;
805 nigel 87
806 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
807     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
808     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
809 ph10 97 #else /* EBCDIC coding */
810 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
811     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
812 nigel 77 #endif
813     }
814 nigel 87
815 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
816 nigel 77 {
817 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
818 nigel 77 ptr = pt;
819     break;
820     }
821 nigel 87
822 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
823     recognize this construct; fall through to the normal \x handling. */
824     }
825    
826 nigel 87 /* Read just a single-byte hex-defined char */
827 nigel 77
828     c = 0;
829     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
830     {
831 ph10 391 int cc; /* Some compilers don't like */
832     cc = *(++ptr); /* ++ in initializers */
833     #ifndef EBCDIC /* ASCII/UTF-8 coding */
834     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
835     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
836 ph10 97 #else /* EBCDIC coding */
837 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
838     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
839 nigel 77 #endif
840     }
841     break;
842    
843 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
844     This coding is ASCII-specific, but then the whole concept of \cx is
845     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
846 nigel 77
847 ph10 391 case CHAR_c:
848 nigel 77 c = *(++ptr);
849     if (c == 0)
850     {
851     *errorcodeptr = ERR2;
852 ph10 213 break;
853 nigel 77 }
854    
855 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
856     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
857 nigel 77 c ^= 0x40;
858 ph10 97 #else /* EBCDIC coding */
859 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
860 nigel 77 c ^= 0xC0;
861     #endif
862     break;
863    
864     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
865 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
866     otherwise, for Perl compatibility, it is a literal. This code looks a bit
867     odd, but there used to be some cases other than the default, and there may
868     be again in future, so I haven't "optimized" it. */
869 nigel 77
870     default:
871     if ((options & PCRE_EXTRA) != 0) switch(c)
872     {
873     default:
874     *errorcodeptr = ERR3;
875     break;
876     }
877     break;
878     }
879     }
880 ph10 518
881     /* Perl supports \N{name} for character names, as well as plain \N for "not
882 ph10 514 newline". PCRE does not support \N{name}. */
883 nigel 77
884 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
885 ph10 518 *errorcodeptr = ERR37;
886 ph10 514
887 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
888    
889     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
890     c -= (ESC_DU - ESC_D);
891    
892     /* Set the pointer to the final character before returning. */
893    
894 nigel 77 *ptrptr = ptr;
895     return c;
896     }
897    
898    
899    
900     #ifdef SUPPORT_UCP
901     /*************************************************
902     * Handle \P and \p *
903     *************************************************/
904    
905     /* This function is called after \P or \p has been encountered, provided that
906     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
907     pointing at the P or p. On exit, it is pointing at the final character of the
908     escape sequence.
909    
910     Argument:
911     ptrptr points to the pattern position pointer
912     negptr points to a boolean that is set TRUE for negation else FALSE
913 nigel 87 dptr points to an int that is set to the detailed property value
914 nigel 77 errorcodeptr points to the error code variable
915    
916 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
917 nigel 77 */
918    
919     static int
920 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
921 nigel 77 {
922     int c, i, bot, top;
923     const uschar *ptr = *ptrptr;
924 nigel 87 char name[32];
925 nigel 77
926     c = *(++ptr);
927     if (c == 0) goto ERROR_RETURN;
928    
929     *negptr = FALSE;
930    
931 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
932     negation. */
933 nigel 77
934 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
935 nigel 77 {
936 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
937 nigel 77 {
938     *negptr = TRUE;
939     ptr++;
940     }
941 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
942 nigel 77 {
943     c = *(++ptr);
944     if (c == 0) goto ERROR_RETURN;
945 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
946 nigel 77 name[i] = c;
947     }
948 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
949 nigel 77 name[i] = 0;
950     }
951    
952     /* Otherwise there is just one following character */
953    
954     else
955     {
956     name[0] = c;
957     name[1] = 0;
958     }
959    
960     *ptrptr = ptr;
961    
962     /* Search for a recognized property name using binary chop */
963    
964     bot = 0;
965     top = _pcre_utt_size;
966    
967     while (bot < top)
968     {
969 nigel 87 i = (bot + top) >> 1;
970 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
971 nigel 87 if (c == 0)
972     {
973     *dptr = _pcre_utt[i].value;
974     return _pcre_utt[i].type;
975     }
976 nigel 77 if (c > 0) bot = i + 1; else top = i;
977     }
978    
979     *errorcodeptr = ERR47;
980     *ptrptr = ptr;
981     return -1;
982    
983     ERROR_RETURN:
984     *errorcodeptr = ERR46;
985     *ptrptr = ptr;
986     return -1;
987     }
988     #endif
989    
990    
991    
992    
993     /*************************************************
994     * Check for counted repeat *
995     *************************************************/
996    
997     /* This function is called when a '{' is encountered in a place where it might
998     start a quantifier. It looks ahead to see if it really is a quantifier or not.
999     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1000     where the ddds are digits.
1001    
1002     Arguments:
1003     p pointer to the first char after '{'
1004    
1005     Returns: TRUE or FALSE
1006     */
1007    
1008     static BOOL
1009     is_counted_repeat(const uschar *p)
1010     {
1011     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1012     while ((digitab[*p] & ctype_digit) != 0) p++;
1013 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1014 nigel 77
1015 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1016     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1017 nigel 77
1018     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019     while ((digitab[*p] & ctype_digit) != 0) p++;
1020    
1021 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1022 nigel 77 }
1023    
1024    
1025    
1026     /*************************************************
1027     * Read repeat counts *
1028     *************************************************/
1029    
1030     /* Read an item of the form {n,m} and return the values. This is called only
1031     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1032     so the syntax is guaranteed to be correct, but we need to check the values.
1033    
1034     Arguments:
1035     p pointer to first char after '{'
1036     minp pointer to int for min
1037     maxp pointer to int for max
1038     returned as -1 if no max
1039     errorcodeptr points to error code variable
1040    
1041     Returns: pointer to '}' on success;
1042     current ptr on error, with errorcodeptr set non-zero
1043     */
1044    
1045     static const uschar *
1046     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1047     {
1048     int min = 0;
1049     int max = -1;
1050    
1051 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1052     an integer overflow. */
1053    
1054 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1055 nigel 81 if (min < 0 || min > 65535)
1056     {
1057     *errorcodeptr = ERR5;
1058     return p;
1059     }
1060 nigel 77
1061 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1062     Also, max must not be less than min. */
1063    
1064 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1065 nigel 77 {
1066 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1067 nigel 77 {
1068     max = 0;
1069 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1070 nigel 81 if (max < 0 || max > 65535)
1071     {
1072     *errorcodeptr = ERR5;
1073     return p;
1074     }
1075 nigel 77 if (max < min)
1076     {
1077     *errorcodeptr = ERR4;
1078     return p;
1079     }
1080     }
1081     }
1082    
1083 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1084     '}'. */
1085 nigel 77
1086 nigel 81 *minp = min;
1087     *maxp = max;
1088 nigel 77 return p;
1089     }
1090    
1091    
1092    
1093     /*************************************************
1094 ph10 408 * Subroutine for finding forward reference *
1095 nigel 91 *************************************************/
1096    
1097 ph10 408 /* This recursive function is called only from find_parens() below. The
1098     top-level call starts at the beginning of the pattern. All other calls must
1099     start at a parenthesis. It scans along a pattern's text looking for capturing
1100 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1101     name it is given, it returns its number. Alternatively, if the name is NULL, it
1102 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1103     encountered, the name will be terminated by '>' because that is checked in the
1104 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1105 ph10 408 capturing group numbers - the (?| feature.
1106 nigel 91
1107     Arguments:
1108 ph10 408 ptrptr address of the current character pointer (updated)
1109 ph10 345 cd compile background data
1110 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1111     lorn name length, or subpattern number if name is NULL
1112     xmode TRUE if we are in /x mode
1113 ph10 411 count pointer to the current capturing subpattern number (updated)
1114 nigel 91
1115     Returns: the number of the named subpattern, or -1 if not found
1116     */
1117    
/* Recursive worker for find_parens(): scans pattern text from *ptrptr, adding
one to *count for each capturing group it meets, and returns the group number
as soon as the wanted name (or, when name is NULL, the wanted number lorn) is
reached. Otherwise it returns -1, normally after storing the position where
scanning stopped back into *ptrptr. */
1118     static int
1119 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1120     BOOL xmode, int *count)
1121 nigel 91 {
1122 ph10 408 uschar *ptr = *ptrptr;
1123     int start_count = *count;
1124     int hwm_count = start_count;
1125     BOOL dup_parens = FALSE;
1126 nigel 93
1127 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1128 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1129    
1130     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1131     {
1132 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1133    
1134     if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1135    
1136     /* Handle a normal, unnamed capturing parenthesis. */
1137 ph10 408
1138 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1139 ph10 408 {
1140     *count += 1;
1141     if (name == NULL && *count == lorn) return *count;
1142 ph10 411 ptr++;
1143 ph10 408 }
1144    
1145 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1146     where the parenthesis numbers are duplicated. */
1147    
1148     else if (ptr[2] == CHAR_VERTICAL_LINE)
1149     {
1150     ptr += 3;
1151     dup_parens = TRUE;
1152     }
1153    
1154     /* Handle comments; all characters are allowed until a ket is reached. */
1155    
1156     else if (ptr[2] == CHAR_NUMBER_SIGN)
1157     {
/* Stop on the ')' (or on the terminating zero) and leave through FAIL_EXIT, so
this call returns -1 with *ptrptr left at that character. */
1158     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1159     goto FAIL_EXIT;
1160     }
1161    
1162 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1163     is processed as normal. If not, skip to the closing parenthesis of the
1164 ph10 544 condition (there can't be any nested parens). */
1165 ph10 411
1166 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1167     {
1168 ph10 411 ptr += 2;
1169 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1170     {
1171     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1172 ph10 411 if (*ptr != 0) ptr++;
1173 ph10 408 }
1174 ph10 411 }
1175    
1176 ph10 544 /* Start with (? but not a condition. */
1177 ph10 408
1178     else
1179 ph10 411 {
1180 ph10 408 ptr += 2;
1181     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1182    
1183     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1184 ph10 411
1185 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1186     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1187     {
1188     int term;
1189     const uschar *thisname;
1190     *count += 1;
1191     if (name == NULL && *count == lorn) return *count;
1192     term = *ptr++;
1193     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1194     thisname = ptr;
/* There is no test for the terminating zero in this loop; it depends on the
name terminator being present in the pattern. */
1195     while (*ptr != term) ptr++;
1196     if (name != NULL && lorn == ptr - thisname &&
1197     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1198     return *count;
/* NOTE(review): term is not read again after this increment; presumably it is
here only to quieten a compiler warning - confirm. */
1199 ph10 461 term++;
1200 ph10 411 }
1201 ph10 408 }
1202 ph10 411 }
1203 ph10 408
1204 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1205 ph10 408 bars. */
1206    
1207 nigel 91 for (; *ptr != 0; ptr++)
1208     {
1209 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1210    
1211 ph10 391 if (*ptr == CHAR_BACKSLASH)
1212 nigel 93 {
1213 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1214 ph10 391 if (*ptr == CHAR_Q) for (;;)
1215 nigel 93 {
1216 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1217 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1218 ph10 391 if (*(++ptr) == CHAR_E) break;
1219 nigel 93 }
1220     continue;
1221     }
1222    
1223 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1224     are handled for real. If the first character is '^', skip it. Also, if the
1225     first few characters (either before or after ^) are \Q\E or \E we skip them
1226 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1227 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1228 nigel 93
1229 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1230 nigel 93 {
1231 ph10 340 BOOL negate_class = FALSE;
1232     for (;;)
1233     {
1234 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1235 ph10 340 {
1236 ph10 438 if (ptr[2] == CHAR_E)
1237     ptr+= 2;
1238     else if (strncmp((const char *)ptr+2,
1239 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1240 ph10 438 ptr += 4;
1241 ph10 392 else
1242 ph10 391 break;
1243 ph10 340 }
1244 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1245 ph10 461 {
1246 ph10 340 negate_class = TRUE;
1247 ph10 438 ptr++;
1248 ph10 461 }
1249 ph10 340 else break;
1250     }
1251    
1252     /* If the next character is ']', it is a data character that must be
1253 ph10 341 skipped, except in JavaScript compatibility mode. */
1254 ph10 345
1255 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1256 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1257 ph10 345 ptr++;
1258    
1259 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1260 nigel 93 {
/* This end-of-pattern exit returns directly, so *ptrptr is NOT updated here,
unlike the exits that go through FAIL_EXIT. */
1261 ph10 220 if (*ptr == 0) return -1;
1262 ph10 391 if (*ptr == CHAR_BACKSLASH)
1263 nigel 93 {
1264 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1265 ph10 391 if (*ptr == CHAR_Q) for (;;)
1266 nigel 93 {
1267 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1268 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1269 ph10 391 if (*(++ptr) == CHAR_E) break;
1270 nigel 93 }
1271     continue;
1272     }
1273     }
1274     continue;
1275     }
1276    
1277     /* Skip comments in /x mode */
1278    
1279 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1280 nigel 93 {
1281 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1282 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1283 nigel 93 continue;
1284     }
1285    
1286 ph10 408 /* Check for the special metacharacters */
1287 ph10 411
1288 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1289 nigel 93 {
/* Recurse for the nested group. A positive result is the answer; otherwise
carry on from wherever the nested call left ptr, giving up if that is the end
of the pattern. */
1290 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1291     if (rc > 0) return rc;
1292     if (*ptr == 0) goto FAIL_EXIT;
1293 nigel 93 }
1294 ph10 411
1295 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1296     {
/* End of this group: for a (?| group, report the largest count reached by any
of its alternatives. */
1297     if (dup_parens && *count < hwm_count) *count = hwm_count;
1298 ph10 544 goto FAIL_EXIT;
1299 ph10 408 }
1300 ph10 411
1301     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1302 ph10 408 {
/* New alternative of a (?| group: remember the highest count seen so far and
restart numbering from the value on entry. */
1303     if (*count > hwm_count) hwm_count = *count;
1304     *count = start_count;
1305 ph10 411 }
1306 ph10 408 }
1307 nigel 93
/* Common "not found in this call" exit: hand the stopping position back to
the caller. */
1308 ph10 408 FAIL_EXIT:
1309     *ptrptr = ptr;
1310     return -1;
1311     }
1312 nigel 93
1313    
1314    
1315    
1316 ph10 408 /*************************************************
1317     * Find forward referenced subpattern *
1318     *************************************************/
1319 nigel 93
1320 ph10 408 /* This function scans along a pattern's text looking for capturing
1321     subpatterns, and counting them. If it finds a named pattern that matches the
1322     name it is given, it returns its number. Alternatively, if the name is NULL, it
1323     returns when it reaches a given numbered subpattern. This is used for forward
1324     references to subpatterns. We used to be able to start this scan from the
1325     current compiling point, using the current count value from cd->bracount, and
1326     do it all in a single loop, but the addition of the possibility of duplicate
1327     subpattern numbers means that we have to scan from the very start, in order to
1328     take account of such duplicates, and to use a recursive function to keep track
1329     of the different types of group.
1330    
1331     Arguments:
1332     cd compile background data
1333     name name to seek, or NULL if seeking a numbered subpattern
1334     lorn name length, or subpattern number if name is NULL
1335     xmode TRUE if we are in /x mode
1336    
1337     Returns: the number of the found subpattern, or -1 if not found
1338     */
1339    
1340     static int
1341     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1342     {
1343     uschar *ptr = (uschar *)cd->start_pattern;
1344     int count = 0;
1345     int rc;
1346    
1347     /* If the pattern does not start with an opening parenthesis, the first call
1348     to find_parens_sub() will scan right to the end (if necessary). However, if it
1349     does start with a parenthesis, find_parens_sub() will return when it hits the
1350     matching closing parens. That is why we have to have a loop. */
1351    
1352 ph10 411 for (;;)
1353     {
1354 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1355 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1356     }
1357    
1358 ph10 408 return rc;
1359 nigel 91 }
1360    
1361    
1362    
1363 ph10 408
1364 nigel 91 /*************************************************
1365 nigel 77 * Find first significant op code *
1366     *************************************************/
1367    
1368     /* This is called by several functions that scan a compiled expression looking
1369     for a fixed first character, or an anchoring op code etc. It skips over things
1370     that do not influence this. For some calls, a change of option is important.
1371     For some calls, it makes sense to skip negative forward and all backward
1372     assertions, and also the \b assertion; for others it does not.
1373    
1374     Arguments:
1375     code pointer to the start of the group
1376     options pointer to external options
1377     optbit the option bit whose changing is significant, or
1378     zero if none are
1379     skipassert TRUE if certain assertions are to be skipped
1380    
1381     Returns: pointer to the first significant opcode
1382     */
1383    
1384     static const uschar*
1385     first_significant_code(const uschar *code, int *options, int optbit,
1386     BOOL skipassert)
1387     {
1388     for (;;)
1389     {
1390     switch ((int)*code)
1391     {
1392     case OP_OPT:
1393     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1394     *options = (int)code[1];
1395     code += 2;
1396     break;
1397    
1398     case OP_ASSERT_NOT:
1399     case OP_ASSERTBACK:
1400     case OP_ASSERTBACK_NOT:
1401     if (!skipassert) return code;
1402     do code += GET(code, 1); while (*code == OP_ALT);
1403     code += _pcre_OP_lengths[*code];
1404     break;
1405    
1406     case OP_WORD_BOUNDARY:
1407     case OP_NOT_WORD_BOUNDARY:
1408     if (!skipassert) return code;
1409     /* Fall through */
1410    
1411     case OP_CALLOUT:
1412     case OP_CREF:
1413 ph10 459 case OP_NCREF:
1414 nigel 93 case OP_RREF:
1415 ph10 459 case OP_NRREF:
1416 nigel 93 case OP_DEF:
1417 nigel 77 code += _pcre_OP_lengths[*code];
1418     break;
1419    
1420     default:
1421     return code;
1422     }
1423     }
1424     /* Control never reaches here */
1425     }
1426    
1427    
1428    
1429    
1430     /*************************************************
1431 ph10 454 * Find the fixed length of a branch *
1432 nigel 77 *************************************************/
1433    
1434 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1435 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1436 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1437 ph10 454 temporarily terminated with OP_END when this function is called.
1438 nigel 77
1439 ph10 461 This function is called when a backward assertion is encountered, so that if it
1440     fails, the error message can point to the correct place in the pattern.
1441 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1442 ph10 461 because they can be forward references. We solve this by remembering this case
1443 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1444    
1445 nigel 77 Arguments:
1446     code points to the start of the pattern (the bracket)
1447     options the compiling options
1448 ph10 461 atend TRUE if called when the pattern is complete
1449     cd the "compile data" structure
1450 nigel 77
1451 ph10 461 Returns: the fixed length,
1452 ph10 454 or -1 if there is no fixed length,
1453 nigel 77 or -2 if \C was encountered
1454 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1455 nigel 77 */
1456    
/* Compute the fixed match length of a group by walking its compiled opcodes,
one branch at a time. Every alternative must yield the same length. Returns
that length, or a negative code: -1 (not fixed length), -2 (OP_ANYBYTE seen),
-3 (OP_RECURSE seen while atend is FALSE). */
1457     static int
1458 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1459 nigel 77 {
1460     int length = -1;
1461    
1462     register int branchlength = 0;
1463     register uschar *cc = code + 1 + LINK_SIZE;
1464    
1465     /* Scan along the opcodes for this branch. If we get to the end of the
1466     branch, check the length against that of the other branches. */
1467    
1468     for (;;)
1469     {
1470     int d;
1471 ph10 454 uschar *ce, *cs;
1472 nigel 77 register int op = *cc;
1473     switch (op)
1474     {
1475 nigel 93 case OP_CBRA:
1476 nigel 77 case OP_BRA:
1477     case OP_ONCE:
1478     case OP_COND:
/* Recurse into the nested group; a negative result is passed straight up. For
OP_CBRA the start is offset by 2 - presumably to step over the capture number
that follows the link (TODO confirm). */
1479 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1480 nigel 77 if (d < 0) return d;
1481     branchlength += d;
1482     do cc += GET(cc, 1); while (*cc == OP_ALT);
1483     cc += 1 + LINK_SIZE;
1484     break;
1485    
1486     /* Reached end of a branch; if it's a ket it is the end of a nested
1487     call. If it's ALT it is an alternation in a nested call. If it is
1488     END it's the end of the outer call. All can be handled by the same code. */
1489    
1490     case OP_ALT:
1491     case OP_KET:
1492     case OP_KETRMAX:
1493     case OP_KETRMIN:
1494     case OP_END:
/* The first branch to finish fixes the length; every later branch must agree
with it. */
1495     if (length < 0) length = branchlength;
1496     else if (length != branchlength) return -1;
1497     if (*cc != OP_ALT) return length;
1498     cc += 1 + LINK_SIZE;
1499     branchlength = 0;
1500     break;
1501 ph10 461
1502 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1503     be OK. If the subroutine is a forward reference, we can't deal with
1504     it until the end of the pattern, so return -3. */
1505 ph10 461
1506 ph10 454 case OP_RECURSE:
1507     if (!atend) return -3;
1508     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1509     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1510     if (cc > cs && cc < ce) return -1; /* Recursion */
/* NOTE(review): cs + 2 mirrors the OP_CBRA offset used above, so this appears
to assume that the call target is a capturing group - confirm. */
1511     d = find_fixedlength(cs + 2, options, atend, cd);
1512 ph10 461 if (d < 0) return d;
1513 ph10 454 branchlength += d;
1514     cc += 1 + LINK_SIZE;
1515 ph10 461 break;
1516 nigel 77
1517     /* Skip over assertive subpatterns */
1518    
1519     case OP_ASSERT:
1520     case OP_ASSERT_NOT:
1521     case OP_ASSERTBACK:
1522     case OP_ASSERTBACK_NOT:
1523     do cc += GET(cc, 1); while (*cc == OP_ALT);
1524     /* Fall through */
1525    
1526     /* Skip over things that don't match chars */
1527    
1528     case OP_REVERSE:
1529     case OP_CREF:
1530 ph10 459 case OP_NCREF:
1531 nigel 93 case OP_RREF:
1532 ph10 459 case OP_NRREF:
1533 nigel 93 case OP_DEF:
1534 nigel 77 case OP_OPT:
1535     case OP_CALLOUT:
1536     case OP_SOD:
1537     case OP_SOM:
1538 ph10 500 case OP_SET_SOM:
1539 nigel 77 case OP_EOD:
1540     case OP_EODN:
1541     case OP_CIRC:
1542     case OP_DOLL:
1543     case OP_NOT_WORD_BOUNDARY:
1544     case OP_WORD_BOUNDARY:
1545     cc += _pcre_OP_lengths[*cc];
1546     break;
1547    
1548     /* Handle literal characters */
1549    
1550     case OP_CHAR:
1551     case OP_CHARNC:
1552 nigel 91 case OP_NOT:
1553 nigel 77 branchlength++;
1554     cc += 2;
1555     #ifdef SUPPORT_UTF8
/* cc[-1] is the data byte just stepped over. A value >= 0xc0 is treated as
the first byte of a multibyte character, and the table lookup supplies how
many further bytes to skip - TODO confirm the table's exact meaning. */
1556 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1557 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1558 nigel 77 #endif
1559     break;
1560    
1561     /* Handle exact repetitions. The count is already in characters, but we
1562     need to skip over a multibyte character in UTF8 mode. */
1563    
1564     case OP_EXACT:
1565     branchlength += GET2(cc,1);
1566     cc += 4;
1567     #ifdef SUPPORT_UTF8
1568 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1569 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1570 nigel 77 #endif
1571     break;
1572    
1573     case OP_TYPEEXACT:
1574     branchlength += GET2(cc,1);
/* A property type carries 2 extra bytes after the type opcode, so skip those
in addition to the usual 4. */
1575 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1576 nigel 77 cc += 4;
1577     break;
1578    
1579     /* Handle single-char matchers */
1580    
1581     case OP_PROP:
1582     case OP_NOTPROP:
1583 nigel 87 cc += 2;
1584 nigel 77 /* Fall through */
1585    
1586     case OP_NOT_DIGIT:
1587     case OP_DIGIT:
1588     case OP_NOT_WHITESPACE:
1589     case OP_WHITESPACE:
1590     case OP_NOT_WORDCHAR:
1591     case OP_WORDCHAR:
1592     case OP_ANY:
1593 ph10 342 case OP_ALLANY:
1594 nigel 77 branchlength++;
1595     cc++;
1596     break;
1597    
1598     /* The single-byte matcher isn't allowed */
1599    
1600     case OP_ANYBYTE:
1601     return -2;
1602    
1603     /* Check a class for variable quantification */
1604    
1605     #ifdef SUPPORT_UTF8
1606     case OP_XCLASS:
/* Pre-adjust so that the shared "cc += 33" below advances by exactly the
value read from the code with GET(cc, 1). */
1607     cc += GET(cc, 1) - 33;
1608     /* Fall through */
1609     #endif
1610    
1611     case OP_CLASS:
1612     case OP_NCLASS:
1613     cc += 33;
1614    
/* cc now addresses whatever follows the class; only an exact-count repeat (or
no repeat at all) keeps the length fixed. */
1615     switch (*cc)
1616     {
1617     case OP_CRSTAR:
1618     case OP_CRMINSTAR:
1619     case OP_CRQUERY:
1620     case OP_CRMINQUERY:
1621     return -1;
1622    
1623     case OP_CRRANGE:
1624     case OP_CRMINRANGE:
1625     if (GET2(cc,1) != GET2(cc,3)) return -1;
1626     branchlength += GET2(cc,1);
1627     cc += 5;
1628     break;
1629    
/* Any other following opcode: count the class as one character and leave cc
where it is, so the outer loop handles that opcode next. */
1630     default:
1631     branchlength++;
1632     }
1633     break;
1634    
1635     /* Anything else is variable length */
1636    
1637     default:
1638     return -1;
1639     }
1640     }
1641     /* Control never gets here */
1642     }
1643    
1644    
1645    
1646    
1647     /*************************************************
1648 ph10 454 * Scan compiled regex for specific bracket *
1649 nigel 77 *************************************************/
1650    
1651     /* This little function scans through a compiled pattern until it finds a
1652 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1653 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1654     so that it can be called from pcre_study() when finding the minimum matching
1655 ph10 455 length.
1656 nigel 77
1657     Arguments:
1658     code points to start of expression
1659     utf8 TRUE in UTF-8 mode
1660 ph10 454 number the required bracket number or negative to find a lookbehind
1661 nigel 77
1662     Returns: pointer to the opcode for the bracket, or NULL if not found
1663     */
1664    
1665 ph10 455 const uschar *
1666     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1667 nigel 77 {
1668     for (;;)
1669     {
1670     register int c = *code;
1671     if (c == OP_END) return NULL;
1672 nigel 91
1673     /* XCLASS is used for classes that cannot be represented just by a bit
1674     map. This includes negated single high-valued characters. The length in
1675     the table is zero; the actual length is stored in the compiled code. */
1676    
1677     if (c == OP_XCLASS) code += GET(code, 1);
1678 ph10 461
1679 ph10 454 /* Handle recursion */
1680 ph10 461
1681 ph10 454 else if (c == OP_REVERSE)
1682     {
1683 ph10 461 if (number < 0) return (uschar *)code;
1684 ph10 454 code += _pcre_OP_lengths[c];
1685     }
1686 nigel 91
1687 nigel 93 /* Handle capturing bracket */
1688 nigel 91
1689 nigel 93 else if (c == OP_CBRA)
1690 nigel 77 {
1691 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1692 nigel 77 if (n == number) return (uschar *)code;
1693 nigel 93 code += _pcre_OP_lengths[c];
1694 nigel 77 }
1695 nigel 91
1696 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1697     repeated character types, we have to test for \p and \P, which have an extra
1698 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1699 ph10 510 must add in its length. */
1700 nigel 91
1701 nigel 77 else
1702     {
1703 ph10 218 switch(c)
1704     {
1705     case OP_TYPESTAR:
1706     case OP_TYPEMINSTAR:
1707     case OP_TYPEPLUS:
1708     case OP_TYPEMINPLUS:
1709     case OP_TYPEQUERY:
1710     case OP_TYPEMINQUERY:
1711     case OP_TYPEPOSSTAR:
1712     case OP_TYPEPOSPLUS:
1713     case OP_TYPEPOSQUERY:
1714     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1715 ph10 220 break;
1716 ph10 221
1717     case OP_TYPEUPTO:
1718     case OP_TYPEMINUPTO:
1719     case OP_TYPEEXACT:
1720     case OP_TYPEPOSUPTO:
1721     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1722     break;
1723 ph10 512
1724 ph10 510 case OP_MARK:
1725     case OP_PRUNE_ARG:
1726     case OP_SKIP_ARG:
1727     case OP_THEN_ARG:
1728     code += code[1];
1729 ph10 512 break;
1730 ph10 220 }
1731    
1732 ph10 218 /* Add in the fixed length from the table */
1733 ph10 220
1734 nigel 77 code += _pcre_OP_lengths[c];
1735 ph10 220
1736 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1737     a multi-byte character. The length in the table is a minimum, so we have to
1738     arrange to skip the extra bytes. */
1739 ph10 220
1740 ph10 107 #ifdef SUPPORT_UTF8
1741 nigel 77 if (utf8) switch(c)
1742     {
1743     case OP_CHAR:
1744     case OP_CHARNC:
1745     case OP_EXACT:
1746     case OP_UPTO:
1747     case OP_MINUPTO:
1748 nigel 93 case OP_POSUPTO:
1749 nigel 77 case OP_STAR:
1750     case OP_MINSTAR:
1751 nigel 93 case OP_POSSTAR:
1752 nigel 77 case OP_PLUS:
1753     case OP_MINPLUS:
1754 nigel 93 case OP_POSPLUS:
1755 nigel 77 case OP_QUERY:
1756     case OP_MINQUERY:
1757 nigel 93 case OP_POSQUERY:
1758     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1759 nigel 77 break;
1760     }
1761 ph10 369 #else
1762     (void)(utf8); /* Keep compiler happy by referencing function argument */
1763 ph10 111 #endif
1764 nigel 77 }
1765     }
1766     }
1767    
1768    
1769    
1770     /*************************************************
1771     * Scan compiled regex for recursion reference *
1772     *************************************************/
1773    
1774     /* This little function scans through a compiled pattern until it finds an
1775     instance of OP_RECURSE.
1776    
1777     Arguments:
1778     code points to start of expression
1779     utf8 TRUE in UTF-8 mode
1780    
1781     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1782     */
1783    
1784     static const uschar *
1785     find_recurse(const uschar *code, BOOL utf8)
1786     {
1787     for (;;)
1788     {
1789     register int c = *code;
1790     if (c == OP_END) return NULL;
1791 nigel 91 if (c == OP_RECURSE) return code;
1792 ph10 220
1793 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1794     map. This includes negated single high-valued characters. The length in
1795     the table is zero; the actual length is stored in the compiled code. */
1796    
1797     if (c == OP_XCLASS) code += GET(code, 1);
1798    
1799 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1800     repeated character types, we have to test for \p and \P, which have an extra
1801 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1802 ph10 510 must add in its length. */
1803 nigel 91
1804 nigel 77 else
1805     {
1806 ph10 218 switch(c)
1807     {
1808     case OP_TYPESTAR:
1809     case OP_TYPEMINSTAR:
1810     case OP_TYPEPLUS:
1811     case OP_TYPEMINPLUS:
1812     case OP_TYPEQUERY:
1813     case OP_TYPEMINQUERY:
1814     case OP_TYPEPOSSTAR:
1815     case OP_TYPEPOSPLUS:
1816     case OP_TYPEPOSQUERY:
1817     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1818 ph10 220 break;
1819 ph10 221
1820     case OP_TYPEPOSUPTO:
1821     case OP_TYPEUPTO:
1822     case OP_TYPEMINUPTO:
1823     case OP_TYPEEXACT:
1824     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1825     break;
1826 ph10 512
1827 ph10 510 case OP_MARK:
1828     case OP_PRUNE_ARG:
1829     case OP_SKIP_ARG:
1830     case OP_THEN_ARG:
1831     code += code[1];
1832 ph10 512 break;
1833 ph10 220 }
1834    
1835 ph10 218 /* Add in the fixed length from the table */
1836    
1837 nigel 77 code += _pcre_OP_lengths[c];
1838 ph10 220
1839 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1840     by a multi-byte character. The length in the table is a minimum, so we have
1841     to arrange to skip the extra bytes. */
1842 ph10 220
1843 ph10 107 #ifdef SUPPORT_UTF8
1844 nigel 77 if (utf8) switch(c)
1845     {
1846     case OP_CHAR:
1847     case OP_CHARNC:
1848     case OP_EXACT:
1849     case OP_UPTO:
1850     case OP_MINUPTO:
1851 nigel 93 case OP_POSUPTO:
1852 nigel 77 case OP_STAR:
1853     case OP_MINSTAR:
1854 nigel 93 case OP_POSSTAR:
1855 nigel 77 case OP_PLUS:
1856     case OP_MINPLUS:
1857 nigel 93 case OP_POSPLUS:
1858 nigel 77 case OP_QUERY:
1859     case OP_MINQUERY:
1860 nigel 93 case OP_POSQUERY:
1861     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1862 nigel 77 break;
1863     }
1864 ph10 369 #else
1865     (void)(utf8); /* Keep compiler happy by referencing function argument */
1866 ph10 111 #endif
1867 nigel 77 }
1868     }
1869     }
1870    
1871    
1872    
1873     /*************************************************
1874     * Scan compiled branch for non-emptiness *
1875     *************************************************/
1876    
1877     /* This function scans through a branch of a compiled pattern to see whether it
1878 nigel 93 can match the empty string or not. It is called from could_be_empty()
1879     below and from compile_branch() when checking for an unlimited repeat of a
1880     group that can match nothing. Note that first_significant_code() skips over
1881 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1882     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1883     bracket whose current branch will already have been scanned.
1884 nigel 77
1885     Arguments:
1886     code points to start of search
1887     endcode points to where to stop
1888     utf8 TRUE if in UTF8 mode
1889 ph10 503 cd contains pointers to tables etc.
1890 nigel 77
1891     Returns: TRUE if what is matched could be empty
1892     */
1893    
1894     static BOOL
1895 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1896     compile_data *cd)
1897 nigel 77 {
1898     register int c;
1899 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1900 nigel 77 code < endcode;
1901     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1902     {
1903     const uschar *ccode;
1904    
1905     c = *code;
1906 ph10 507
1907 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1908 ph10 282 first_significant_code() with a TRUE final argument. */
1909 ph10 286
1910 ph10 282 if (c == OP_ASSERT)
1911 ph10 286 {
1912 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1913     c = *code;
1914     continue;
1915 ph10 286 }
1916 ph10 172
1917 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1918 nigel 77
1919 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1920 ph10 170 {
1921 ph10 172 code += _pcre_OP_lengths[c];
1922 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1923     c = *code;
1924     continue;
1925     }
1926 ph10 507
1927 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1928     implies a subroutine call, we can scan it. */
1929 ph10 507
1930 ph10 503 if (c == OP_RECURSE)
1931     {
1932 ph10 507 BOOL empty_branch = FALSE;
1933 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1934     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1935     do
1936     {
1937 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1938     {
1939     empty_branch = TRUE;
1940 ph10 507 break;
1941     }
1942 ph10 503 scode += GET(scode, 1);
1943     }
1944     while (*scode == OP_ALT);
1945 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1946 ph10 503 continue;
1947 ph10 507 }
1948 ph10 170
1949     /* For other groups, scan the branches. */
1950 ph10 172
1951 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1952 nigel 77 {
1953     BOOL empty_branch;
1954     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1955 ph10 406
1956     /* If a conditional group has only one branch, there is a second, implied,
1957 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1958     Otherwise, scan the individual branches of the group. */
1959 ph10 406
1960 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1961 nigel 77 code += GET(code, 1);
1962 ph10 395 else
1963 ph10 406 {
1964 ph10 395 empty_branch = FALSE;
1965     do
1966     {
1967 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1968 ph10 395 empty_branch = TRUE;
1969     code += GET(code, 1);
1970     }
1971     while (*code == OP_ALT);
1972     if (!empty_branch) return FALSE; /* All branches are non-empty */
1973 nigel 77 }
1974 ph10 406
1975 ph10 172 c = *code;
1976 nigel 93 continue;
1977 nigel 77 }
1978    
1979 nigel 93 /* Handle the other opcodes */
1980    
1981     switch (c)
1982 nigel 77 {
1983 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1984     cannot be represented just by a bit map. This includes negated single
1985     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1986 ph10 220 actual length is stored in the compiled code, so we must update "code"
1987 ph10 216 here. */
1988 nigel 77
1989     #ifdef SUPPORT_UTF8
1990     case OP_XCLASS:
1991 ph10 216 ccode = code += GET(code, 1);
1992 nigel 77 goto CHECK_CLASS_REPEAT;
1993     #endif
1994    
1995     case OP_CLASS:
1996     case OP_NCLASS:
1997     ccode = code + 33;
1998    
1999     #ifdef SUPPORT_UTF8
2000     CHECK_CLASS_REPEAT:
2001     #endif
2002    
2003     switch (*ccode)
2004     {
2005     case OP_CRSTAR: /* These could be empty; continue */
2006     case OP_CRMINSTAR:
2007     case OP_CRQUERY:
2008     case OP_CRMINQUERY:
2009     break;
2010    
2011     default: /* Non-repeat => class must match */
2012     case OP_CRPLUS: /* These repeats aren't empty */
2013     case OP_CRMINPLUS:
2014     return FALSE;
2015    
2016     case OP_CRRANGE:
2017     case OP_CRMINRANGE:
2018     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2019     break;
2020     }
2021     break;
2022    
2023     /* Opcodes that must match a character */
2024    
2025     case OP_PROP:
2026     case OP_NOTPROP:
2027     case OP_EXTUNI:
2028     case OP_NOT_DIGIT:
2029     case OP_DIGIT:
2030     case OP_NOT_WHITESPACE:
2031     case OP_WHITESPACE:
2032     case OP_NOT_WORDCHAR:
2033     case OP_WORDCHAR:
2034     case OP_ANY:
2035 ph10 345 case OP_ALLANY:
2036 nigel 77 case OP_ANYBYTE:
2037     case OP_CHAR:
2038     case OP_CHARNC:
2039     case OP_NOT:
2040     case OP_PLUS:
2041     case OP_MINPLUS:
2042 nigel 93 case OP_POSPLUS:
2043 nigel 77 case OP_EXACT:
2044     case OP_NOTPLUS:
2045     case OP_NOTMINPLUS:
2046 nigel 93 case OP_NOTPOSPLUS:
2047 nigel 77 case OP_NOTEXACT:
2048     case OP_TYPEPLUS:
2049     case OP_TYPEMINPLUS:
2050 nigel 93 case OP_TYPEPOSPLUS:
2051 nigel 77 case OP_TYPEEXACT:
2052     return FALSE;
2053 ph10 227
2054     /* These are going to continue, as they may be empty, but we have to
2055     fudge the length for the \p and \P cases. */
2056    
2057 ph10 224 case OP_TYPESTAR:
2058     case OP_TYPEMINSTAR:
2059     case OP_TYPEPOSSTAR:
2060     case OP_TYPEQUERY:
2061     case OP_TYPEMINQUERY:
2062     case OP_TYPEPOSQUERY:
2063     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2064 ph10 227 break;
2065    
2066 ph10 224 /* Same for these */
2067 ph10 227
2068 ph10 224 case OP_TYPEUPTO:
2069     case OP_TYPEMINUPTO:
2070     case OP_TYPEPOSUPTO:
2071     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2072     break;
2073 nigel 77
2074     /* End of branch */
2075    
2076     case OP_KET:
2077     case OP_KETRMAX:
2078     case OP_KETRMIN:
2079     case OP_ALT:
2080     return TRUE;
2081    
2082 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2083     MINUPTO, and POSUPTO may be followed by a multibyte character */
2084 nigel 77
2085     #ifdef SUPPORT_UTF8
2086     case OP_STAR:
2087     case OP_MINSTAR:
2088 nigel 93 case OP_POSSTAR:
2089 nigel 77 case OP_QUERY:
2090     case OP_MINQUERY:
2091 nigel 93 case OP_POSQUERY:
2092 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2093     break;
2094 ph10 461
2095 nigel 77 case OP_UPTO:
2096     case OP_MINUPTO:
2097 nigel 93 case OP_POSUPTO:
2098 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2099 nigel 77 break;
2100     #endif
2101 ph10 503
2102 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2103     string. */
2104    
2105     case OP_MARK:
2106     case OP_PRUNE_ARG:
2107     case OP_SKIP_ARG:
2108     case OP_THEN_ARG:
2109     code += code[1];
2110 ph10 512 break;
2111 ph10 510
2112 ph10 503 /* None of the remaining opcodes are required to match a character. */
2113 ph10 507
2114 ph10 503 default:
2115 ph10 507 break;
2116 nigel 77 }
2117     }
2118    
2119     return TRUE;
2120     }
2121    
2122    
2123    
2124     /*************************************************
2125     * Scan compiled regex for non-emptiness *
2126     *************************************************/
2127    
2128     /* This function is called to check for left recursive calls. We want to check
2129     the current branch of the current pattern to see if it could match the empty
2130     string. If it could, we must look outwards for branches at other levels,
2131     stopping when we pass beyond the bracket which is the subject of the recursion.
2132    
2133     Arguments:
2134     code points to start of the recursion
2135     endcode points to where to stop (current RECURSE item)
2136     bcptr points to the chain of current (unclosed) branch starts
2137     utf8 TRUE if in UTF-8 mode
2138 ph10 507 cd pointers to tables etc
2139 nigel 77
2140     Returns: TRUE if what is matched could be empty
2141     */
2142    
2143     static BOOL
2144     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2145 ph10 503 BOOL utf8, compile_data *cd)
2146 nigel 77 {
2147 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2148 nigel 77 {
2149 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2150 ph10 475 return FALSE;
2151 nigel 77 bcptr = bcptr->outer;
2152     }
2153     return TRUE;
2154     }
2155    
2156    
2157    
2158     /*************************************************
2159     * Check for POSIX class syntax *
2160     *************************************************/
2161    
2162     /* This function is called when the sequence "[:" or "[." or "[=" is
2163 ph10 295 encountered in a character class. It checks whether this is followed by a
2164 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2165 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2166 nigel 77
2167 ph10 298 Originally, this function only recognized a sequence of letters between the
2168     terminators, but it seems that Perl recognizes any sequence of characters,
2169     though of course unknown POSIX names are subsequently rejected. Perl gives an
2170     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2171     didn't consider this to be a POSIX class. Likewise for [:1234:].
2172 ph10 295
2173 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2174     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2175     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2176     below handles the special case of \], but does not try to do any other escape
2177     processing. This makes it different from Perl for cases such as [:l\ower:]
2178 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2179 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2180 ph10 295 I think.
2181    
2182     Arguments:
2183 nigel 77 ptr pointer to the initial [
2184     endptr where to return the end pointer
2185    
2186     Returns: TRUE or FALSE
2187     */
2188    
2189     static BOOL
2190 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2191 nigel 77 {
2192     int terminator; /* Don't combine these lines; the Solaris cc */
2193     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2194 ph10 295 for (++ptr; *ptr != 0; ptr++)
2195 nigel 77 {
2196 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2197 ph10 298 {
2198 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2199     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2200 ph10 295 {
2201     *endptr = ptr;
2202     return TRUE;
2203 ph10 298 }
2204     }
2205     }
2206 nigel 77 return FALSE;
2207     }
2208    
2209    
2210    
2211    
2212     /*************************************************
2213     * Check POSIX class name *
2214     *************************************************/
2215    
2216     /* This function is called to check the name given in a POSIX-style class entry
2217     such as [:alnum:].
2218    
2219     Arguments:
2220     ptr points to the first letter
2221     len the length of the name
2222    
2223     Returns: a value representing the name, or -1 if unknown
2224     */
2225    
2226     static int
2227     check_posix_name(const uschar *ptr, int len)
2228     {
2229 ph10 240 const char *pn = posix_names;
2230 nigel 77 register int yield = 0;
2231     while (posix_name_lengths[yield] != 0)
2232     {
2233     if (len == posix_name_lengths[yield] &&
2234 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2235 ph10 243 pn += posix_name_lengths[yield] + 1;
2236 nigel 77 yield++;
2237     }
2238     return -1;
2239     }
2240    
2241    
2242     /*************************************************
2243     * Adjust OP_RECURSE items in repeated group *
2244     *************************************************/
2245    
2246     /* OP_RECURSE items contain an offset from the start of the regex to the group
2247     that is referenced. This means that groups can be replicated for fixed
2248     repetition simply by copying (because the recursion is allowed to refer to
2249     earlier groups that are outside the current group). However, when a group is
2250 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2251     inserted before it, after it has been compiled. This means that any OP_RECURSE
2252     items within it that refer to the group itself or any contained groups have to
2253     have their offsets adjusted. That is one of the jobs of this function. Before it
2254     is called, the partially compiled regex must be temporarily terminated with
2255     OP_END.
2256 nigel 77
2257 nigel 93 This function has been extended with the possibility of forward references for
2258     recursions and subroutine calls. It must also check the list of such references
2259     for the group we are dealing with. If it finds that one of the recursions in
2260     the current group is on this list, it adjusts the offset in the list, not the
2261     value in the reference (which is a group number).
2262    
2263 nigel 77 Arguments:
2264     group points to the start of the group
2265     adjust the amount by which the group is to be moved
2266     utf8 TRUE in UTF-8 mode
2267     cd contains pointers to tables etc.
2268 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2269 nigel 77
2270     Returns: nothing
2271     */
2272    
2273     static void
2274 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2275     uschar *save_hwm)
2276 nigel 77 {
2277     uschar *ptr = group;
2278 ph10 224
2279 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2280     {
2281 nigel 93 int offset;
2282     uschar *hc;
2283    
2284     /* See if this recursion is on the forward reference list. If so, adjust the
2285     reference. */
2286 ph10 345
2287 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2288     {
2289     offset = GET(hc, 0);
2290     if (cd->start_code + offset == ptr + 1)
2291     {
2292     PUT(hc, 0, offset + adjust);
2293     break;
2294     }
2295     }
2296    
2297     /* Otherwise, adjust the recursion offset if it's after the start of this
2298     group. */
2299    
2300     if (hc >= cd->hwm)
2301     {
2302     offset = GET(ptr, 1);
2303     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2304     }
2305    
2306 nigel 77 ptr += 1 + LINK_SIZE;
2307     }
2308     }
2309    
2310    
2311    
2312     /*************************************************
2313     * Insert an automatic callout point *
2314     *************************************************/
2315    
2316     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2317     callout points before each pattern item.
2318    
2319     Arguments:
2320     code current code pointer
2321     ptr current pattern pointer
2322     cd pointers to tables etc
2323    
2324     Returns: new code pointer
2325     */
2326    
2327     static uschar *
2328     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2329     {
2330     *code++ = OP_CALLOUT;
2331     *code++ = 255;
2332 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2333     PUT(code, LINK_SIZE, 0); /* Default length */
2334 nigel 77 return code + 2*LINK_SIZE;
2335     }
2336    
2337    
2338    
2339     /*************************************************
2340     * Complete a callout item *
2341     *************************************************/
2342    
2343     /* A callout item contains the length of the next item in the pattern, which
2344     we can't fill in till after we have reached the relevant point. This is used
2345     for both automatic and manual callouts.
2346    
2347     Arguments:
2348     previous_callout points to previous callout item
2349     ptr current pattern pointer
2350     cd pointers to tables etc
2351    
2352     Returns: nothing
2353     */
2354    
2355     static void
2356     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2357     {
2358 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2359 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2360     }
2361    
2362    
2363    
2364     #ifdef SUPPORT_UCP
2365     /*************************************************
2366     * Get othercase range *
2367     *************************************************/
2368    
2369     /* This function is passed the start and end of a class range, in UTF-8 mode
2370     with UCP support. It searches up the characters, looking for internal ranges of
2371     characters in the "other" case. Each call returns the next one, updating the
2372     start address.
2373    
2374     Arguments:
2375     cptr points to starting character value; updated
2376     d end value
2377     ocptr where to put start of othercase range
2378     odptr where to put end of othercase range
2379    
2380     Yield: TRUE when range returned; FALSE when no more
2381     */
2382    
2383     static BOOL
2384 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2385     unsigned int *odptr)
2386 nigel 77 {
2387 nigel 93 unsigned int c, othercase, next;
2388 nigel 77
2389     for (c = *cptr; c <= d; c++)
2390 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2391 nigel 77
2392     if (c > d) return FALSE;
2393    
2394     *ocptr = othercase;
2395     next = othercase + 1;
2396    
2397     for (++c; c <= d; c++)
2398     {
2399 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2400 nigel 77 next++;
2401     }
2402    
2403     *odptr = next - 1;
2404     *cptr = c;
2405    
2406     return TRUE;
2407     }
2408 ph10 532
2409    
2410    
2411     /*************************************************
2412     * Check a character and a property *
2413     *************************************************/
2414    
2415     /* This function is called by check_auto_possessive() when a property item
2416     is adjacent to a fixed character.
2417    
2418     Arguments:
2419     c the character
2420     ptype the property type
2421     pdata the data for the type
2422     negated TRUE if it's a negated property (\P or \p{^)
2423 ph10 535
2424 ph10 532 Returns: TRUE if auto-possessifying is OK
2425 ph10 535 */
2426 ph10 532
2427     static BOOL
2428     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2429     {
2430     const ucd_record *prop = GET_UCD(c);
2431     switch(ptype)
2432     {
2433     case PT_LAMP:
2434     return (prop->chartype == ucp_Lu ||
2435     prop->chartype == ucp_Ll ||
2436     prop->chartype == ucp_Lt) == negated;
2437    
2438     case PT_GC:
2439     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2440    
2441     case PT_PC:
2442     return (pdata == prop->chartype) == negated;
2443    
2444     case PT_SC:
2445     return (pdata == prop->script) == negated;
2446    
2447     /* These are specials */
2448    
2449     case PT_ALNUM:
2450     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2451     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2452    
2453     case PT_SPACE: /* Perl space */
2454     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2455     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2456     == negated;
2457    
2458     case PT_PXSPACE: /* POSIX space */
2459     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2460     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2461     c == CHAR_FF || c == CHAR_CR)
2462     == negated;
2463    
2464     case PT_WORD:
2465     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2466     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2467     c == CHAR_UNDERSCORE) == negated;
2468     }
2469 ph10 535 return FALSE;
2470 ph10 532 }
2471 nigel 77 #endif /* SUPPORT_UCP */
2472    
2473    
2474 nigel 93
2475 nigel 77 /*************************************************
2476 nigel 93 * Check if auto-possessifying is possible *
2477     *************************************************/
2478    
2479     /* This function is called for unlimited repeats of certain items, to see
2480     whether the next thing could possibly match the repeated item. If not, it makes
2481     sense to automatically possessify the repeated item.
2482    
2483     Arguments:
2484 ph10 532 previous pointer to the repeated opcode
2485 nigel 93 utf8 TRUE in UTF-8 mode
2486     ptr next character in pattern
2487     options options bits
2488     cd contains pointers to tables etc.
2489    
2490     Returns: TRUE if possessifying is wanted
2491     */
2492    
2493     static BOOL
2494 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2495 ph10 532 int options, compile_data *cd)
2496 nigel 93 {
2497 ph10 532 int c, next;
2498     int op_code = *previous++;
2499 nigel 93
2500     /* Skip whitespace and comments in extended mode */
2501    
2502     if ((options & PCRE_EXTENDED) != 0)
2503     {
2504     for (;;)
2505     {
2506     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2507 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2508 nigel 93 {
2509     while (*(++ptr) != 0)
2510     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2511     }
2512     else break;
2513     }
2514     }
2515    
2516     /* If the next item is one that we can handle, get its value. A non-negative
2517     value is a character, a negative value is an escape value. */
2518    
2519 ph10 391 if (*ptr == CHAR_BACKSLASH)
2520 nigel 93 {
2521     int temperrorcode = 0;
2522     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2523     if (temperrorcode != 0) return FALSE;
2524     ptr++; /* Point after the escape sequence */
2525     }
2526    
2527     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2528     {
2529     #ifdef SUPPORT_UTF8
2530     if (utf8) { GETCHARINC(next, ptr); } else
2531     #endif
2532     next = *ptr++;
2533     }
2534    
2535     else return FALSE;
2536    
2537     /* Skip whitespace and comments in extended mode */
2538    
2539     if ((options & PCRE_EXTENDED) != 0)
2540     {
2541     for (;;)
2542     {
2543     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2544 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2545 nigel 93 {
2546     while (*(++ptr) != 0)
2547     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2548     }
2549     else break;
2550     }
2551     }
2552    
2553     /* If the next thing is itself optional, we have to give up. */
2554    
2555 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2556 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2557     return FALSE;
2558 nigel 93
2559 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2560     the next item is a character. */
2561 nigel 93
2562     if (next >= 0) switch(op_code)
2563     {
2564     case OP_CHAR:
2565 ph10 535 #ifdef SUPPORT_UTF8
2566 ph10 532 GETCHARTEST(c, previous);
2567 ph10 369 #else
2568 ph10 532 c = *previous;
2569 ph10 535 #endif
2570     return c != next;
2571 nigel 93
2572     /* For CHARNC (caseless character) we must check the other case. If we have
2573     Unicode property support, we can use it to test the other case of
2574     high-valued characters. */
2575    
2576     case OP_CHARNC:
2577 ph10 535 #ifdef SUPPORT_UTF8
2578 ph10 532 GETCHARTEST(c, previous);
2579     #else
2580     c = *previous;
2581 ph10 535 #endif
2582 ph10 532 if (c == next) return FALSE;
2583 nigel 93 #ifdef SUPPORT_UTF8
2584     if (utf8)
2585     {
2586     unsigned int othercase;
2587     if (next < 128) othercase = cd->fcc[next]; else
2588     #ifdef SUPPORT_UCP
2589 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2590 nigel 93 #else
2591     othercase = NOTACHAR;
2592     #endif
2593 ph10 532 return (unsigned int)c != othercase;
2594 nigel 93 }
2595     else
2596     #endif /* SUPPORT_UTF8 */
2597 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2598 nigel 93
2599 ph10 532 /* For OP_NOT, its data is always a single-byte character. */
2600 nigel 93
2601     case OP_NOT:
2602 ph10 532 if ((c = *previous) == next) return TRUE;
2603 nigel 93 if ((options & PCRE_CASELESS) == 0) return FALSE;
2604     #ifdef SUPPORT_UTF8
2605     if (utf8)
2606     {
2607     unsigned int othercase;
2608     if (next < 128) othercase = cd->fcc[next]; else
2609     #ifdef SUPPORT_UCP
2610 ph10 349 othercase = UCD_OTHERCASE(next);
2611 nigel 93 #else
2612     othercase = NOTACHAR;
2613     #endif
2614 ph10 532 return (unsigned int)c == othercase;
2615 nigel 93 }
2616     else
2617     #endif /* SUPPORT_UTF8 */
2618 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2619 nigel 93
2620 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2621     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2622    
2623 nigel 93 case OP_DIGIT:
2624     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2625    
2626     case OP_NOT_DIGIT:
2627     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2628    
2629     case OP_WHITESPACE:
2630     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2631    
2632     case OP_NOT_WHITESPACE:
2633     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2634    
2635     case OP_WORDCHAR:
2636     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2637    
2638     case OP_NOT_WORDCHAR:
2639     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2640    
2641 ph10 180 case OP_HSPACE:
2642     case OP_NOT_HSPACE:
2643     switch(next)
2644     {
2645     case 0x09:
2646     case 0x20:
2647     case 0xa0:
2648     case 0x1680:
2649     case 0x180e:
2650     case 0x2000:
2651     case 0x2001:
2652     case 0x2002:
2653     case 0x2003:
2654     case 0x2004:
2655     case 0x2005:
2656     case 0x2006:
2657     case 0x2007:
2658     case 0x2008:
2659     case 0x2009:
2660     case 0x200A:
2661     case 0x202f:
2662     case 0x205f:
2663     case 0x3000:
2664 ph10 528 return op_code == OP_NOT_HSPACE;
2665 ph10 180 default:
2666 ph10 528 return op_code != OP_NOT_HSPACE;
2667 ph10 180 }
2668    
2669 ph10 528 case OP_ANYNL:
2670 ph10 180 case OP_VSPACE:
2671     case OP_NOT_VSPACE:
2672     switch(next)
2673     {
2674     case 0x0a:
2675     case 0x0b:
2676     case 0x0c:
2677     case 0x0d:
2678     case 0x85:
2679     case 0x2028:
2680     case 0x2029:
2681 ph10 528 return op_code == OP_NOT_VSPACE;
2682 ph10 180 default:
2683 ph10 528 return op_code != OP_NOT_VSPACE;
2684 ph10 180 }
2685    
2686 ph10 532 #ifdef SUPPORT_UCP
2687     case OP_PROP:
2688     return check_char_prop(next, previous[0], previous[1], FALSE);
2689 ph10 535
2690 ph10 532 case OP_NOTPROP:
2691     return check_char_prop(next, previous[0], previous[1], TRUE);
2692     #endif
2693    
2694 nigel 93 default:
2695     return FALSE;
2696     }
2697    
2698    
2699 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2700     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2701     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2702     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2703 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2704 nigel 93
2705     switch(op_code)
2706     {
2707     case OP_CHAR:
2708     case OP_CHARNC:
2709 ph10 535 #ifdef SUPPORT_UTF8
2710 ph10 532 GETCHARTEST(c, previous);
2711     #else
2712     c = *previous;
2713 ph10 535 #endif
2714 nigel 93 switch(-next)
2715     {
2716     case ESC_d:
2717 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2718 nigel 93
2719     case ESC_D:
2720 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2721 nigel 93
2722     case ESC_s:
2723 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2724 nigel 93
2725     case ESC_S:
2726 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2727 nigel 93
2728     case ESC_w:
2729 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2730 nigel 93
2731     case ESC_W:
2732 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2733 ph10 182
2734 ph10 180 case ESC_h:
2735     case ESC_H:
2736 ph10 532 switch(c)
2737 ph10 180 {
2738     case 0x09:
2739     case 0x20:
2740     case 0xa0:
2741     case 0x1680:
2742     case 0x180e:
2743     case 0x2000:
2744     case 0x2001:
2745     case 0x2002:
2746     case 0x2003:
2747     case 0x2004:
2748     case 0x2005:
2749     case 0x2006:
2750     case 0x2007:
2751     case 0x2008:
2752     case 0x2009:
2753     case 0x200A:
2754     case 0x202f:
2755     case 0x205f:
2756     case 0x3000:
2757     return -next != ESC_h;
2758     default:
2759     return -next == ESC_h;
2760 ph10 182 }
2761    
2762 ph10 180 case ESC_v:
2763     case ESC_V:
2764 ph10 532 switch(c)
2765 ph10 180 {
2766     case 0x0a:
2767     case 0x0b:
2768     case 0x0c:
2769     case 0x0d:
2770     case 0x85:
2771     case 0x2028:
2772     case 0x2029:
2773     return -next != ESC_v;
2774     default:
2775     return -next == ESC_v;
2776 ph10 182 }
2777 ph10 535
2778     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2779     their substitutions and process them. The result will always be either
2780 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2781 ph10 535
2782 ph10 532 #ifdef SUPPORT_UCP
2783     case ESC_du:
2784     case ESC_DU:
2785     case ESC_wu:
2786     case ESC_WU:
2787     case ESC_su:
2788     case ESC_SU:
2789     {
2790     int temperrorcode = 0;
2791     ptr = substitutes[-next - ESC_DU];
2792     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2793     if (temperrorcode != 0) return FALSE;
2794     ptr++; /* For compatibility */
2795     }
2796 ph10 535 /* Fall through */
2797 nigel 93
2798 ph10 532 case ESC_p:
2799     case ESC_P:
2800     {
2801     int ptype, pdata, errorcodeptr;
2802 ph10 535 BOOL negated;
2803    
2804 ph10 532 ptr--; /* Make ptr point at the p or P */
2805     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2806     if (ptype < 0) return FALSE;
2807     ptr++; /* Point past the final curly ket */
2808 ph10 535
2809 ph10 532 /* If the property item is optional, we have to give up. (When generated
2810     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2811     to the original \d etc. At this point, ptr will point to a zero byte. */
2812 ph10 535
2813 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2814     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2815     return FALSE;
2816 ph10 535
2817 ph10 532 /* Do the property check. */
2818 ph10 535
2819 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2820 ph10 535 }
2821 ph10 532 #endif
2822    
2823 nigel 93 default:
2824     return FALSE;
2825     }
2826    
2827 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2828     well. It means re-organizing the above code so as to get hold of the property
2829     values before switching on the op-code. However, I wonder how many patterns
2830     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2831     these op-codes are never generated.) */
2832    
2833 nigel 93 case OP_DIGIT:
2834 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2835 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2836 nigel 93
2837     case OP_NOT_DIGIT:
2838     return next == -ESC_d;
2839    
2840     case OP_WHITESPACE:
2841 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2842 nigel 93
2843     case OP_NOT_WHITESPACE:
2844 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2845 nigel 93
2846 ph10 180 case OP_HSPACE:
2847 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2848 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2849 ph10 180
2850     case OP_NOT_HSPACE:
2851     return next == -ESC_h;
2852 ph10 182
2853 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2854 ph10 535 case OP_ANYNL:
2855 ph10 182 case OP_VSPACE:
2856 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2857    
2858     case OP_NOT_VSPACE:
2859 ph10 528 return next == -ESC_v || next == -ESC_R;
2860 ph10 180
2861 nigel 93 case OP_WORDCHAR:
2862 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2863 ph10 528 next == -ESC_v || next == -ESC_R;
2864 nigel 93
2865     case OP_NOT_WORDCHAR:
2866     return next == -ESC_w || next == -ESC_d;
2867 ph10 182
2868 nigel 93 default:
2869     return FALSE;
2870     }
2871    
2872     /* Control does not reach here */
2873     }
2874    
2875    
2876    
2877     /*************************************************
2878 nigel 77 * Compile one branch *
2879     *************************************************/
2880    
2881 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2882 nigel 77 changed during the branch, the pointer is used to change the external options
2883 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2884     to find out the amount of memory needed, as well as during the real compile
2885     phase. The value of lengthptr distinguishes the two phases.
2886 nigel 77
2887     Arguments:
2888     optionsptr pointer to the option bits
2889     codeptr points to the pointer to the current code point
2890     ptrptr points to the current pattern pointer
2891     errorcodeptr points to error code variable
2892     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2893     reqbyteptr set to the last literal character required, else < 0
2894     bcptr points to current branch chain
2895     cd contains pointers to tables etc.
2896 nigel 93 lengthptr NULL during the real compile phase
2897     points to length accumulator during pre-compile phase
2898 nigel 77
2899     Returns: TRUE on success
2900     FALSE, with *errorcodeptr set non-zero on error
2901     */
2902    
2903     static BOOL
2904 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2905     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2906     compile_data *cd, int *lengthptr)
2907 nigel 77 {
2908     int repeat_type, op_type;
2909     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2910     int bravalue = 0;
2911     int greedy_default, greedy_non_default;
2912     int firstbyte, reqbyte;
2913     int zeroreqbyte, zerofirstbyte;
2914     int req_caseopt, reqvary, tempreqvary;
2915     int options = *optionsptr;
2916     int after_manual_callout = 0;
2917 nigel 93 int length_prevgroup = 0;
2918 nigel 77 register int c;
2919     register uschar *code = *codeptr;
2920 nigel 93 uschar *last_code = code;
2921     uschar *orig_code = code;
2922 nigel 77 uschar *tempcode;
2923     BOOL inescq = FALSE;
2924     BOOL groupsetfirstbyte = FALSE;
2925     const uschar *ptr = *ptrptr;
2926     const uschar *tempptr;
2927 ph10 518 const uschar *nestptr = NULL;
2928 nigel 77 uschar *previous = NULL;
2929     uschar *previous_callout = NULL;
2930 nigel 93 uschar *save_hwm = NULL;
2931 nigel 77 uschar classbits[32];
2932    
2933     #ifdef SUPPORT_UTF8
2934     BOOL class_utf8;
2935     BOOL utf8 = (options & PCRE_UTF8) != 0;
2936     uschar *class_utf8data;
2937 ph10 300 uschar *class_utf8data_base;
2938 nigel 77 uschar utf8_char[6];
2939     #else
2940     BOOL utf8 = FALSE;
2941 nigel 93 uschar *utf8_char = NULL;
2942 nigel 77 #endif
2943    
2944 ph10 475 #ifdef PCRE_DEBUG
2945 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2946     #endif
2947    
2948 nigel 77 /* Set up the default and non-default settings for greediness */
2949    
2950     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2951     greedy_non_default = greedy_default ^ 1;
2952    
2953     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2954     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2955     matches a non-fixed char first char; reqbyte just remains unset if we never
2956     find one.
2957    
2958     When we hit a repeat whose minimum is zero, we may have to adjust these values
2959     to take the zero repeat into account. This is implemented by setting them to
2960     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2961     item types that can be repeated set these backoff variables appropriately. */
2962    
2963     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2964    
2965     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2966     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2967     value > 255. It is added into the firstbyte or reqbyte variables to record the
2968     case status of the value. This is used only for ASCII characters. */
2969    
2970     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2971    
2972     /* Switch on next character until the end of the branch */
2973    
2974     for (;; ptr++)
2975     {
2976     BOOL negate_class;
2977 ph10 286 BOOL should_flip_negation;
2978 nigel 77 BOOL possessive_quantifier;
2979     BOOL is_quantifier;
2980 nigel 93 BOOL is_recurse;
2981 ph10 180 BOOL reset_bracount;
2982 nigel 77 int class_charcount;
2983     int class_lastchar;
2984     int newoptions;
2985     int recno;
2986 ph10 172 int refsign;
2987 nigel 77 int skipbytes;
2988     int subreqbyte;
2989     int subfirstbyte;
2990 nigel 93 int terminator;
2991 nigel 77 int mclength;
2992     uschar mcbuffer[8];
2993    
2994 nigel 93 /* Get next byte in the pattern */
2995 nigel 77
2996     c = *ptr;
2997 ph10 345
2998 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
2999 ph10 518 string. Nesting only happens one level deep. */
3000    
3001     if (c == 0 && nestptr != NULL)
3002     {
3003     ptr = nestptr;
3004     nestptr = NULL;
3005     c = *ptr;
3006     }
3007    
3008 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3009     previous cycle of this loop. */
3010    
3011     if (lengthptr != NULL)
3012     {
3013 ph10 475 #ifdef PCRE_DEBUG
3014 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3015     #endif
3016 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3017 nigel 93 {
3018     *errorcodeptr = ERR52;
3019     goto FAILED;
3020     }
3021    
3022     /* There is at least one situation where code goes backwards: this is the
3023     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3024     the class is simply eliminated. However, it is created first, so we have to
3025     allow memory for it. Therefore, don't ever reduce the length at this point.
3026     */
3027    
3028     if (code < last_code) code = last_code;
3029 ph10 202
3030     /* Paranoid check for integer overflow */
3031    
3032     if (OFLOW_MAX - *lengthptr < code - last_code)
3033     {
3034     *errorcodeptr = ERR20;
3035     goto FAILED;
3036     }
3037    
3038 ph10 530 *lengthptr += (int)(code - last_code);
3039 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3040    
3041     /* If "previous" is set and it is not at the start of the work space, move
3042     it back to there, in order to avoid filling up the work space. Otherwise,
3043     if "previous" is NULL, reset the current code pointer to the start. */
3044    
3045     if (previous != NULL)
3046     {
3047     if (previous > orig_code)
3048     {
3049     memmove(orig_code, previous, code - previous);
3050     code -= previous - orig_code;
3051     previous = orig_code;
3052     }
3053     }
3054     else code = orig_code;
3055    
3056     /* Remember where this code item starts so we can pick up the length
3057     next time round. */
3058    
3059     last_code = code;
3060     }
3061    
3062     /* In the real compile phase, just check the workspace used by the forward
3063     reference list. */
3064    
3065 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3066 nigel 93 {
3067     *errorcodeptr = ERR52;
3068     goto FAILED;
3069     }
3070    
3071 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3072    
3073     if (inescq && c != 0)
3074     {
3075 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3076 nigel 77 {
3077     inescq = FALSE;
3078     ptr++;
3079     continue;
3080     }
3081     else
3082     {
3083     if (previous_callout != NULL)
3084     {
3085 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3086     complete_callout(previous_callout, ptr, cd);
3087 nigel 77 previous_callout = NULL;
3088     }
3089     if ((options & PCRE_AUTO_CALLOUT) != 0)
3090     {
3091     previous_callout = code;
3092     code = auto_callout(code, ptr, cd);
3093     }
3094     goto NORMAL_CHAR;
3095     }
3096     }
3097    
3098     /* Fill in length of a previous callout, except when the next thing is
3099     a quantifier. */
3100    
3101 ph10 392 is_quantifier =
3102 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3103     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3104 nigel 77
3105     if (!is_quantifier && previous_callout != NULL &&
3106     after_manual_callout-- <= 0)
3107     {
3108 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3109     complete_callout(previous_callout, ptr, cd);
3110 nigel 77 previous_callout = NULL;
3111     }
3112    
3113     /* In extended mode, skip white space and comments */
3114    
3115     if ((options & PCRE_EXTENDED) != 0)
3116     {
3117     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3118 ph10 391 if (c == CHAR_NUMBER_SIGN)
3119 nigel 77 {
3120 nigel 93 while (*(++ptr) != 0)
3121 nigel 91 {
3122 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3123 nigel 91 }
3124 nigel 93 if (*ptr != 0) continue;
3125    
3126 nigel 91 /* Else fall through to handle end of string */
3127     c = 0;
3128 nigel 77 }
3129     }
3130    
3131     /* No auto callout for quantifiers. */
3132    
3133     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3134     {
3135     previous_callout = code;
3136     code = auto_callout(code, ptr, cd);
3137     }
3138    
3139     switch(c)
3140     {
3141 nigel 93 /* ===================================================================*/
3142     case 0: /* The branch terminates at string end */
3143 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3144     case CHAR_RIGHT_PARENTHESIS:
3145 nigel 77 *firstbyteptr = firstbyte;
3146     *reqbyteptr = reqbyte;
3147     *codeptr = code;
3148     *ptrptr = ptr;
3149 nigel 93 if (lengthptr != NULL)
3150     {
3151 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3152     {
3153     *errorcodeptr = ERR20;
3154     goto FAILED;
3155     }
3156 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3157 nigel 93 DPRINTF((">> end branch\n"));
3158     }
3159 nigel 77 return TRUE;
3160    
3161 nigel 93
3162     /* ===================================================================*/
3163 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3164     the setting of any following char as a first character. */
3165    
3166 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3167 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3168     {
3169     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3170     }
3171     previous = NULL;
3172     *code++ = OP_CIRC;
3173     break;
3174    
3175 ph10 391 case CHAR_DOLLAR_SIGN:
3176 nigel 77 previous = NULL;
3177     *code++ = OP_DOLL;
3178     break;
3179    
3180     /* There can never be a first char if '.' is first, whatever happens about
3181     repeats. The value of reqbyte doesn't change either. */
3182    
3183 ph10 391 case CHAR_DOT:
3184 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3185     zerofirstbyte = firstbyte;
3186     zeroreqbyte = reqbyte;
3187     previous = code;
3188 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3189 nigel 77 break;
3190    
3191 nigel 93
3192     /* ===================================================================*/
3193 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3194     32-byte bitmap of the permitted characters, except in the special case
3195     where there is only one such character. For negated classes, we build the
3196     map as usual, then invert it at the end. However, we use a different opcode
3197     so that data characters > 255 can be handled correctly.
3198 nigel 77
3199     If the class contains characters outside the 0-255 range, a different
3200     opcode is compiled. It may optionally have a bit map for characters < 256,
3201     but those above are are explicitly listed afterwards. A flag byte tells
3202     whether the bitmap is present, and whether this is a negated class or not.
3203 ph10 345
3204 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3205     default (Perl) mode, it is treated as a data character. */
3206 ph10 345
3207 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3208 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3209     {
3210     *errorcodeptr = ERR64;
3211 ph10 345 goto FAILED;
3212 ph10 336 }
3213 ph10 345 goto NORMAL_CHAR;
3214 nigel 77
3215 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3216 nigel 77 previous = code;
3217    
3218     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3219     they are encountered at the top level, so we'll do that too. */
3220    
3221 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3222 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3223 ph10 295 check_posix_syntax(ptr, &tempptr))
3224 nigel 77 {
3225 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3226 nigel 77 goto FAILED;
3227     }
3228    
3229 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3230 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3231 ph10 205 skip them too. This makes for compatibility with Perl. */
3232 ph10 208
3233 ph10 205 negate_class = FALSE;
3234     for (;;)
3235 nigel 77 {
3236     c = *(++ptr);
3237 ph10 391 if (c == CHAR_BACKSLASH)
3238 ph10 205 {
3239 ph10 392 if (ptr[1] == CHAR_E)
3240 ph10 391 ptr++;
3241 ph10 392 else if (strncmp((const char *)ptr+1,
3242     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3243 ph10 391 ptr += 3;
3244 ph10 392 else
3245 ph10 391 break;
3246 ph10 205 }
3247 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3248 ph10 205 negate_class = TRUE;
3249     else break;
3250 ph10 208 }
3251 ph10 345
3252     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3253     an initial ']' is taken as a data character -- the code below handles
3254 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3255     [^] must match any character, so generate OP_ALLANY. */
3256 ph10 345
3257 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3258 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3259 ph10 341 {
3260     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3261     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3262     zerofirstbyte = firstbyte;
3263     break;
3264 ph10 345 }
3265 nigel 77
3266 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3267     negation flag at the end, so that support for characters > 255 works
3268 ph10 264 correctly (they are all included in the class). */
3269    
3270     should_flip_negation = FALSE;
3271    
3272 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3273 nigel 93 of just a single character (as long as it's < 256). However, For higher
3274     valued UTF-8 characters, we don't yet do any optimization. */
3275 nigel 77
3276     class_charcount = 0;
3277     class_lastchar = -1;
3278    
3279 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3280     temporary bit of memory, in case the class contains only 1 character (less
3281     than 256), because in that case the compiled code doesn't use the bit map.
3282     */
3283    
3284     memset(classbits, 0, 32 * sizeof(uschar));
3285    
3286 nigel 77 #ifdef SUPPORT_UTF8
3287     class_utf8 = FALSE; /* No chars >= 256 */
3288 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3289 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3290 nigel 77 #endif
3291    
3292     /* Process characters until ] is reached. By writing this as a "do" it
3293 nigel 93 means that an initial ] is taken as a data character. At the start of the
3294     loop, c contains the first byte of the character. */
3295 nigel 77
3296 nigel 93 if (c != 0) do
3297 nigel 77 {
3298 nigel 93 const uschar *oldptr;
3299    
3300 nigel 77 #ifdef SUPPORT_UTF8
3301     if (utf8 && c > 127)
3302     { /* Braces are required because the */
3303     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3304     }
3305 ph10 535
3306 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3307 ph10 309 data and reset the pointer. This is so that very large classes that
3308 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3309 ph10 309 (which is on the stack). */
3310    
3311 ph10 300 if (lengthptr != NULL)
3312     {
3313     *lengthptr += class_utf8data - class_utf8data_base;
3314 ph10 309 class_utf8data = class_utf8data_base;
3315     }
3316    
3317 nigel 77 #endif
3318    
3319     /* Inside \Q...\E everything is literal except \E */
3320    
3321     if (inescq)
3322     {
3323 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3324 nigel 77 {
3325 nigel 93 inescq = FALSE; /* Reset literal state */
3326     ptr++; /* Skip the 'E' */
3327     continue; /* Carry on with next */
3328 nigel 77 }
3329 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3330 nigel 77 }
3331    
3332     /* Handle POSIX class names. Perl allows a negation extension of the
3333     form [:^name:]. A square bracket that doesn't match the syntax is
3334     treated as a literal. We also recognize the POSIX constructions
3335     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3336     5.6 and 5.8 do. */
3337    
3338 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3339 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3340 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3341 nigel 77 {
3342     BOOL local_negate = FALSE;
3343 nigel 87 int posix_class, taboffset, tabopt;
3344 nigel 77 register const uschar *cbits = cd->cbits;
3345 nigel 87 uschar pbits[32];
3346 nigel 77
3347 ph10 391 if (ptr[1] != CHAR_COLON)
3348 nigel 77 {
3349     *errorcodeptr = ERR31;
3350     goto FAILED;
3351     }
3352    
3353     ptr += 2;
3354 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3355 nigel 77 {
3356     local_negate = TRUE;
3357 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3358 nigel 77 ptr++;
3359     }
3360    
3361 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3362 nigel 77 if (posix_class < 0)
3363     {
3364     *errorcodeptr = ERR30;
3365     goto FAILED;
3366     }
3367    
3368     /* If matching is caseless, upper and lower are converted to
3369     alpha. This relies on the fact that the class table starts with
3370     alpha, lower, upper as the first 3 entries. */
3371    
3372     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3373     posix_class = 0;
3374 ph10 535
3375     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3376 ph10 518 different escape sequences that use Unicode properties. */
3377 ph10 535
3378 ph10 518 #ifdef SUPPORT_UCP
3379     if ((options & PCRE_UCP) != 0)
3380     {
3381     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3382     if (posix_substitutes[pc] != NULL)
3383     {
3384 ph10 535 nestptr = tempptr + 1;
3385 ph10 518 ptr = posix_substitutes[pc] - 1;
3386 ph10 535 continue;
3387     }
3388     }
3389     #endif
3390 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3391     chunk of local store because we may be adding and subtracting from it,
3392     and we don't want to subtract bits that may be in the main map already.
3393     At the end we or the result into the bit map that is being built. */
3394 nigel 77
3395     posix_class *= 3;
3396 nigel 87
3397     /* Copy in the first table (always present) */
3398    
3399     memcpy(pbits, cbits + posix_class_maps[posix_class],
3400     32 * sizeof(uschar));
3401    
3402     /* If there is a second table, add or remove it as required. */
3403    
3404     taboffset = posix_class_maps[posix_class + 1];
3405     tabopt = posix_class_maps[posix_class + 2];
3406    
3407     if (taboffset >= 0)
3408 nigel 77 {
3409 nigel 87 if (tabopt >= 0)
3410     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3411 nigel 77 else
3412 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3413 nigel 77 }
3414    
3415 nigel 87 /* Not see if we need to remove any special characters. An option
3416     value of 1 removes vertical space and 2 removes underscore. */
3417    
3418     if (tabopt < 0) tabopt = -tabopt;
3419     if (tabopt == 1) pbits[1] &= ~0x3c;
3420     else if (tabopt == 2) pbits[11] &= 0x7f;
3421    
3422     /* Add the POSIX table or its complement into the main table that is
3423     being built and we are done. */
3424    
3425     if (local_negate)
3426     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3427     else
3428     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3429    
3430 nigel 77 ptr = tempptr + 1;
3431     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3432     continue; /* End of POSIX syntax handling */
3433     }
3434    
3435     /* Backslash may introduce a single character, or it may introduce one
3436 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3437 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3438     assume that other escapes have more than one character in them, so set
3439     class_charcount bigger than one. Unrecognized escapes fall through and
3440     are either treated as literal characters (by default), or are faulted if
3441     PCRE_EXTRA is set. */
3442 nigel 77
3443 ph10 391 if (c == CHAR_BACKSLASH)
3444 nigel 77 {
3445 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3446     if (*errorcodeptr != 0) goto FAILED;
3447 nigel 77
3448 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3449 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3450     {
3451 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3452 nigel 77 {
3453     ptr += 2; /* avoid empty string */
3454     }
3455     else inescq = TRUE;
3456     continue;
3457     }
3458 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3459 nigel 77
3460     if (c < 0)
3461     {
3462     register const uschar *cbits = cd->cbits;
3463     class_charcount += 2; /* Greater than 1 is what matters */
3464 nigel 93
3465 ph10 518 switch (-c)
3466 nigel 77 {
3467 ph10 518 #ifdef SUPPORT_UCP
3468     case ESC_du: /* These are the values given for \d etc */
3469     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3470     case ESC_wu: /* escape sequence with an appropriate \p */
3471     case ESC_WU: /* or \P to test Unicode properties instead */
3472     case ESC_su: /* of the default ASCII testing. */
3473     case ESC_SU:
3474     nestptr = ptr;
3475     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3476 ph10 535 class_charcount -= 2; /* Undo! */
3477 ph10 518 continue;
3478     #endif
3479 nigel 77 case ESC_d:
3480     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3481     continue;
3482    
3483     case ESC_D:
3484 ph10 286 should_flip_negation = TRUE;
3485 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3486     continue;
3487    
3488     case ESC_w:
3489     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3490     continue;
3491    
3492     case ESC_W:
3493 ph10 286 should_flip_negation = TRUE;
3494 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3495     continue;
3496    
3497     case ESC_s:
3498     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3499     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3500     continue;
3501    
3502     case ESC_S:
3503 ph10 286 should_flip_negation = TRUE;
3504 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3505     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3506     continue;
3507    
3508 ph10 518 case ESC_h:
3509 ph10 178 SETBIT(classbits, 0x09); /* HT */
3510     SETBIT(classbits, 0x20); /* SPACE */
3511 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3512 ph10 178 #ifdef SUPPORT_UTF8
3513     if (utf8)
3514 ph10 180 {
3515 ph10 178 class_utf8 = TRUE;
3516     *class_utf8data++ = XCL_SINGLE;
3517 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3518 ph10 178 *class_utf8data++ = XCL_SINGLE;
3519 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3520     *class_utf8data++ = XCL_RANGE;
3521     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3522     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3523 ph10 178 *class_utf8data++ = XCL_SINGLE;
3524 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3525 ph10 178 *class_utf8data++ = XCL_SINGLE;
3526 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3527 ph10 178 *class_utf8data++ = XCL_SINGLE;
3528 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3529     }
3530     #endif
3531     continue;
3532 nigel 93
3533 ph10 518 case ESC_H:
3534 ph10 178 for (c = 0; c < 32; c++)
3535     {
3536     int x = 0xff;
3537     switch (c)
3538 ph10 180 {
3539 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3540     case 0x20/8: x ^= 1 << (0x20%8); break;
3541     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3542     default: break;
3543     }
3544     classbits[c] |= x;
3545 ph10 180 }
3546    
3547 ph10 178 #ifdef SUPPORT_UTF8
3548     if (utf8)
3549 ph10 180 {
3550 ph10 178 class_utf8 = TRUE;
3551 ph10 180 *class_utf8data++ = XCL_RANGE;
3552     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3553     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3554     *class_utf8data++ = XCL_RANGE;
3555     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3556     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3557     *class_utf8data++ = XCL_RANGE;
3558     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3559     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3560     *class_utf8data++ = XCL_RANGE;
3561     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3562     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3563     *class_utf8data++ = XCL_RANGE;
3564     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3565     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3566     *class_utf8data++ = XCL_RANGE;
3567     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3568     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3569     *class_utf8data++ = XCL_RANGE;
3570     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3571     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3572     }
3573     #endif
3574     continue;
3575 ph10 178
3576 ph10 518 case ESC_v:
3577 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3578     SETBIT(classbits, 0x0b); /* VT */
3579 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3580     SETBIT(classbits, 0x0d); /* CR */
3581     SETBIT(classbits, 0x85); /* NEL */
3582 ph10 178 #ifdef SUPPORT_UTF8
3583     if (utf8)
3584 ph10 180 {
3585 ph10 178 class_utf8 = TRUE;
3586 ph10 180 *class_utf8data++ = XCL_RANGE;
3587     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3588     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3589     }
3590     #endif
3591     continue;
3592 ph10 178
3593 ph10 518 case ESC_V:
3594 ph10 178 for (c = 0; c < 32; c++)
3595     {
3596     int x = 0xff;
3597     switch (c)
3598 ph10 180 {
3599 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3600     x ^= 1 << (0x0b%8);
3601     x ^= 1 << (0x0c%8);
3602 ph10 180 x ^= 1 << (0x0d%8);
3603 ph10 178 break;
3604     case 0x85/8: x ^= 1 << (0x85%8); break;
3605     default: break;
3606     }
3607     classbits[c] |= x;
3608 ph10 180 }
3609    
3610 ph10 178 #ifdef SUPPORT_UTF8
3611     if (utf8)
3612 ph10 180 {
3613 ph10 178 class_utf8 = TRUE;
3614 ph10 180 *class_utf8data++ = XCL_RANGE;
3615     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3616     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3617     *class_utf8data++ = XCL_RANGE;
3618     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3619     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3620     }
3621     #endif
3622     continue;
3623 ph10 178
3624 nigel 77 #ifdef SUPPORT_UCP
3625 ph10 518 case ESC_p:
3626     case ESC_P:
3627     {
3628     BOOL negated;
3629     int pdata;
3630     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3631     if (ptype < 0) goto FAILED;
3632     class_utf8 = TRUE;
3633     *class_utf8data++ = ((-c == ESC_p) != negated)?
3634     XCL_PROP : XCL_NOTPROP;
3635     *class_utf8data++ = ptype;
3636     *class_utf8data++ = pdata;
3637     class_charcount -= 2; /* Not a < 256 character */
3638     continue;
3639     }
3640 nigel 77 #endif
3641 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3642     strict mode. By default, for compatibility with Perl, they are
3643     treated as literals. */
3644 nigel 77
3645 ph10 518 default:
3646     if ((options & PCRE_EXTRA) != 0)
3647     {
3648     *errorcodeptr = ERR7;
3649     goto FAILED;
3650     }
3651     class_charcount -= 2; /* Undo the default count from above */
3652     c = *ptr; /* Get the final character and fall through */
3653     break;
3654 nigel 93 }
3655 nigel 77 }
3656    
3657     /* Fall through if we have a single character (c >= 0). This may be
3658 nigel 93 greater than 256 in UTF-8 mode. */
3659 nigel 77
3660     } /* End of backslash handling */
3661    
3662     /* A single character may be followed by '-' to form a range. However,
3663     Perl does not permit ']' to be the end of the range. A '-' character
3664 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3665     entirely. The code for handling \Q and \E is messy. */
3666 nigel 77
3667 nigel 93 CHECK_RANGE:
3668 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3669 nigel 77 {
3670 nigel 93 inescq = FALSE;
3671     ptr += 2;
3672     }
3673    
3674     oldptr = ptr;
3675 ph10 231
3676 ph10 230 /* Remember \r or \n */
3677 ph10 231
3678 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3679 ph10 231
3680 ph10 230 /* Check for range */
3681 nigel 93
3682 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3683 nigel 93 {
3684 nigel 77 int d;
3685     ptr += 2;
3686 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3687 nigel 77
3688 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3689     mode. */
3690    
3691 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3692 nigel 93 {
3693     ptr += 2;
3694 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3695 ph10 391 { ptr += 2; continue; }
3696 nigel 93 inescq = TRUE;
3697     break;
3698     }
3699    
3700 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3701 nigel 93 {
3702     ptr = oldptr;
3703     goto LONE_SINGLE_CHARACTER;
3704     }
3705    
3706 nigel 77 #ifdef SUPPORT_UTF8
3707     if (utf8)
3708     { /* Braces are required because the */
3709     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3710     }
3711     else
3712     #endif
3713     d = *ptr; /* Not UTF-8 mode */
3714    
3715     /* The second part of a range can be a single-character escape, but
3716     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3717     in such circumstances. */
3718    
3719 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3720 nigel 77 {
3721 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3722     if (*errorcodeptr != 0) goto FAILED;
3723 nigel 77
3724 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3725 nigel 77
3726     if (d < 0)
3727     {
3728 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3729 nigel 77 {
3730 nigel 93 ptr = oldptr;
3731 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3732     }
3733     }
3734     }
3735    
3736 nigel 93 /* Check that the two values are in the correct order. Optimize
3737     one-character ranges */
3738 nigel 77
3739 nigel 93 if (d < c)
3740     {
3741     *errorcodeptr = ERR8;
3742     goto FAILED;
3743     }
3744    
3745 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3746    
3747 ph10 230 /* Remember \r or \n */
3748 ph10 231
3749 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3750 ph10 231
3751 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3752     matching, we have to use an XCLASS with extra data items. Caseless
3753     matching for characters > 127 is available only if UCP support is
3754     available. */
3755    
3756     #ifdef SUPPORT_UTF8
3757     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3758     {
3759     class_utf8 = TRUE;
3760    
3761     /* With UCP support, we can find the other case equivalents of
3762     the relevant characters. There may be several ranges. Optimize how
3763     they fit with the basic range. */
3764    
3765     #ifdef SUPPORT_UCP
3766     if ((options & PCRE_CASELESS) != 0)
3767     {
3768 nigel 93 unsigned int occ, ocd;
3769     unsigned int cc = c;
3770     unsigned int origd = d;
3771 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3772     {
3773 ph10 180 if (occ >= (unsigned int)c &&
3774     ocd <= (unsigned int)d)
3775 ph10 176 continue; /* Skip embedded ranges */
3776 nigel 77
3777 ph10 180 if (occ < (unsigned int)c &&
3778 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3779 nigel 77 { /* if there is overlap, */
3780     c = occ; /* noting that if occ < c */
3781     continue; /* we can't have ocd > d */
3782     } /* because a subrange is */
3783 ph10 180 if (ocd > (unsigned int)d &&
3784 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3785 nigel 77 { /* the basic range. */
3786     d = ocd;
3787     continue;
3788     }
3789    
3790     if (occ == ocd)
3791     {
3792     *class_utf8data++ = XCL_SINGLE;
3793     }
3794     else
3795     {
3796     *class_utf8data++ = XCL_RANGE;
3797     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3798     }
3799     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3800     }
3801     }
3802     #endif /* SUPPORT_UCP */
3803    
3804     /* Now record the original range, possibly modified for UCP caseless
3805     overlapping ranges. */
3806    
3807     *class_utf8data++ = XCL_RANGE;
3808     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3809     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3810    
3811     /* With UCP support, we are done. Without UCP support, there is no
3812     caseless matching for UTF-8 characters > 127; we can use the bit map
3813     for the smaller ones. */
3814    
3815     #ifdef SUPPORT_UCP
3816     continue; /* With next character in the class */
3817     #else
3818     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3819    
3820     /* Adjust upper limit and fall through to set up the map */
3821    
3822     d = 127;
3823    
3824     #endif /* SUPPORT_UCP */
3825     }
3826     #endif /* SUPPORT_UTF8 */
3827    
3828     /* We use the bit map for all cases when not in UTF-8 mode; else
3829     ranges that lie entirely within 0-127 when there is UCP support; else
3830     for partial ranges without UCP support. */
3831    
3832 nigel 93 class_charcount += d - c + 1;
3833     class_lastchar = d;
3834    
3835     /* We can save a bit of time by skipping this in the pre-compile. */
3836    
3837     if (lengthptr == NULL) for (; c <= d; c++)
3838 nigel 77 {
3839     classbits[c/8] |= (1 << (c&7));
3840     if ((options & PCRE_CASELESS) != 0)
3841     {
3842     int uc = cd->fcc[c]; /* flip case */
3843     classbits[uc/8] |= (1 << (uc&7));
3844     }
3845     }
3846    
3847     continue; /* Go get the next char in the class */
3848     }
3849    
3850     /* Handle a lone single character - we can get here for a normal
3851     non-escape char, or after \ that introduces a single character or for an
3852     apparent range that isn't. */
3853    
3854     LONE_SINGLE_CHARACTER:
3855 ph10 231
3856 nigel 77 /* Handle a character that cannot go in the bit map */
3857    
3858     #ifdef SUPPORT_UTF8
3859     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3860     {
3861     class_utf8 = TRUE;
3862     *class_utf8data++ = XCL_SINGLE;
3863     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3864    
3865     #ifdef SUPPORT_UCP
3866     if ((options & PCRE_CASELESS) != 0)
3867     {
3868 nigel 93 unsigned int othercase;
3869 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3870 nigel 77 {
3871     *class_utf8data++ = XCL_SINGLE;
3872     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3873     }
3874     }
3875     #endif /* SUPPORT_UCP */
3876    
3877     }
3878     else
3879     #endif /* SUPPORT_UTF8 */
3880    
3881     /* Handle a single-byte character */
3882     {
3883     classbits[c/8] |= (1 << (c&7));
3884     if ((options & PCRE_CASELESS) != 0)
3885     {
3886     c = cd->fcc[c]; /* flip case */
3887     classbits[c/8] |= (1 << (c&7));
3888     }
3889     class_charcount++;
3890     class_lastchar = c;
3891     }
3892     }
3893    
3894 ph10 518 /* Loop until ']' reached. This "while" is the end of the "do" far above.
3895     If we are at the end of an internal nested string, revert to the outer
3896     string. */
3897 nigel 77
3898 ph10 518 while (((c = *(++ptr)) != 0 ||
3899     (nestptr != NULL &&
3900     (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3901     (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3902 nigel 77
3903 ph10 518 /* Check for missing terminating ']' */
3904    
3905     if (c == 0)
3906 nigel 93 {
3907     *errorcodeptr = ERR6;
3908     goto FAILED;
3909     }
3910 ph10 231
3911 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3912 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3913     use of \p or \P, in other words, no use of any XCLASS features, we can
3914     optimize.
3915    
3916 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3917     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3918     operate on single-bytes only. This is an historical hangover. Maybe one day
3919     we can tidy these opcodes to handle multi-byte characters.
3920 nigel 77
3921     The optimization throws away the bit map. We turn the item into a
3922     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3923     that OP_NOT does not support multibyte characters. In the positive case, it
3924     can cause firstbyte to be set. Otherwise, there can be no first char if
3925     this item is first, whatever repeat count may follow. In the case of
3926     reqbyte, save the previous value for reinstating. */
3927 ph10 535
3928 nigel 77 #ifdef SUPPORT_UTF8
3929 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3930 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3931 nigel 77 #else
3932     if (class_charcount == 1)
3933     #endif
3934     {
3935     zeroreqbyte = reqbyte;
3936    
3937     /* The OP_NOT opcode works on one-byte characters only. */
3938    
3939     if (negate_class)
3940     {
3941     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3942     zerofirstbyte = firstbyte;
3943     *code++ = OP_NOT;
3944     *code++ = class_lastchar;
3945     break;
3946     }
3947    
3948     /* For a single, positive character, get the value into mcbuffer, and
3949     then we can handle this with the normal one-character code. */
3950    
3951     #ifdef SUPPORT_UTF8
3952     if (utf8 && class_lastchar > 127)