/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 550 - (hide annotations) (download)
Sun Oct 10 16:24:11 2010 UTC (2 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 233481 byte(s)
Fix problem with (*THEN) not backing up far enough.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps.

Arguments:
  a    the bitmap, an array of bytes with 8 bits per element
  b    the number of the bit to set (bit 0 is the least significant bit of a[0])

Both arguments and the whole expansion are parenthesized so that expression
arguments such as SETBIT(map, base + 3) select the intended byte and bit. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
/* check_escape() indexes this table with (c - CHAR_0) for characters between
CHAR_0 and CHAR_z. Entries are listed two per line in character order; the
trailing comments name the two characters on each digit/letter row, taking the
CHAR_ constants to be in ASCII order as the comment above says. A zero entry
means the character needs further processing or is an invalid escape. */
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0, /* 0 1 */
113     0, 0, /* 2 3 */
114 ph10 392 0, 0, /* 4 5 */
115     0, 0, /* 6 7 */
116     0, 0, /* 8 9 */
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
121     -ESC_B, -ESC_C, /* B C */
122     -ESC_D, -ESC_E, /* D E */
123     0, -ESC_G, /* F G */
124     -ESC_H, 0, /* H I */
125     0, -ESC_K, /* J K */
126 ph10 391 0, 0, /* L M */
127 ph10 514 -ESC_N, 0, /* N O */
128 ph10 391 -ESC_P, -ESC_Q, /* P Q */
129     -ESC_R, -ESC_S, /* R S */
130 ph10 392 0, 0, /* T U */
131     -ESC_V, -ESC_W, /* V W */
132     -ESC_X, 0, /* X Y */
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7, /* ` a (the data value 7) */
137 ph10 392 -ESC_b, 0, /* b c */
138     -ESC_d, ESC_e, /* d e */
139 ph10 391 ESC_f, 0, /* f g */
140     -ESC_h, 0, /* h i */
141 ph10 392 0, -ESC_k, /* j k */
142 ph10 391 0, 0, /* l m */
143     ESC_n, 0, /* n o */
144 ph10 392 -ESC_p, 0, /* p q */
145     ESC_r, -ESC_s, /* r s */
146 ph10 391 ESC_tee, 0, /* t u */
147 ph10 392 -ESC_v, -ESC_w, /* v w */
148     0, 0, /* x y */
149 ph10 391 -ESC_z /* z */
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
/* check_escape() indexes this table with (c - 0x48). The hexadecimal number in
the comment at the start of each row is the character code of that row's first
entry; each row holds eight consecutive codes. */
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* Describes one verb: the length of its name and the op to use when the verb
appears without and with an argument. A value of -1 in op or op_arg marks the
form that is not permitted for that verb. */
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
/* All the verb names laid end to end in one string. There are nine names
(counting the empty one), in the same order as the nine entries of verbs[].
NOTE(review): the STRING_xxx0 macros are defined outside this file; each one
presumably ends with a NUL that separates it from the next name - confirm. */
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
/* One entry per name in verbnames, in the same order. The first field is the
length of the corresponding name; the trailing comments give that name. */
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK }, /* (empty name) - argument mandatory */
209 ph10 512 { 4, -1, OP_MARK }, /* MARK - argument mandatory */
210 ph10 510 { 6, OP_ACCEPT, -1 }, /* ACCEPT - no argument allowed */
211     { 6, OP_COMMIT, -1 }, /* COMMIT - no argument allowed */
212     { 1, OP_FAIL, -1 }, /* F - no argument allowed */
213     { 4, OP_FAIL, -1 }, /* FAIL - no argument allowed */
214     { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE - argument optional */
215     { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP - argument optional */
216     { 4, OP_THEN, OP_THEN_ARG } /* THEN - argument optional */
217 ph10 210 };
218    
/* Number of entries in verbs[]. */
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
/* The fourteen POSIX class names laid end to end in one string.
NOTE(review): the STRING_xxx0 macros are defined outside this file; each one
presumably ends with a NUL that separates it from the next name - confirm. */
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
/* Length of each name in posix_names, in the same order (twelve names of five
characters, then "word" and "xdigit"), followed by the terminating zero. */
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
/* Three ints per class: base map offset, second map offset (or -1), and the
add/subtract-and-tweak code described above. The rows are in the same order as
the names in posix_names, as the trailing comments show. */
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
/* Property-based replacement text for the six escapes named in the trailing
comments; each negated form comes immediately before its positive form.
NOTE(review): the code that computes the index into this table is not visible
in this part of the file - confirm the expected order before changing it. */
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
/* Replacement text for the POSIX classes: 28 entries, the first 14 for the
positive classes and the last 14 for the negated classes, each group in the
same order as posix_names. A NULL entry means the class has no substitute. */
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
/* Number of entries in posix_substitutes (positive and negated together). */
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written. XSTRING(s) goes through one extra macro level so
that s is macro-expanded first; XSTRING(MAX_NAME_SIZE) therefore produces the
text of that macro's expansion rather than the text "MAX_NAME_SIZE". It is used
to embed limits in the error messages. */
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
/* Message number N is the N-th NUL-terminated substring, counting from zero;
find_error_text() locates it by stepping over N terminators. The numbered
comments mark every fifth message. New messages must be appended at the end so
that existing numbers do not change. */
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 510 ;
412 nigel 77
413     /* Table to identify digits and hex digits. This is used when compiling
414     patterns. Note that the tables in chartables are dependent on the locale, and
415     may mark arbitrary characters as digits - but the PCRE compiling code expects
416     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
417     a private table here. It costs 256 bytes, but it is a lot faster than doing
418     character value tests (at least in some simple cases I timed), and in some
419     applications one wants PCRE to compile efficiently as well as match
420     efficiently.
421    
422     For convenience, we use the same bit definitions as in chartables:
423    
424     0x04 decimal digit
425     0x08 hexadecimal digit
426    
427     Then we can use ctype_digit and ctype_xdigit in the code. */
428    
429 ph10 392 #ifndef EBCDIC
430 ph10 391
431 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
432 ph10 391 UTF-8 mode. */
433    
/* 256 entries, indexed by character value. An entry of 0x0c (0x04 | 0x08)
marks a character that is both a decimal and a hexadecimal digit; 0x08 alone
marks a hexadecimal-only digit (A-F and a-f); everything else is zero. */
434 nigel 77 static const unsigned char digitab[] =
435     {
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
442     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
443     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
444     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
448     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
468    
469 ph10 392 #else
470 ph10 391
471     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
472    
/* 256 entries, indexed by character value, using the same bit values as the
ASCII table: 0x0c (0x04 | 0x08) for a decimal digit, 0x08 alone for a
hexadecimal-only digit. The rightmost hex number in some row comments (0, 10,
20 ...) is the character code of the first entry of that row. */
473 nigel 77 static const unsigned char digitab[] =
474     {
475     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
476     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
486 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
487 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
491     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
499     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
505     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
506     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
507    
/* 256 entries, indexed by character value. In the EBCDIC build, check_escape()
tests an entry against the mask 0x0E and treats the character as
non-alphanumeric when none of those bits is set.
NOTE(review): the meaning of the individual bits (0x01, 0x02, 0x10, 0x80 ...)
is presumably the same as in the main character tables, which are defined
outside this file - confirm there before relying on them. */
508     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
509     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
510     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
511     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
512     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
513     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
515     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
517     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
518     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
519     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
520 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
521 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
522     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
523     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
525     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
526     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
527     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
528     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
529     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
530     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
531     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
532     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
533     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
534     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
535     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
536     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
537     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
538     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
539     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
540     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
541     #endif
542    
543    
544     /* Forward declaration to allow mutual recursion */
545    
546     static BOOL
547 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
548 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
549 nigel 77
550    
551    
552     /*************************************************
553 ph10 240 * Find an error text *
554     *************************************************/
555    
556 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
557     some of the text is of unknown length, we can't use a table of offsets.
558     Instead, just count through the strings. This is not a performance issue
559 ph10 240 because it happens only when there has been a compilation error.
560    
561     Argument: the error number
562     Returns: pointer to the error string
563     */
564    
565     static const char *
566     find_error_text(int n)
567     {
568     const char *s = error_texts;
569 ph10 507 for (; n > 0; n--)
570 ph10 499 {
571     while (*s++ != 0) {};
572     if (*s == 0) return "Error text not found (please report)";
573 ph10 507 }
574 ph10 240 return s;
575     }
576    
577    
578     /*************************************************
579 nigel 77 * Handle escapes *
580     *************************************************/
581    
582     /* This function is called when a \ has been encountered. It either returns a
583     positive value for a simple escape such as \n, or a negative value which
584 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
585     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
586     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
587     ptr is pointing at the \. On exit, it is on the final character of the escape
588     sequence.
589 nigel 77
590     Arguments:
591     ptrptr points to the pattern position pointer
592     errorcodeptr points to the errorcode variable
593     bracount number of previous extracting brackets
594     options the options bits
595     isclass TRUE if inside a character class
596    
597     Returns: zero or positive => a data character
598     negative => a special escape sequence
599 ph10 213 on error, errorcodeptr is set
600 nigel 77 */
601    
602     static int
603     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
604     int options, BOOL isclass)
605     {
606 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
607     const uschar *ptr = *ptrptr + 1;
608 nigel 77 int c, i;
609    
610 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
611     ptr--; /* Set pointer back to the last byte */
612    
613 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
614    
615     if (c == 0) *errorcodeptr = ERR1;
616    
617 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
618     in a table. A non-zero result is something that can be returned immediately.
619 nigel 77 Otherwise further processing may be required. */
620    
621 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
622     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
623     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
624 nigel 77
625 ph10 97 #else /* EBCDIC coding */
626 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
627 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
628     #endif
629    
630     /* Escapes that need further processing, or are illegal. */
631    
632     else
633     {
634     const uschar *oldptr;
635 nigel 93 BOOL braced, negated;
636    
637 nigel 77 switch (c)
638     {
639     /* A number of Perl escapes are not handled by PCRE. We give an explicit
640     error. */
641    
642 ph10 391 case CHAR_l:
643     case CHAR_L:
644     case CHAR_u:
645     case CHAR_U:
646 nigel 77 *errorcodeptr = ERR37;
647     break;
648    
649 ph10 333 /* \g must be followed by one of a number of specific things:
650 ph10 345
651 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
652     backreference. If negative, it is a relative backreference. This is a Perl
653     5.10 feature.
654 ph10 345
655 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
656     is part of Perl's movement towards a unified syntax for back references. As
657     this is synonymous with \k{name}, we fudge it up by pretending it really
658     was \k.
659 ph10 345
660     (3) For Oniguruma compatibility we also support \g followed by a name or a
661     number either in angle brackets or in single quotes. However, these are
662     (possibly recursive) subroutine calls, _not_ backreferences. Just return
663 ph10 333 the -ESC_g code (cf \k). */
664 nigel 93
665 ph10 391 case CHAR_g:
666     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
667 ph10 333 {
668     c = -ESC_g;
669 ph10 345 break;
670     }
671 ph10 333
672     /* Handle the Perl-compatible cases */
673 ph10 345
674 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
675 nigel 93 {
676 ph10 171 const uschar *p;
677 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
678     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
679     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
680 ph10 171 {
681     c = -ESC_k;
682     break;
683 ph10 172 }
684 nigel 93 braced = TRUE;
685     ptr++;
686     }
687     else braced = FALSE;
688    
689 ph10 391 if (ptr[1] == CHAR_MINUS)
690 nigel 93 {
691     negated = TRUE;
692     ptr++;
693     }
694     else negated = FALSE;
695    
696     c = 0;
697     while ((digitab[ptr[1]] & ctype_digit) != 0)
698 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
699 ph10 220
700 ph10 333 if (c < 0) /* Integer overflow */
701 ph10 213 {
702     *errorcodeptr = ERR61;
703     break;
704 ph10 220 }
705 ph10 345
706 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
707 nigel 93 {
708     *errorcodeptr = ERR57;
709 ph10 213 break;
710 nigel 93 }
711 ph10 345
712 ph10 333 if (c == 0)
713     {
714     *errorcodeptr = ERR58;
715     break;
716 ph10 345 }
717 nigel 93
718     if (negated)
719     {
720     if (c > bracount)
721     {
722     *errorcodeptr = ERR15;
723 ph10 213 break;
724 nigel 93 }
725     c = bracount - (c - 1);
726     }
727    
728     c = -(ESC_REF + c);
729     break;
730    
731 nigel 77 /* The handling of escape sequences consisting of a string of digits
732     starting with one that is not zero is not straightforward. By experiment,
733     the way Perl works seems to be as follows:
734    
735     Outside a character class, the digits are read as a decimal number. If the
736     number is less than 10, or if there are that many previous extracting
737     left brackets, then it is a back reference. Otherwise, up to three octal
738     digits are read to form an escaped byte. Thus \123 is likely to be octal
739     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
740     value is greater than 377, the least significant 8 bits are taken. Inside a
741     character class, \ followed by a digit is always an octal number. */
742    
743 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
744     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
745 nigel 77
746     if (!isclass)
747     {
748     oldptr = ptr;
749 ph10 391 c -= CHAR_0;
750 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
751 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
752 ph10 333 if (c < 0) /* Integer overflow */
753 ph10 213 {
754     *errorcodeptr = ERR61;
755 ph10 220 break;
756     }
757 nigel 77 if (c < 10 || c <= bracount)
758     {
759     c = -(ESC_REF + c);
760     break;
761     }
762     ptr = oldptr; /* Put the pointer back and fall through */
763     }
764    
765     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
766     generates a binary zero byte and treats the digit as a following literal.
767     Thus we have to pull back the pointer by one. */
768    
769 ph10 391 if ((c = *ptr) >= CHAR_8)
770 nigel 77 {
771     ptr--;
772     c = 0;
773     break;
774     }
775    
776     /* \0 always starts an octal number, but we may drop through to here with a
777 nigel 91 larger first octal digit. The original code used just to take the least
778     significant 8 bits of octal numbers (I think this is what early Perls used
779     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
780     than 3 octal digits. */
781 nigel 77
782 ph10 391 case CHAR_0:
783     c -= CHAR_0;
784     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
785     c = c * 8 + *(++ptr) - CHAR_0;
786 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
787 nigel 77 break;
788    
789 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
790     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
791     treated as a data character. */
792 nigel 77
793 ph10 391 case CHAR_x:
794     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
795 nigel 77 {
796     const uschar *pt = ptr + 2;
797 nigel 87 int count = 0;
798    
799 nigel 77 c = 0;
800     while ((digitab[*pt] & ctype_xdigit) != 0)
801     {
802 nigel 87 register int cc = *pt++;
803 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
804 nigel 77 count++;
805 nigel 87
806 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
807     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
808     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
809 ph10 97 #else /* EBCDIC coding */
810 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
811     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
812 nigel 77 #endif
813     }
814 nigel 87
815 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
816 nigel 77 {
817 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
818 nigel 77 ptr = pt;
819     break;
820     }
821 nigel 87
822 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
823     recognize this construct; fall through to the normal \x handling. */
824     }
825    
826 nigel 87 /* Read just a single-byte hex-defined char */
827 nigel 77
828     c = 0;
829     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
830     {
831 ph10 391 int cc; /* Some compilers don't like */
832     cc = *(++ptr); /* ++ in initializers */
833     #ifndef EBCDIC /* ASCII/UTF-8 coding */
834     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
835     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
836 ph10 97 #else /* EBCDIC coding */
837 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
838     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
839 nigel 77 #endif
840     }
841     break;
842    
843 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
844     This coding is ASCII-specific, but then the whole concept of \cx is
845     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
846 nigel 77
847 ph10 391 case CHAR_c:
848 nigel 77 c = *(++ptr);
849     if (c == 0)
850     {
851     *errorcodeptr = ERR2;
852 ph10 213 break;
853 nigel 77 }
854    
855 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
856     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
857 nigel 77 c ^= 0x40;
858 ph10 97 #else /* EBCDIC coding */
859 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
860 nigel 77 c ^= 0xC0;
861     #endif
862     break;
863    
864     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
865 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
866     otherwise, for Perl compatibility, it is a literal. This code looks a bit
867     odd, but there used to be some cases other than the default, and there may
868     be again in future, so I haven't "optimized" it. */
869 nigel 77
870     default:
871     if ((options & PCRE_EXTRA) != 0) switch(c)
872     {
873     default:
874     *errorcodeptr = ERR3;
875     break;
876     }
877     break;
878     }
879     }
880 ph10 518
881     /* Perl supports \N{name} for character names, as well as plain \N for "not
882 ph10 514 newline". PCRE does not support \N{name}. */
883 nigel 77
884 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
885 ph10 518 *errorcodeptr = ERR37;
886 ph10 514
887 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
888    
889     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
890     c -= (ESC_DU - ESC_D);
891    
892     /* Set the pointer to the final character before returning. */
893    
894 nigel 77 *ptrptr = ptr;
895     return c;
896     }
897    
898    
899    
900     #ifdef SUPPORT_UCP
901     /*************************************************
902     * Handle \P and \p *
903     *************************************************/
904    
905     /* This function is called after \P or \p has been encountered, provided that
906     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
907     pointing at the P or p. On exit, it is pointing at the final character of the
908     escape sequence.
909    
910     Argument:
911     ptrptr points to the pattern position pointer
912     negptr points to a boolean that is set TRUE for negation else FALSE
913 nigel 87 dptr points to an int that is set to the detailed property value
914 nigel 77 errorcodeptr points to the error code variable
915    
916 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
917 nigel 77 */
918    
919     static int
920 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
921 nigel 77 {
922     int c, i, bot, top;
923     const uschar *ptr = *ptrptr;
924 nigel 87 char name[32];
925 nigel 77
926     c = *(++ptr);
927     if (c == 0) goto ERROR_RETURN;
928    
929     *negptr = FALSE;
930    
931 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
932     negation. */
933 nigel 77
934 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
935 nigel 77 {
936 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
937 nigel 77 {
938     *negptr = TRUE;
939     ptr++;
940     }
941 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
942 nigel 77 {
943     c = *(++ptr);
944     if (c == 0) goto ERROR_RETURN;
945 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
946 nigel 77 name[i] = c;
947     }
948 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
949 nigel 77 name[i] = 0;
950     }
951    
952     /* Otherwise there is just one following character */
953    
954     else
955     {
956     name[0] = c;
957     name[1] = 0;
958     }
959    
960     *ptrptr = ptr;
961    
962     /* Search for a recognized property name using binary chop */
963    
964     bot = 0;
965     top = _pcre_utt_size;
966    
967     while (bot < top)
968     {
969 nigel 87 i = (bot + top) >> 1;
970 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
971 nigel 87 if (c == 0)
972     {
973     *dptr = _pcre_utt[i].value;
974     return _pcre_utt[i].type;
975     }
976 nigel 77 if (c > 0) bot = i + 1; else top = i;
977     }
978    
979     *errorcodeptr = ERR47;
980     *ptrptr = ptr;
981     return -1;
982    
983     ERROR_RETURN:
984     *errorcodeptr = ERR46;
985     *ptrptr = ptr;
986     return -1;
987     }
988     #endif
989    
990    
991    
992    
993     /*************************************************
994     * Check for counted repeat *
995     *************************************************/
996    
997     /* This function is called when a '{' is encountered in a place where it might
998     start a quantifier. It looks ahead to see if it really is a quantifier or not.
999     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1000     where the ddds are digits.
1001    
1002     Arguments:
1003     p pointer to the first char after '{'
1004    
1005     Returns: TRUE or FALSE
1006     */
1007    
1008     static BOOL
1009     is_counted_repeat(const uschar *p)
1010     {
1011     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1012     while ((digitab[*p] & ctype_digit) != 0) p++;
1013 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1014 nigel 77
1015 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1016     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1017 nigel 77
1018     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019     while ((digitab[*p] & ctype_digit) != 0) p++;
1020    
1021 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1022 nigel 77 }
1023    
1024    
1025    
1026     /*************************************************
1027     * Read repeat counts *
1028     *************************************************/
1029    
1030     /* Read an item of the form {n,m} and return the values. This is called only
1031     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1032     so the syntax is guaranteed to be correct, but we need to check the values.
1033    
1034     Arguments:
1035     p pointer to first char after '{'
1036     minp pointer to int for min
1037     maxp pointer to int for max
1038     returned as -1 if no max
1039     errorcodeptr points to error code variable
1040    
1041     Returns: pointer to '}' on success;
1042     current ptr on error, with errorcodeptr set non-zero
1043     */
1044    
1045     static const uschar *
1046     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1047     {
1048     int min = 0;
1049     int max = -1;
1050    
1051 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1052     an integer overflow. */
1053    
1054 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1055 nigel 81 if (min < 0 || min > 65535)
1056     {
1057     *errorcodeptr = ERR5;
1058     return p;
1059     }
1060 nigel 77
1061 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1062     Also, max must not be less than min. */
1063    
1064 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1065 nigel 77 {
1066 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1067 nigel 77 {
1068     max = 0;
1069 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1070 nigel 81 if (max < 0 || max > 65535)
1071     {
1072     *errorcodeptr = ERR5;
1073     return p;
1074     }
1075 nigel 77 if (max < min)
1076     {
1077     *errorcodeptr = ERR4;
1078     return p;
1079     }
1080     }
1081     }
1082    
1083 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1084     '}'. */
1085 nigel 77
1086 nigel 81 *minp = min;
1087     *maxp = max;
1088 nigel 77 return p;
1089     }
1090    
1091    
1092    
1093     /*************************************************
1094 ph10 408 * Subroutine for finding forward reference *
1095 nigel 91 *************************************************/
1096    
1097 ph10 408 /* This recursive function is called only from find_parens() below. The
1098     top-level call starts at the beginning of the pattern. All other calls must
1099     start at a parenthesis. It scans along a pattern's text looking for capturing
1100 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1101     name it is given, it returns its number. Alternatively, if the name is NULL, it
1102 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1103     encountered, the name will be terminated by '>' because that is checked in the
1104 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1105 ph10 408 capturing group numbers - the (?| feature.
1106 nigel 91
1107     Arguments:
1108 ph10 408 ptrptr address of the current character pointer (updated)
1109 ph10 345 cd compile background data
1110 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1111     lorn name length, or subpattern number if name is NULL
1112     xmode TRUE if we are in /x mode
1113 ph10 411 count pointer to the current capturing subpattern number (updated)
1114 nigel 91
1115     Returns: the number of the named subpattern, or -1 if not found
1116     */
1117    
1118     static int
1119 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1120     BOOL xmode, int *count)
1121 nigel 91 {
1122 ph10 408 uschar *ptr = *ptrptr;
1123     int start_count = *count;
1124     int hwm_count = start_count;
1125     BOOL dup_parens = FALSE;
1126 nigel 93
1127 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1128 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1129    
1130     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1131     {
1132 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1133 ph10 545
1134 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1135 ph10 545
1136 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1137 ph10 408
1138 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1139 ph10 408 {
1140     *count += 1;
1141     if (name == NULL && *count == lorn) return *count;
1142 ph10 411 ptr++;
1143 ph10 408 }
1144    
1145 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1146     where the parenthesis numbers are duplicated. */
1147    
1148     else if (ptr[2] == CHAR_VERTICAL_LINE)
1149     {
1150     ptr += 3;
1151     dup_parens = TRUE;
1152     }
1153 ph10 545
1154 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1155    
1156     else if (ptr[2] == CHAR_NUMBER_SIGN)
1157     {
1158     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1159     goto FAIL_EXIT;
1160 ph10 545 }
1161 ph10 544
1162 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1163     is processed as normal. If not, skip to the closing parenthesis of the
1164 ph10 544 condition (there can't be any nested parens). */
1165 ph10 411
1166 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1167     {
1168 ph10 411 ptr += 2;
1169 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1170     {
1171     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1172 ph10 411 if (*ptr != 0) ptr++;
1173 ph10 408 }
1174 ph10 411 }
1175    
1176 ph10 544 /* Start with (? but not a condition. */
1177 ph10 408
1178     else
1179 ph10 411 {
1180 ph10 408 ptr += 2;
1181     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1182    
1183     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1184 ph10 411
1185 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1186     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1187     {
1188     int term;
1189     const uschar *thisname;
1190     *count += 1;
1191     if (name == NULL && *count == lorn) return *count;
1192     term = *ptr++;
1193     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1194     thisname = ptr;
1195     while (*ptr != term) ptr++;
1196     if (name != NULL && lorn == ptr - thisname &&
1197     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1198     return *count;
1199 ph10 461 term++;
1200 ph10 411 }
1201 ph10 408 }
1202 ph10 411 }
1203 ph10 408
1204 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1205 ph10 408 bars. */
1206    
1207 nigel 91 for (; *ptr != 0; ptr++)
1208     {
1209 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1210    
1211 ph10 391 if (*ptr == CHAR_BACKSLASH)
1212 nigel 93 {
1213 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1214 ph10 391 if (*ptr == CHAR_Q) for (;;)
1215 nigel 93 {
1216 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1217 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1218 ph10 391 if (*(++ptr) == CHAR_E) break;
1219 nigel 93 }
1220     continue;
1221     }
1222    
1223 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1224     are handled for real. If the first character is '^', skip it. Also, if the
1225     first few characters (either before or after ^) are \Q\E or \E we skip them
1226 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1227 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1228 nigel 93
1229 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1230 nigel 93 {
1231 ph10 340 BOOL negate_class = FALSE;
1232     for (;;)
1233     {
1234 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1235 ph10 340 {
1236 ph10 438 if (ptr[2] == CHAR_E)
1237     ptr+= 2;
1238     else if (strncmp((const char *)ptr+2,
1239 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1240 ph10 438 ptr += 4;
1241 ph10 392 else
1242 ph10 391 break;
1243 ph10 340 }
1244 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1245 ph10 461 {
1246 ph10 340 negate_class = TRUE;
1247 ph10 438 ptr++;
1248 ph10 461 }
1249 ph10 340 else break;
1250     }
1251    
1252     /* If the next character is ']', it is a data character that must be
1253 ph10 341 skipped, except in JavaScript compatibility mode. */
1254 ph10 345
1255 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1256 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1257 ph10 345 ptr++;
1258    
1259 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1260 nigel 93 {
1261 ph10 220 if (*ptr == 0) return -1;
1262 ph10 391 if (*ptr == CHAR_BACKSLASH)
1263 nigel 93 {
1264 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1265 ph10 391 if (*ptr == CHAR_Q) for (;;)
1266 nigel 93 {
1267 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1268 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1269 ph10 391 if (*(++ptr) == CHAR_E) break;
1270 nigel 93 }
1271     continue;
1272     }
1273     }
1274     continue;
1275     }
1276    
1277     /* Skip comments in /x mode */
1278    
1279 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1280 nigel 93 {
1281 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1282 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1283 nigel 93 continue;
1284     }
1285    
1286 ph10 408 /* Check for the special metacharacters */
1287 ph10 411
1288 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1289 nigel 93 {
1290 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1291     if (rc > 0) return rc;
1292     if (*ptr == 0) goto FAIL_EXIT;
1293 nigel 93 }
1294 ph10 411
1295 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1296     {
1297     if (dup_parens && *count < hwm_count) *count = hwm_count;
1298 ph10 545 goto FAIL_EXIT;
1299 ph10 408 }
1300 ph10 411
1301     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1302 ph10 408 {
1303     if (*count > hwm_count) hwm_count = *count;
1304     *count = start_count;
1305 ph10 411 }
1306 ph10 408 }
1307 nigel 93
1308 ph10 408 FAIL_EXIT:
1309     *ptrptr = ptr;
1310     return -1;
1311     }
1312 nigel 93
1313    
1314    
1315    
1316 ph10 408 /*************************************************
1317     * Find forward referenced subpattern *
1318     *************************************************/
1319 nigel 93
1320 ph10 408 /* This function scans along a pattern's text looking for capturing
1321     subpatterns, and counting them. If it finds a named pattern that matches the
1322     name it is given, it returns its number. Alternatively, if the name is NULL, it
1323     returns when it reaches a given numbered subpattern. This is used for forward
1324     references to subpatterns. We used to be able to start this scan from the
1325     current compiling point, using the current count value from cd->bracount, and
1326     do it all in a single loop, but the addition of the possibility of duplicate
1327     subpattern numbers means that we have to scan from the very start, in order to
1328     take account of such duplicates, and to use a recursive function to keep track
1329     of the different types of group.
1330    
1331     Arguments:
1332     cd compile background data
1333     name name to seek, or NULL if seeking a numbered subpattern
1334     lorn name length, or subpattern number if name is NULL
1335     xmode TRUE if we are in /x mode
1336    
1337     Returns: the number of the found subpattern, or -1 if not found
1338     */
1339    
1340     static int
1341     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1342     {
1343     uschar *ptr = (uschar *)cd->start_pattern;
1344     int count = 0;
1345     int rc;
1346    
1347     /* If the pattern does not start with an opening parenthesis, the first call
1348     to find_parens_sub() will scan right to the end (if necessary). However, if it
1349     does start with a parenthesis, find_parens_sub() will return when it hits the
1350     matching closing parens. That is why we have to have a loop. */
1351    
1352 ph10 411 for (;;)
1353     {
1354 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1355 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1356     }
1357    
1358 ph10 408 return rc;
1359 nigel 91 }
1360    
1361    
1362    
1363 ph10 408
1364 nigel 91 /*************************************************
1365 nigel 77 * Find first significant op code *
1366     *************************************************/
1367    
1368     /* This is called by several functions that scan a compiled expression looking
1369     for a fixed first character, or an anchoring op code etc. It skips over things
1370     that do not influence this. For some calls, a change of option is important.
1371     For some calls, it makes sense to skip negative forward and all backward
1372     assertions, and also the \b assertion; for others it does not.
1373    
1374     Arguments:
1375     code pointer to the start of the group
1376     options pointer to external options
1377     optbit the option bit whose changing is significant, or
1378     zero if none are
1379     skipassert TRUE if certain assertions are to be skipped
1380    
1381     Returns: pointer to the first significant opcode
1382     */
1383    
1384     static const uschar*
1385     first_significant_code(const uschar *code, int *options, int optbit,
1386     BOOL skipassert)
1387     {
1388     for (;;)
1389     {
1390     switch ((int)*code)
1391     {
1392     case OP_OPT:
1393     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1394     *options = (int)code[1];
1395     code += 2;
1396     break;
1397    
1398     case OP_ASSERT_NOT:
1399     case OP_ASSERTBACK:
1400     case OP_ASSERTBACK_NOT:
1401     if (!skipassert) return code;
1402     do code += GET(code, 1); while (*code == OP_ALT);
1403     code += _pcre_OP_lengths[*code];
1404     break;
1405    
1406     case OP_WORD_BOUNDARY:
1407     case OP_NOT_WORD_BOUNDARY:
1408     if (!skipassert) return code;
1409     /* Fall through */
1410    
1411     case OP_CALLOUT:
1412     case OP_CREF:
1413 ph10 459 case OP_NCREF:
1414 nigel 93 case OP_RREF:
1415 ph10 459 case OP_NRREF:
1416 nigel 93 case OP_DEF:
1417 nigel 77 code += _pcre_OP_lengths[*code];
1418     break;
1419    
1420     default:
1421     return code;
1422     }
1423     }
1424     /* Control never reaches here */
1425     }
1426    
1427    
1428    
1429    
1430     /*************************************************
1431 ph10 454 * Find the fixed length of a branch *
1432 nigel 77 *************************************************/
1433    
1434 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1435 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1436 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1437 ph10 454 temporarily terminated with OP_END when this function is called.
1438 nigel 77
1439 ph10 461 This function is called when a backward assertion is encountered, so that if it
1440     fails, the error message can point to the correct place in the pattern.
1441 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1442 ph10 461 because they can be forward references. We solve this by remembering this case
1443 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1444    
1445 nigel 77 Arguments:
1446     code points to the start of the pattern (the bracket)
1447     options the compiling options
1448 ph10 461 atend TRUE if called when the pattern is complete
1449     cd the "compile data" structure
1450 nigel 77
1451 ph10 461 Returns: the fixed length,
1452 ph10 454 or -1 if there is no fixed length,
1453 nigel 77 or -2 if \C was encountered
1454 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1455 nigel 77 */
1456    
1457     static int
1458 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1459 nigel 77 {
1460     int length = -1;
1461    
1462     register int branchlength = 0;
1463     register uschar *cc = code + 1 + LINK_SIZE;
1464    
1465     /* Scan along the opcodes for this branch. If we get to the end of the
1466     branch, check the length against that of the other branches. */
1467    
1468     for (;;)
1469     {
1470     int d;
1471 ph10 454 uschar *ce, *cs;
1472 nigel 77 register int op = *cc;
1473     switch (op)
1474     {
1475 nigel 93 case OP_CBRA:
1476 nigel 77 case OP_BRA:
1477     case OP_ONCE:
1478     case OP_COND:
1479 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1480 nigel 77 if (d < 0) return d;
1481     branchlength += d;
1482     do cc += GET(cc, 1); while (*cc == OP_ALT);
1483     cc += 1 + LINK_SIZE;
1484     break;
1485    
1486     /* Reached end of a branch; if it's a ket it is the end of a nested
1487     call. If it's ALT it is an alternation in a nested call. If it is
1488     END it's the end of the outer call. All can be handled by the same code. */
1489    
1490     case OP_ALT:
1491     case OP_KET:
1492     case OP_KETRMAX:
1493     case OP_KETRMIN:
1494     case OP_END:
1495     if (length < 0) length = branchlength;
1496     else if (length != branchlength) return -1;
1497     if (*cc != OP_ALT) return length;
1498     cc += 1 + LINK_SIZE;
1499     branchlength = 0;
1500     break;
1501 ph10 461
1502 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1503     be OK. If the subroutine is a forward reference, we can't deal with
1504     it until the end of the pattern, so return -3. */
1505 ph10 461
1506 ph10 454 case OP_RECURSE:
1507     if (!atend) return -3;
1508     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1509     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1510     if (cc > cs && cc < ce) return -1; /* Recursion */
1511     d = find_fixedlength(cs + 2, options, atend, cd);
1512 ph10 461 if (d < 0) return d;
1513 ph10 454 branchlength += d;
1514     cc += 1 + LINK_SIZE;
1515 ph10 461 break;
1516 nigel 77
1517     /* Skip over assertive subpatterns */
1518    
1519     case OP_ASSERT:
1520     case OP_ASSERT_NOT:
1521     case OP_ASSERTBACK:
1522     case OP_ASSERTBACK_NOT:
1523     do cc += GET(cc, 1); while (*cc == OP_ALT);
1524     /* Fall through */
1525    
1526     /* Skip over things that don't match chars */
1527    
1528     case OP_REVERSE:
1529     case OP_CREF:
1530 ph10 459 case OP_NCREF:
1531 nigel 93 case OP_RREF:
1532 ph10 459 case OP_NRREF:
1533 nigel 93 case OP_DEF:
1534 nigel 77 case OP_OPT:
1535     case OP_CALLOUT:
1536     case OP_SOD:
1537     case OP_SOM:
1538 ph10 500 case OP_SET_SOM:
1539 nigel 77 case OP_EOD:
1540     case OP_EODN:
1541     case OP_CIRC:
1542     case OP_DOLL:
1543     case OP_NOT_WORD_BOUNDARY:
1544     case OP_WORD_BOUNDARY:
1545     cc += _pcre_OP_lengths[*cc];
1546     break;
1547    
1548     /* Handle literal characters */
1549    
1550     case OP_CHAR:
1551     case OP_CHARNC:
1552 nigel 91 case OP_NOT:
1553 nigel 77 branchlength++;
1554     cc += 2;
1555     #ifdef SUPPORT_UTF8
1556 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1557 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1558 nigel 77 #endif
1559     break;
1560    
1561     /* Handle exact repetitions. The count is already in characters, but we
1562     need to skip over a multibyte character in UTF8 mode. */
1563    
1564     case OP_EXACT:
1565     branchlength += GET2(cc,1);
1566     cc += 4;
1567     #ifdef SUPPORT_UTF8
1568 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1569 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1570 nigel 77 #endif
1571     break;
1572    
1573     case OP_TYPEEXACT:
1574     branchlength += GET2(cc,1);
1575 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1576 nigel 77 cc += 4;
1577     break;
1578    
1579     /* Handle single-char matchers */
1580    
1581     case OP_PROP:
1582     case OP_NOTPROP:
1583 nigel 87 cc += 2;
1584 nigel 77 /* Fall through */
1585    
1586     case OP_NOT_DIGIT:
1587     case OP_DIGIT:
1588     case OP_NOT_WHITESPACE:
1589     case OP_WHITESPACE:
1590     case OP_NOT_WORDCHAR:
1591     case OP_WORDCHAR:
1592     case OP_ANY:
1593 ph10 342 case OP_ALLANY:
1594 nigel 77 branchlength++;
1595     cc++;
1596     break;
1597    
1598     /* The single-byte matcher isn't allowed */
1599    
1600     case OP_ANYBYTE:
1601     return -2;
1602    
1603     /* Check a class for variable quantification */
1604    
1605     #ifdef SUPPORT_UTF8
1606     case OP_XCLASS:
1607     cc += GET(cc, 1) - 33;
1608     /* Fall through */
1609     #endif
1610    
1611     case OP_CLASS:
1612     case OP_NCLASS:
1613     cc += 33;
1614    
1615     switch (*cc)
1616     {
1617     case OP_CRSTAR:
1618     case OP_CRMINSTAR:
1619     case OP_CRQUERY:
1620     case OP_CRMINQUERY:
1621     return -1;
1622    
1623     case OP_CRRANGE:
1624     case OP_CRMINRANGE:
1625     if (GET2(cc,1) != GET2(cc,3)) return -1;
1626     branchlength += GET2(cc,1);
1627     cc += 5;
1628     break;
1629    
1630     default:
1631     branchlength++;
1632     }
1633     break;
1634    
1635     /* Anything else is variable length */
1636    
1637     default:
1638     return -1;
1639     }
1640     }
1641     /* Control never gets here */
1642     }
1643    
1644    
1645    
1646    
1647     /*************************************************
1648 ph10 454 * Scan compiled regex for specific bracket *
1649 nigel 77 *************************************************/
1650    
1651     /* This little function scans through a compiled pattern until it finds a
1652 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1653 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1654     so that it can be called from pcre_study() when finding the minimum matching
1655 ph10 455 length.
1656 nigel 77
1657     Arguments:
1658     code points to start of expression
1659     utf8 TRUE in UTF-8 mode
1660 ph10 454 number the required bracket number or negative to find a lookbehind
1661 nigel 77
1662     Returns: pointer to the opcode for the bracket, or NULL if not found
1663     */
1664    
1665 ph10 455 const uschar *
1666     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1667 nigel 77 {
1668     for (;;)
1669     {
1670     register int c = *code;
1671     if (c == OP_END) return NULL;
1672 nigel 91
1673     /* XCLASS is used for classes that cannot be represented just by a bit
1674     map. This includes negated single high-valued characters. The length in
1675     the table is zero; the actual length is stored in the compiled code. */
1676    
1677     if (c == OP_XCLASS) code += GET(code, 1);
1678 ph10 461
1679 ph10 454 /* Handle recursion */
1680 ph10 461
1681 ph10 454 else if (c == OP_REVERSE)
1682     {
1683 ph10 461 if (number < 0) return (uschar *)code;
1684 ph10 454 code += _pcre_OP_lengths[c];
1685     }
1686 nigel 91
1687 nigel 93 /* Handle capturing bracket */
1688 nigel 91
1689 nigel 93 else if (c == OP_CBRA)
1690 nigel 77 {
1691 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1692 nigel 77 if (n == number) return (uschar *)code;
1693 nigel 93 code += _pcre_OP_lengths[c];
1694 nigel 77 }
1695 nigel 91
1696 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1697     repeated character types, we have to test for \p and \P, which have an extra
1698 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1699 ph10 510 must add in its length. */
1700 nigel 91
1701 nigel 77 else
1702     {
1703 ph10 218 switch(c)
1704     {
1705     case OP_TYPESTAR:
1706     case OP_TYPEMINSTAR:
1707     case OP_TYPEPLUS:
1708     case OP_TYPEMINPLUS:
1709     case OP_TYPEQUERY:
1710     case OP_TYPEMINQUERY:
1711     case OP_TYPEPOSSTAR:
1712     case OP_TYPEPOSPLUS:
1713     case OP_TYPEPOSQUERY:
1714     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1715 ph10 220 break;
1716 ph10 221
1717     case OP_TYPEUPTO:
1718     case OP_TYPEMINUPTO:
1719     case OP_TYPEEXACT:
1720     case OP_TYPEPOSUPTO:
1721     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1722     break;
1723 ph10 512
1724 ph10 510 case OP_MARK:
1725     case OP_PRUNE_ARG:
1726     case OP_SKIP_ARG:
1727     code += code[1];
1728 ph10 512 break;
1729 ph10 550
1730     case OP_THEN_ARG:
1731     code += code[1+LINK_SIZE];
1732     break;
1733 ph10 220 }
1734    
1735 ph10 218 /* Add in the fixed length from the table */
1736 ph10 220
1737 nigel 77 code += _pcre_OP_lengths[c];
1738 ph10 220
1739 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1740     a multi-byte character. The length in the table is a minimum, so we have to
1741     arrange to skip the extra bytes. */
1742 ph10 220
1743 ph10 107 #ifdef SUPPORT_UTF8
1744 nigel 77 if (utf8) switch(c)
1745     {
1746     case OP_CHAR:
1747     case OP_CHARNC:
1748     case OP_EXACT:
1749     case OP_UPTO:
1750     case OP_MINUPTO:
1751 nigel 93 case OP_POSUPTO:
1752 nigel 77 case OP_STAR:
1753     case OP_MINSTAR:
1754 nigel 93 case OP_POSSTAR:
1755 nigel 77 case OP_PLUS:
1756     case OP_MINPLUS:
1757 nigel 93 case OP_POSPLUS:
1758 nigel 77 case OP_QUERY:
1759     case OP_MINQUERY:
1760 nigel 93 case OP_POSQUERY:
1761     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1762 nigel 77 break;
1763     }
1764 ph10 369 #else
1765     (void)(utf8); /* Keep compiler happy by referencing function argument */
1766 ph10 111 #endif
1767 nigel 77 }
1768     }
1769     }
1770    
1771    
1772    
1773     /*************************************************
1774     * Scan compiled regex for recursion reference *
1775     *************************************************/
1776    
1777     /* This little function scans through a compiled pattern until it finds an
1778     instance of OP_RECURSE.
1779    
1780     Arguments:
1781     code points to start of expression
1782     utf8 TRUE in UTF-8 mode
1783    
1784     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1785     */
1786    
1787     static const uschar *
1788     find_recurse(const uschar *code, BOOL utf8)
1789     {
1790     for (;;)
1791     {
1792     register int c = *code;
1793     if (c == OP_END) return NULL;
1794 nigel 91 if (c == OP_RECURSE) return code;
1795 ph10 220
1796 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1797     map. This includes negated single high-valued characters. The length in
1798     the table is zero; the actual length is stored in the compiled code. */
1799    
1800     if (c == OP_XCLASS) code += GET(code, 1);
1801    
1802 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1803     repeated character types, we have to test for \p and \P, which have an extra
1804 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1805 ph10 510 must add in its length. */
1806 nigel 91
1807 nigel 77 else
1808     {
1809 ph10 218 switch(c)
1810     {
1811     case OP_TYPESTAR:
1812     case OP_TYPEMINSTAR:
1813     case OP_TYPEPLUS:
1814     case OP_TYPEMINPLUS:
1815     case OP_TYPEQUERY:
1816     case OP_TYPEMINQUERY:
1817     case OP_TYPEPOSSTAR:
1818     case OP_TYPEPOSPLUS:
1819     case OP_TYPEPOSQUERY:
1820     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1821 ph10 220 break;
1822 ph10 221
1823     case OP_TYPEPOSUPTO:
1824     case OP_TYPEUPTO:
1825     case OP_TYPEMINUPTO:
1826     case OP_TYPEEXACT:
1827     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1828     break;
1829 ph10 512
1830 ph10 510 case OP_MARK:
1831     case OP_PRUNE_ARG:
1832     case OP_SKIP_ARG:
1833     code += code[1];
1834 ph10 512 break;
1835 ph10 550
1836     case OP_THEN_ARG:
1837     code += code[1+LINK_SIZE];
1838     break;
1839 ph10 220 }
1840    
1841 ph10 218 /* Add in the fixed length from the table */
1842    
1843 nigel 77 code += _pcre_OP_lengths[c];
1844 ph10 220
1845 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1846     by a multi-byte character. The length in the table is a minimum, so we have
1847     to arrange to skip the extra bytes. */
1848 ph10 220
1849 ph10 107 #ifdef SUPPORT_UTF8
1850 nigel 77 if (utf8) switch(c)
1851     {
1852     case OP_CHAR:
1853     case OP_CHARNC:
1854     case OP_EXACT:
1855     case OP_UPTO:
1856     case OP_MINUPTO:
1857 nigel 93 case OP_POSUPTO:
1858 nigel 77 case OP_STAR:
1859     case OP_MINSTAR:
1860 nigel 93 case OP_POSSTAR:
1861 nigel 77 case OP_PLUS:
1862     case OP_MINPLUS:
1863 nigel 93 case OP_POSPLUS:
1864 nigel 77 case OP_QUERY:
1865     case OP_MINQUERY:
1866 nigel 93 case OP_POSQUERY:
1867     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1868 nigel 77 break;
1869     }
1870 ph10 369 #else
1871     (void)(utf8); /* Keep compiler happy by referencing function argument */
1872 ph10 111 #endif
1873 nigel 77 }
1874     }
1875     }
1876    
1877    
1878    
1879     /*************************************************
1880     * Scan compiled branch for non-emptiness *
1881     *************************************************/
1882    
1883     /* This function scans through a branch of a compiled pattern to see whether it
1884 nigel 93 can match the empty string or not. It is called from could_be_empty()
1885     below and from compile_branch() when checking for an unlimited repeat of a
1886     group that can match nothing. Note that first_significant_code() skips over
1887 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1888     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1889     bracket whose current branch will already have been scanned.
1890 nigel 77
1891     Arguments:
1892     code points to start of search
1893     endcode points to where to stop
1894     utf8 TRUE if in UTF8 mode
1895 ph10 503 cd contains pointers to tables etc.
1896 nigel 77
1897     Returns: TRUE if what is matched could be empty
1898     */
1899    
1900     static BOOL
1901 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1902     compile_data *cd)
1903 nigel 77 {
1904     register int c;
1905 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1906 nigel 77 code < endcode;
1907     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1908     {
1909     const uschar *ccode;
1910    
1911     c = *code;
1912 ph10 507
1913 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1914 ph10 282 first_significant_code() with a TRUE final argument. */
1915 ph10 286
1916 ph10 282 if (c == OP_ASSERT)
1917 ph10 286 {
1918 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1919     c = *code;
1920     continue;
1921 ph10 286 }
1922 ph10 172
1923 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1924 nigel 77
1925 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1926 ph10 170 {
1927 ph10 172 code += _pcre_OP_lengths[c];
1928 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1929     c = *code;
1930     continue;
1931     }
1932 ph10 507
1933 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1934     implies a subroutine call, we can scan it. */
1935 ph10 507
1936 ph10 503 if (c == OP_RECURSE)
1937     {
1938 ph10 507 BOOL empty_branch = FALSE;
1939 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1940     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1941     do
1942     {
1943 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1944     {
1945     empty_branch = TRUE;
1946 ph10 507 break;
1947     }
1948 ph10 503 scode += GET(scode, 1);
1949     }
1950     while (*scode == OP_ALT);
1951 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1952 ph10 503 continue;
1953 ph10 507 }
1954 ph10 170
1955     /* For other groups, scan the branches. */
1956 ph10 172
1957 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1958 nigel 77 {
1959     BOOL empty_branch;
1960     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1961 ph10 406
1962     /* If a conditional group has only one branch, there is a second, implied,
1963 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1964     Otherwise, scan the individual branches of the group. */
1965 ph10 406
1966 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1967 nigel 77 code += GET(code, 1);
1968 ph10 395 else
1969 ph10 406 {
1970 ph10 395 empty_branch = FALSE;
1971     do
1972     {
1973 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1974 ph10 395 empty_branch = TRUE;
1975     code += GET(code, 1);
1976     }
1977     while (*code == OP_ALT);
1978     if (!empty_branch) return FALSE; /* All branches are non-empty */
1979 nigel 77 }
1980 ph10 406
1981 ph10 172 c = *code;
1982 nigel 93 continue;
1983 nigel 77 }
1984    
1985 nigel 93 /* Handle the other opcodes */
1986    
1987     switch (c)
1988 nigel 77 {
1989 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1990     cannot be represented just by a bit map. This includes negated single
1991     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1992 ph10 220 actual length is stored in the compiled code, so we must update "code"
1993 ph10 216 here. */
1994 nigel 77
1995     #ifdef SUPPORT_UTF8
1996     case OP_XCLASS:
1997 ph10 216 ccode = code += GET(code, 1);
1998 nigel 77 goto CHECK_CLASS_REPEAT;
1999     #endif
2000    
2001     case OP_CLASS:
2002     case OP_NCLASS:
2003     ccode = code + 33;
2004    
2005     #ifdef SUPPORT_UTF8
2006     CHECK_CLASS_REPEAT:
2007     #endif
2008    
2009     switch (*ccode)
2010     {
2011     case OP_CRSTAR: /* These could be empty; continue */
2012     case OP_CRMINSTAR:
2013     case OP_CRQUERY:
2014     case OP_CRMINQUERY:
2015     break;
2016    
2017     default: /* Non-repeat => class must match */
2018     case OP_CRPLUS: /* These repeats aren't empty */
2019     case OP_CRMINPLUS:
2020     return FALSE;
2021    
2022     case OP_CRRANGE:
2023     case OP_CRMINRANGE:
2024     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2025     break;
2026     }
2027     break;
2028    
2029     /* Opcodes that must match a character */
2030    
2031     case OP_PROP:
2032     case OP_NOTPROP:
2033     case OP_EXTUNI:
2034     case OP_NOT_DIGIT:
2035     case OP_DIGIT:
2036     case OP_NOT_WHITESPACE:
2037     case OP_WHITESPACE:
2038     case OP_NOT_WORDCHAR:
2039     case OP_WORDCHAR:
2040     case OP_ANY:
2041 ph10 345 case OP_ALLANY:
2042 nigel 77 case OP_ANYBYTE:
2043     case OP_CHAR:
2044     case OP_CHARNC:
2045     case OP_NOT:
2046     case OP_PLUS:
2047     case OP_MINPLUS:
2048 nigel 93 case OP_POSPLUS:
2049 nigel 77 case OP_EXACT:
2050     case OP_NOTPLUS:
2051     case OP_NOTMINPLUS:
2052 nigel 93 case OP_NOTPOSPLUS:
2053 nigel 77 case OP_NOTEXACT:
2054     case OP_TYPEPLUS:
2055     case OP_TYPEMINPLUS:
2056 nigel 93 case OP_TYPEPOSPLUS:
2057 nigel 77 case OP_TYPEEXACT:
2058     return FALSE;
2059 ph10 227
2060     /* These are going to continue, as they may be empty, but we have to
2061     fudge the length for the \p and \P cases. */
2062    
2063 ph10 224 case OP_TYPESTAR:
2064     case OP_TYPEMINSTAR:
2065     case OP_TYPEPOSSTAR:
2066     case OP_TYPEQUERY:
2067     case OP_TYPEMINQUERY:
2068     case OP_TYPEPOSQUERY:
2069     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2070 ph10 227 break;
2071    
2072 ph10 224 /* Same for these */
2073 ph10 227
2074 ph10 224 case OP_TYPEUPTO:
2075     case OP_TYPEMINUPTO:
2076     case OP_TYPEPOSUPTO:
2077     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2078     break;
2079 nigel 77
2080     /* End of branch */
2081    
2082     case OP_KET:
2083     case OP_KETRMAX:
2084     case OP_KETRMIN:
2085     case OP_ALT:
2086     return TRUE;
2087    
2088 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2089     MINUPTO, and POSUPTO may be followed by a multibyte character */
2090 nigel 77
2091     #ifdef SUPPORT_UTF8
2092     case OP_STAR:
2093     case OP_MINSTAR:
2094 nigel 93 case OP_POSSTAR:
2095 nigel 77 case OP_QUERY:
2096     case OP_MINQUERY:
2097 nigel 93 case OP_POSQUERY:
2098 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2099     break;
2100 ph10 461
2101 nigel 77 case OP_UPTO:
2102     case OP_MINUPTO:
2103 nigel 93 case OP_POSUPTO:
2104 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2105 nigel 77 break;
2106     #endif
2107 ph10 503
2108 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2109     string. */
2110    
2111     case OP_MARK:
2112     case OP_PRUNE_ARG:
2113     case OP_SKIP_ARG:
2114     code += code[1];
2115 ph10 512 break;
2116 ph10 510
2117 ph10 550 case OP_THEN_ARG:
2118     code += code[1+LINK_SIZE];
2119     break;
2120    
2121 ph10 503 /* None of the remaining opcodes are required to match a character. */
2122 ph10 507
2123 ph10 503 default:
2124 ph10 507 break;
2125 nigel 77 }
2126     }
2127    
2128     return TRUE;
2129     }
2130    
2131    
2132    
2133     /*************************************************
2134     * Scan compiled regex for non-emptiness *
2135     *************************************************/
2136    
2137     /* This function is called to check for left recursive calls. We want to check
2138     the current branch of the current pattern to see if it could match the empty
2139     string. If it could, we must look outwards for branches at other levels,
2140     stopping when we pass beyond the bracket which is the subject of the recursion.
2141    
2142     Arguments:
2143     code points to start of the recursion
2144     endcode points to where to stop (current RECURSE item)
2145     bcptr points to the chain of current (unclosed) branch starts
2146     utf8 TRUE if in UTF-8 mode
2147 ph10 507 cd pointers to tables etc
2148 nigel 77
2149     Returns: TRUE if what is matched could be empty
2150     */
2151    
2152     static BOOL
2153     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2154 ph10 503 BOOL utf8, compile_data *cd)
2155 nigel 77 {
2156 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2157 nigel 77 {
2158 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2159 ph10 475 return FALSE;
2160 nigel 77 bcptr = bcptr->outer;
2161     }
2162     return TRUE;
2163     }
2164    
2165    
2166    
2167     /*************************************************
2168     * Check for POSIX class syntax *
2169     *************************************************/
2170    
2171     /* This function is called when the sequence "[:" or "[." or "[=" is
2172 ph10 295 encountered in a character class. It checks whether this is followed by a
2173 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2174 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2175 nigel 77
2176 ph10 298 Originally, this function only recognized a sequence of letters between the
2177     terminators, but it seems that Perl recognizes any sequence of characters,
2178     though of course unknown POSIX names are subsequently rejected. Perl gives an
2179     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2180     didn't consider this to be a POSIX class. Likewise for [:1234:].
2181 ph10 295
2182 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2183     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2184     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2185     below handles the special case of \], but does not try to do any other escape
2186     processing. This makes it different from Perl for cases such as [:l\ower:]
2187 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2188 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2189 ph10 295 I think.
2190    
2191     Arguments:
2192 nigel 77 ptr pointer to the initial [
2193     endptr where to return the end pointer
2194    
2195     Returns: TRUE or FALSE
2196     */
2197    
2198     static BOOL
2199 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2200 nigel 77 {
2201     int terminator; /* Don't combine these lines; the Solaris cc */
2202     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2203 ph10 295 for (++ptr; *ptr != 0; ptr++)
2204 nigel 77 {
2205 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2206 ph10 298 {
2207 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2208     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2209 ph10 295 {
2210     *endptr = ptr;
2211     return TRUE;
2212 ph10 298 }
2213     }
2214     }
2215 nigel 77 return FALSE;
2216     }
2217    
2218    
2219    
2220    
2221     /*************************************************
2222     * Check POSIX class name *
2223     *************************************************/
2224    
2225     /* This function is called to check the name given in a POSIX-style class entry
2226     such as [:alnum:].
2227    
2228     Arguments:
2229     ptr points to the first letter
2230     len the length of the name
2231    
2232     Returns: a value representing the name, or -1 if unknown
2233     */
2234    
2235     static int
2236     check_posix_name(const uschar *ptr, int len)
2237     {
2238 ph10 240 const char *pn = posix_names;
2239 nigel 77 register int yield = 0;
2240     while (posix_name_lengths[yield] != 0)
2241     {
2242     if (len == posix_name_lengths[yield] &&
2243 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2244 ph10 243 pn += posix_name_lengths[yield] + 1;
2245 nigel 77 yield++;
2246     }
2247     return -1;
2248     }
2249    
2250    
2251     /*************************************************
2252     * Adjust OP_RECURSE items in repeated group *
2253     *************************************************/
2254    
2255     /* OP_RECURSE items contain an offset from the start of the regex to the group
2256     that is referenced. This means that groups can be replicated for fixed
2257     repetition simply by copying (because the recursion is allowed to refer to
2258     earlier groups that are outside the current group). However, when a group is
2259 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2260     inserted before it, after it has been compiled. This means that any OP_RECURSE
2261     items within it that refer to the group itself or any contained groups have to
2262     have their offsets adjusted. That one of the jobs of this function. Before it
2263     is called, the partially compiled regex must be temporarily terminated with
2264     OP_END.
2265 nigel 77
2266 nigel 93 This function has been extended with the possibility of forward references for
2267     recursions and subroutine calls. It must also check the list of such references
2268     for the group we are dealing with. If it finds that one of the recursions in
2269     the current group is on this list, it adjusts the offset in the list, not the
2270     value in the reference (which is a group number).
2271    
2272 nigel 77 Arguments:
2273     group points to the start of the group
2274     adjust the amount by which the group is to be moved
2275     utf8 TRUE in UTF-8 mode
2276     cd contains pointers to tables etc.
2277 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2278 nigel 77
2279     Returns: nothing
2280     */
2281    
2282     static void
2283 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2284     uschar *save_hwm)
2285 nigel 77 {
2286     uschar *ptr = group;
2287 ph10 224
2288 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2289     {
2290 nigel 93 int offset;
2291     uschar *hc;
2292    
2293     /* See if this recursion is on the forward reference list. If so, adjust the
2294     reference. */
2295 ph10 345
2296 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2297     {
2298     offset = GET(hc, 0);
2299     if (cd->start_code + offset == ptr + 1)
2300     {
2301     PUT(hc, 0, offset + adjust);
2302     break;
2303     }
2304     }
2305    
2306     /* Otherwise, adjust the recursion offset if it's after the start of this
2307     group. */
2308    
2309     if (hc >= cd->hwm)
2310     {
2311     offset = GET(ptr, 1);
2312     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2313     }
2314    
2315 nigel 77 ptr += 1 + LINK_SIZE;
2316     }
2317     }
2318    
2319    
2320    
2321     /*************************************************
2322     * Insert an automatic callout point *
2323     *************************************************/
2324    
2325     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2326     callout points before each pattern item.
2327    
2328     Arguments:
2329     code current code pointer
2330     ptr current pattern pointer
2331     cd pointers to tables etc
2332    
2333     Returns: new code pointer
2334     */
2335    
2336     static uschar *
2337     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2338     {
2339     *code++ = OP_CALLOUT;
2340     *code++ = 255;
2341 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2342     PUT(code, LINK_SIZE, 0); /* Default length */
2343 nigel 77 return code + 2*LINK_SIZE;
2344     }
2345    
2346    
2347    
2348     /*************************************************
2349     * Complete a callout item *
2350     *************************************************/
2351    
2352     /* A callout item contains the length of the next item in the pattern, which
2353     we can't fill in till after we have reached the relevant point. This is used
2354     for both automatic and manual callouts.
2355    
2356     Arguments:
2357     previous_callout points to previous callout item
2358     ptr current pattern pointer
2359     cd pointers to tables etc
2360    
2361     Returns: nothing
2362     */
2363    
2364     static void
2365     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2366     {
2367 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2368 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2369     }
2370    
2371    
2372    
2373     #ifdef SUPPORT_UCP
2374     /*************************************************
2375     * Get othercase range *
2376     *************************************************/
2377    
2378     /* This function is passed the start and end of a class range, in UTF-8 mode
2379     with UCP support. It searches up the characters, looking for internal ranges of
2380     characters in the "other" case. Each call returns the next one, updating the
2381     start address.
2382    
2383     Arguments:
2384     cptr points to starting character value; updated
2385     d end value
2386     ocptr where to put start of othercase range
2387     odptr where to put end of othercase range
2388    
2389     Yield: TRUE when range returned; FALSE when no more
2390     */
2391    
2392     static BOOL
2393 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2394     unsigned int *odptr)
2395 nigel 77 {
2396 nigel 93 unsigned int c, othercase, next;
2397 nigel 77
2398     for (c = *cptr; c <= d; c++)
2399 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2400 nigel 77
2401     if (c > d) return FALSE;
2402    
2403     *ocptr = othercase;
2404     next = othercase + 1;
2405    
2406     for (++c; c <= d; c++)
2407     {
2408 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2409 nigel 77 next++;
2410     }
2411    
2412     *odptr = next - 1;
2413     *cptr = c;
2414    
2415     return TRUE;
2416     }
2417 ph10 532
2418    
2419    
2420     /*************************************************
2421     * Check a character and a property *
2422     *************************************************/
2423    
2424     /* This function is called by check_auto_possessive() when a property item
2425     is adjacent to a fixed character.
2426    
2427     Arguments:
2428     c the character
2429     ptype the property type
2430     pdata the data for the type
2431     negated TRUE if it's a negated property (\P or \p{^)
2432 ph10 535
2433 ph10 532 Returns: TRUE if auto-possessifying is OK
2434 ph10 535 */
2435 ph10 532
2436     static BOOL
2437     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2438     {
2439     const ucd_record *prop = GET_UCD(c);
2440     switch(ptype)
2441     {
2442     case PT_LAMP:
2443     return (prop->chartype == ucp_Lu ||
2444     prop->chartype == ucp_Ll ||
2445     prop->chartype == ucp_Lt) == negated;
2446    
2447     case PT_GC:
2448     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2449    
2450     case PT_PC:
2451     return (pdata == prop->chartype) == negated;
2452    
2453     case PT_SC:
2454     return (pdata == prop->script) == negated;
2455    
2456     /* These are specials */
2457    
2458     case PT_ALNUM:
2459     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2460     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2461    
2462     case PT_SPACE: /* Perl space */
2463     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2464     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2465     == negated;
2466    
2467     case PT_PXSPACE: /* POSIX space */
2468     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2469     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2470     c == CHAR_FF || c == CHAR_CR)
2471     == negated;
2472    
2473     case PT_WORD:
2474     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2475     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2476     c == CHAR_UNDERSCORE) == negated;
2477     }
2478 ph10 535 return FALSE;
2479 ph10 532 }
2480 nigel 77 #endif /* SUPPORT_UCP */
2481    
2482    
2483 nigel 93
2484 nigel 77 /*************************************************
2485 nigel 93 * Check if auto-possessifying is possible *
2486     *************************************************/
2487    
2488     /* This function is called for unlimited repeats of certain items, to see
2489     whether the next thing could possibly match the repeated item. If not, it makes
2490     sense to automatically possessify the repeated item.
2491    
2492     Arguments:
2493 ph10 532 previous pointer to the repeated opcode
2494 nigel 93 utf8 TRUE in UTF-8 mode
2495     ptr next character in pattern
2496     options options bits
2497     cd contains pointers to tables etc.
2498    
2499     Returns: TRUE if possessifying is wanted
2500     */
2501    
/* check_auto_possessive(): decide whether the item that follows a repeated
item in the pattern can never match what the repeated item matches. Returns
TRUE only for the specific combinations tested below; every case that is not
explicitly recognized returns FALSE (do not possessify). */
2502     static BOOL
2503 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2504 ph10 532 int options, compile_data *cd)
2505 nigel 93 {
2506 ph10 532 int c, next;
2507     int op_code = *previous++;
/* op_code now holds the repeated item's opcode, and previous has been advanced
to the byte(s) that follow it (character data or property type/value). */
2508 nigel 93 
2509     /* Skip whitespace and comments in extended mode */
2510    
2511     if ((options & PCRE_EXTENDED) != 0)
2512     {
2513     for (;;)
2514     {
2515     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2516 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2517 nigel 93 {
2518     while (*(++ptr) != 0)
2519     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2520     }
2521     else break;
2522     }
2523     }
2524    
2525     /* If the next item is one that we can handle, get its value. A non-negative
2526     value is a character, a negative value is an escape value. */
2527    
2528 ph10 391 if (*ptr == CHAR_BACKSLASH)
2529 nigel 93 {
/* The error code goes into a local that is discarded: a bad escape is not
reported from here, it just prevents possessification. */
2530     int temperrorcode = 0;
2531     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2532     if (temperrorcode != 0) return FALSE;
2533     ptr++; /* Point after the escape sequence */
2534     }
2535    
2536     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2537     {
2538     #ifdef SUPPORT_UTF8
2539     if (utf8) { GETCHARINC(next, ptr); } else
2540     #endif
2541     next = *ptr++;
2542     }
2543    
/* Anything flagged ctype_meta is not analysed: give up. */
2544     else return FALSE;
2545    
2546     /* Skip whitespace and comments in extended mode */
2547    
2548     if ((options & PCRE_EXTENDED) != 0)
2549     {
2550     for (;;)
2551     {
2552     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2553 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2554 nigel 93 {
2555     while (*(++ptr) != 0)
2556     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2557     }
2558     else break;
2559     }
2560     }
2561    
2562     /* If the next thing is itself optional, we have to give up. */
2563    
/* The tests are for '*', '?', or the three characters "{0," immediately after
the next item. */
2564 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2565 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2566     return FALSE;
2567 nigel 93 
2568 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2569     the next item is a character. */
2570 nigel 93 
/* Every path through this first switch returns, so the code after it is reached
only when next is negative. */
2571     if (next >= 0) switch(op_code)
2572     {
2573     case OP_CHAR:
2574 ph10 535 #ifdef SUPPORT_UTF8
2575 ph10 532 GETCHARTEST(c, previous);
2576 ph10 369 #else
2577 ph10 532 c = *previous;
2578 ph10 535 #endif
/* Possessify only when the following character differs from the repeated one. */
2579     return c != next;
2580 nigel 93 
2581     /* For CHARNC (caseless character) we must check the other case. If we have
2582     Unicode property support, we can use it to test the other case of
2583     high-valued characters. */
2584    
2585     case OP_CHARNC:
2586 ph10 535 #ifdef SUPPORT_UTF8
2587 ph10 532 GETCHARTEST(c, previous);
2588     #else
2589     c = *previous;
2590 ph10 535 #endif
2591 ph10 532 if (c == next) return FALSE;
2592 nigel 93 #ifdef SUPPORT_UTF8
2593     if (utf8)
2594     {
2595     unsigned int othercase;
/* Values below 128 use the fcc table; higher values use UCD_OTHERCASE when
SUPPORT_UCP is defined, otherwise NOTACHAR is substituted. */
2596     if (next < 128) othercase = cd->fcc[next]; else
2597     #ifdef SUPPORT_UCP
2598 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2599 nigel 93 #else
2600     othercase = NOTACHAR;
2601     #endif
2602 ph10 532 return (unsigned int)c != othercase;
2603 nigel 93 }
2604     else
2605     #endif /* SUPPORT_UTF8 */
2606 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2607 nigel 93 
2608 ph10 532 /* For OP_NOT, its data is always a single-byte character. */
2609 nigel 93 
2610     case OP_NOT:
/* The sense is inverted relative to OP_CHAR: TRUE is returned when the next
character equals the excluded character (or, if caseless, its other case). */
2611 ph10 532 if ((c = *previous) == next) return TRUE;
2612 nigel 93 if ((options & PCRE_CASELESS) == 0) return FALSE;
2613     #ifdef SUPPORT_UTF8
2614     if (utf8)
2615     {
2616     unsigned int othercase;
2617     if (next < 128) othercase = cd->fcc[next]; else
2618     #ifdef SUPPORT_UCP
2619 ph10 349 othercase = UCD_OTHERCASE(next);
2620 nigel 93 #else
2621     othercase = NOTACHAR;
2622     #endif
2623 ph10 532 return (unsigned int)c == othercase;
2624 nigel 93 }
2625     else
2626     #endif /* SUPPORT_UTF8 */
2627 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2628 nigel 93 
2629 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2630     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2631    
/* In the six cases below the ctypes table is consulted only for values up to
127; larger values are decided by the "next > 127" / "next <= 127" test alone. */
2632 nigel 93 case OP_DIGIT:
2633     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2634    
2635     case OP_NOT_DIGIT:
2636     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2637    
2638     case OP_WHITESPACE:
2639     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2640    
2641     case OP_NOT_WHITESPACE:
2642     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2643    
2644     case OP_WORDCHAR:
2645     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2646    
2647     case OP_NOT_WORDCHAR:
2648     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2649    
2650 ph10 180 case OP_HSPACE:
2651     case OP_NOT_HSPACE:
/* If next is one of the listed code points, the result is TRUE only for
OP_NOT_HSPACE; for any other value it is TRUE only for OP_HSPACE. */
2652     switch(next)
2653     {
2654     case 0x09:
2655     case 0x20:
2656     case 0xa0:
2657     case 0x1680:
2658     case 0x180e:
2659     case 0x2000:
2660     case 0x2001:
2661     case 0x2002:
2662     case 0x2003:
2663     case 0x2004:
2664     case 0x2005:
2665     case 0x2006:
2666     case 0x2007:
2667     case 0x2008:
2668     case 0x2009:
2669     case 0x200A:
2670     case 0x202f:
2671     case 0x205f:
2672     case 0x3000:
2673 ph10 528 return op_code == OP_NOT_HSPACE;
2674 ph10 180 default:
2675 ph10 528 return op_code != OP_NOT_HSPACE;
2676 ph10 180 }
2677    
/* OP_ANYNL shares the OP_VSPACE handling here: for a listed code point only
OP_NOT_VSPACE yields TRUE; for any other value OP_ANYNL and OP_VSPACE do. */
2678 ph10 528 case OP_ANYNL:
2679 ph10 180 case OP_VSPACE:
2680     case OP_NOT_VSPACE:
2681     switch(next)
2682     {
2683     case 0x0a:
2684     case 0x0b:
2685     case 0x0c:
2686     case 0x0d:
2687     case 0x85:
2688     case 0x2028:
2689     case 0x2029:
2690 ph10 528 return op_code == OP_NOT_VSPACE;
2691 ph10 180 default:
2692 ph10 528 return op_code != OP_NOT_VSPACE;
2693 ph10 180 }
2694    
2695 ph10 532 #ifdef SUPPORT_UCP
/* previous[0] and previous[1] are the two bytes following the opcode; they are
passed as the property type and value, with the final argument giving the
negation. */
2696     case OP_PROP:
2697     return check_char_prop(next, previous[0], previous[1], FALSE);
2698 ph10 535 
2699 ph10 532 case OP_NOTPROP:
2700     return check_char_prop(next, previous[0], previous[1], TRUE);
2701     #endif
2702    
2703 nigel 93 default:
2704     return FALSE;
2705     }
2706    
2707    
2708 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2709     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2710     generated only when PCRE_UCP is *not* set, that is, when only ASCII
2711     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2712 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
2713 nigel 93 
/* next is negative from here on; -next is compared against the ESC_ codes. */
2714     switch(op_code)
2715     {
2716     case OP_CHAR:
2717     case OP_CHARNC:
2718 ph10 535 #ifdef SUPPORT_UTF8
2719 ph10 532 GETCHARTEST(c, previous);
2720     #else
2721     c = *previous;
2722 ph10 535 #endif
/* The repeated item is a literal character c: test c against the class implied
by the following escape. */
2723 nigel 93 switch(-next)
2724     {
2725     case ESC_d:
2726 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2727 nigel 93 
2728     case ESC_D:
2729 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2730 nigel 93 
2731     case ESC_s:
2732 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2733 nigel 93 
2734     case ESC_S:
2735 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2736 nigel 93 
2737     case ESC_w:
2738 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2739 nigel 93 
2740     case ESC_W:
2741 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2742 ph10 182 
2743 ph10 180 case ESC_h:
2744     case ESC_H:
/* For a listed code point the result is TRUE only when the escape is ESC_H;
for any other value of c it is TRUE only when the escape is ESC_h. */
2745 ph10 532 switch(c)
2746 ph10 180 {
2747     case 0x09:
2748     case 0x20:
2749     case 0xa0:
2750     case 0x1680:
2751     case 0x180e:
2752     case 0x2000:
2753     case 0x2001:
2754     case 0x2002:
2755     case 0x2003:
2756     case 0x2004:
2757     case 0x2005:
2758     case 0x2006:
2759     case 0x2007:
2760     case 0x2008:
2761     case 0x2009:
2762     case 0x200A:
2763     case 0x202f:
2764     case 0x205f:
2765     case 0x3000:
2766     return -next != ESC_h;
2767     default:
2768     return -next == ESC_h;
2769 ph10 182 }
2770    
2771 ph10 180 case ESC_v:
2772     case ESC_V:
/* Same scheme as ESC_h/ESC_H above, using the vertical-space code points. */
2773 ph10 532 switch(c)
2774 ph10 180 {
2775     case 0x0a:
2776     case 0x0b:
2777     case 0x0c:
2778     case 0x0d:
2779     case 0x85:
2780     case 0x2028:
2781     case 0x2029:
2782     return -next != ESC_v;
2783     default:
2784     return -next == ESC_v;
2785 ph10 182 }
2786 ph10 535 
2787     /* When PCRE_UCP is set, these values get generated for \d etc. Find
2788     their substitutions and process them. The result will always be either
2789 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
2790 ph10 535 
2791 ph10 532 #ifdef SUPPORT_UCP
2792     case ESC_du:
2793     case ESC_DU:
2794     case ESC_wu:
2795     case ESC_WU:
2796     case ESC_su:
2797     case ESC_SU:
2798     {
/* ptr is redirected into the substitutes[] table (indexed relative to ESC_DU)
and the substitute text is re-parsed with check_escape(); the original pattern
position is not needed again in this function. */
2799     int temperrorcode = 0;
2800     ptr = substitutes[-next - ESC_DU];
2801     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2802     if (temperrorcode != 0) return FALSE;
2803     ptr++; /* For compatibility */
2804     }
2805 ph10 535 /* Fall through */
2806 nigel 93 
2807 ph10 532 case ESC_p:
2808     case ESC_P:
2809     {
/* errorcodeptr is a local scratch int handed to get_ucp(); only the returned
ptype is tested here, and a negative ptype means "give up". */
2810     int ptype, pdata, errorcodeptr;
2811 ph10 535 BOOL negated;
2812    
2813 ph10 532 ptr--; /* Make ptr point at the p or P */
2814     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2815     if (ptype < 0) return FALSE;
2816     ptr++; /* Point past the final curly ket */
2817 ph10 535 
2818 ph10 532 /* If the property item is optional, we have to give up. (When generated
2819     from \d etc by PCRE_UCP, this test will have been applied much earlier,
2820     to the original \d etc. At this point, ptr will point to a zero byte.) */
2821 ph10 535 
2822 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2823     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2824     return FALSE;
2825 ph10 535 
2826 ph10 532 /* Do the property check. */
2827 ph10 535 
/* The negation passed on is the exclusive-or of "escape was ESC_P" and the
negated flag set by get_ucp(). */
2828 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2829 ph10 535 }
2830 ph10 532 #endif
2831    
2832 nigel 93 default:
2833     return FALSE;
2834     }
/* End of the inner switch on -next for OP_CHAR/OP_CHARNC. The remaining cases
belong to the outer switch on op_code and pair a repeated character-type opcode
with a following escape; TRUE is returned only for the combinations listed. */
2835    
2836 ph10 535 /* In principle, support for Unicode properties should be integrated here as
2837     well. It means re-organizing the above code so as to get hold of the property
2838     values before switching on the op-code. However, I wonder how many patterns
2839     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2840     these op-codes are never generated.) */
2841    
2842 nigel 93 case OP_DIGIT:
2843 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2844 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
2845 nigel 93 
2846     case OP_NOT_DIGIT:
2847     return next == -ESC_d;
2848    
2849     case OP_WHITESPACE:
2850 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2851 nigel 93 
2852     case OP_NOT_WHITESPACE:
2853 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2854 nigel 93 
2855 ph10 180 case OP_HSPACE:
2856 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2857 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
2858 ph10 180 
2859     case OP_NOT_HSPACE:
2860     return next == -ESC_h;
2861 ph10 182 
2862 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2863 ph10 535 case OP_ANYNL:
2864 ph10 182 case OP_VSPACE:
2865 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2866    
2867     case OP_NOT_VSPACE:
2868 ph10 528 return next == -ESC_v || next == -ESC_R;
2869 ph10 180 
2870 nigel 93 case OP_WORDCHAR:
2871 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2872 ph10 528 next == -ESC_v || next == -ESC_R;
2873 nigel 93 
2874     case OP_NOT_WORDCHAR:
2875     return next == -ESC_w || next == -ESC_d;
2876 ph10 182 
2877 nigel 93 default:
2878     return FALSE;
2879     }
2880    
2881     /* Control does not reach here */
2882     }
2883    
2884    
2885    
2886     /*************************************************
2887 nigel 77 * Compile one branch *
2888     *************************************************/
2889    
2890 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2891 nigel 77 changed during the branch, the pointer is used to change the external options
2892 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2893     to find out the amount of memory needed, as well as during the real compile
2894     phase. The value of lengthptr distinguishes the two phases.
2895 nigel 77
2896     Arguments:
2897     optionsptr pointer to the option bits
2898     codeptr points to the pointer to the current code point
2899     ptrptr points to the current pattern pointer
2900     errorcodeptr points to error code variable
2901     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2902     reqbyteptr set to the last literal character required, else < 0
2903     bcptr points to current branch chain
2904     cd contains pointers to tables etc.
2905 nigel 93 lengthptr NULL during the real compile phase
2906     points to length accumulator during pre-compile phase
2907 nigel 77
2908     Returns: TRUE on success
2909     FALSE, with *errorcodeptr set non-zero on error
2910     */
2911    
2912     static BOOL
2913 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2914     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2915     compile_data *cd, int *lengthptr)
2916 nigel 77 {
2917     int repeat_type, op_type;
2918     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2919     int bravalue = 0;
2920     int greedy_default, greedy_non_default;
2921     int firstbyte, reqbyte;
2922     int zeroreqbyte, zerofirstbyte;
2923     int req_caseopt, reqvary, tempreqvary;
2924     int options = *optionsptr;
2925     int after_manual_callout = 0;
2926 nigel 93 int length_prevgroup = 0;
2927 nigel 77 register int c;
2928     register uschar *code = *codeptr;
2929 nigel 93 uschar *last_code = code;
2930     uschar *orig_code = code;
2931 nigel 77 uschar *tempcode;
2932     BOOL inescq = FALSE;
2933     BOOL groupsetfirstbyte = FALSE;
2934     const uschar *ptr = *ptrptr;
2935     const uschar *tempptr;
2936 ph10 518 const uschar *nestptr = NULL;
2937 nigel 77 uschar *previous = NULL;
2938     uschar *previous_callout = NULL;
2939 nigel 93 uschar *save_hwm = NULL;
2940 nigel 77 uschar classbits[32];
2941    
2942     #ifdef SUPPORT_UTF8
2943     BOOL class_utf8;
2944     BOOL utf8 = (options & PCRE_UTF8) != 0;
2945     uschar *class_utf8data;
2946 ph10 300 uschar *class_utf8data_base;
2947 nigel 77 uschar utf8_char[6];
2948     #else
2949     BOOL utf8 = FALSE;
2950 nigel 93 uschar *utf8_char = NULL;
2951 nigel 77 #endif
2952    
2953 ph10 475 #ifdef PCRE_DEBUG
2954 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2955     #endif
2956    
2957 nigel 77 /* Set up the default and non-default settings for greediness */
2958    
2959     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2960     greedy_non_default = greedy_default ^ 1;
2961    
2962     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2963     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2964     matches a non-fixed char first char; reqbyte just remains unset if we never
2965     find one.
2966    
2967     When we hit a repeat whose minimum is zero, we may have to adjust these values
2968     to take the zero repeat into account. This is implemented by setting them to
2969     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2970     item types that can be repeated set these backoff variables appropriately. */
2971    
2972     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2973    
2974     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2975     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2976     value > 255. It is added into the firstbyte or reqbyte variables to record the
2977     case status of the value. This is used only for ASCII characters. */
2978    
2979     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2980    
2981     /* Switch on next character until the end of the branch */
2982    
2983     for (;; ptr++)
2984     {
2985     BOOL negate_class;
2986 ph10 286 BOOL should_flip_negation;
2987 nigel 77 BOOL possessive_quantifier;
2988     BOOL is_quantifier;
2989 nigel 93 BOOL is_recurse;
2990 ph10 180 BOOL reset_bracount;
2991 nigel 77 int class_charcount;
2992     int class_lastchar;
2993     int newoptions;
2994     int recno;
2995 ph10 172 int refsign;
2996 nigel 77 int skipbytes;
2997     int subreqbyte;
2998     int subfirstbyte;
2999 nigel 93 int terminator;
3000 nigel 77 int mclength;
3001     uschar mcbuffer[8];
3002    
3003 nigel 93 /* Get next byte in the pattern */
3004 nigel 77
3005     c = *ptr;
3006 ph10 345
3007 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3008 ph10 518 string. Nesting only happens one level deep. */
3009    
3010     if (c == 0 && nestptr != NULL)
3011     {
3012     ptr = nestptr;
3013     nestptr = NULL;
3014     c = *ptr;
3015     }
3016    
3017 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3018     previous cycle of this loop. */
3019    
3020     if (lengthptr != NULL)
3021     {
3022 ph10 475 #ifdef PCRE_DEBUG
3023 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3024     #endif
3025 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3026 nigel 93 {
3027     *errorcodeptr = ERR52;
3028     goto FAILED;
3029     }
3030    
3031     /* There is at least one situation where code goes backwards: this is the
3032     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3033     the class is simply eliminated. However, it is created first, so we have to
3034     allow memory for it. Therefore, don't ever reduce the length at this point.
3035     */
3036    
3037     if (code < last_code) code = last_code;
3038 ph10 202
3039     /* Paranoid check for integer overflow */
3040    
3041     if (OFLOW_MAX - *lengthptr < code - last_code)
3042     {
3043     *errorcodeptr = ERR20;
3044     goto FAILED;
3045     }
3046    
3047 ph10 530 *lengthptr += (int)(code - last_code);
3048 nigel 93 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3049    
3050     /* If "previous" is set and it is not at the start of the work space, move
3051     it back to there, in order to avoid filling up the work space. Otherwise,
3052     if "previous" is NULL, reset the current code pointer to the start. */
3053    
3054     if (previous != NULL)
3055     {
3056     if (previous > orig_code)
3057     {
3058     memmove(orig_code, previous, code - previous);
3059     code -= previous - orig_code;
3060     previous = orig_code;
3061     }
3062     }
3063     else code = orig_code;
3064    
3065     /* Remember where this code item starts so we can pick up the length
3066     next time round. */
3067    
3068     last_code = code;
3069     }
3070    
3071     /* In the real compile phase, just check the workspace used by the forward
3072     reference list. */
3073    
3074 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3075 nigel 93 {
3076     *errorcodeptr = ERR52;
3077     goto FAILED;
3078     }
3079    
3080 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3081    
3082     if (inescq && c != 0)
3083     {
3084 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3085 nigel 77 {
3086     inescq = FALSE;
3087     ptr++;
3088     continue;
3089     }
3090     else
3091     {
3092     if (previous_callout != NULL)
3093     {
3094 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3095     complete_callout(previous_callout, ptr, cd);
3096 nigel 77 previous_callout = NULL;
3097     }
3098     if ((options & PCRE_AUTO_CALLOUT) != 0)
3099     {
3100     previous_callout = code;
3101     code = auto_callout(code, ptr, cd);
3102     }
3103     goto NORMAL_CHAR;
3104     }
3105     }
3106    
3107     /* Fill in length of a previous callout, except when the next thing is
3108     a quantifier. */
3109    
3110 ph10 392 is_quantifier =
3111 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3112     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3113 nigel 77
3114     if (!is_quantifier && previous_callout != NULL &&
3115     after_manual_callout-- <= 0)
3116     {
3117 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3118     complete_callout(previous_callout, ptr, cd);
3119 nigel 77 previous_callout = NULL;
3120     }
3121    
3122     /* In extended mode, skip white space and comments */
3123    
3124     if ((options & PCRE_EXTENDED) != 0)
3125     {
3126     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3127 ph10 391 if (c == CHAR_NUMBER_SIGN)
3128 nigel 77 {
3129 nigel 93 while (*(++ptr) != 0)
3130 nigel 91 {
3131 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3132 nigel 91 }
3133 nigel 93 if (*ptr != 0) continue;
3134    
3135 nigel 91 /* Else fall through to handle end of string */
3136     c = 0;
3137 nigel 77 }
3138     }
3139    
3140     /* No auto callout for quantifiers. */
3141    
3142     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3143     {
3144     previous_callout = code;
3145     code = auto_callout(code, ptr, cd);
3146     }
3147    
3148     switch(c)
3149     {
3150 nigel 93 /* ===================================================================*/
3151     case 0: /* The branch terminates at string end */
3152 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3153     case CHAR_RIGHT_PARENTHESIS:
3154 nigel 77 *firstbyteptr = firstbyte;
3155     *reqbyteptr = reqbyte;
3156     *codeptr = code;
3157     *ptrptr = ptr;
3158 nigel 93 if (lengthptr != NULL)
3159     {
3160 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3161     {
3162     *errorcodeptr = ERR20;
3163     goto FAILED;
3164     }
3165 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3166 nigel 93 DPRINTF((">> end branch\n"));
3167     }
3168 nigel 77 return TRUE;
3169    
3170 nigel 93
3171     /* ===================================================================*/
3172 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3173     the setting of any following char as a first character. */
3174    
3175 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3176 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3177     {
3178     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3179     }
3180     previous = NULL;
3181     *code++ = OP_CIRC;
3182     break;
3183    
3184 ph10 391 case CHAR_DOLLAR_SIGN:
3185 nigel 77 previous = NULL;
3186     *code++ = OP_DOLL;
3187     break;
3188    
3189     /* There can never be a first char if '.' is first, whatever happens about
3190     repeats. The value of reqbyte doesn't change either. */
3191    
3192 ph10 391 case CHAR_DOT:
3193 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3194     zerofirstbyte = firstbyte;
3195     zeroreqbyte = reqbyte;
3196     previous = code;
3197 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3198 nigel 77 break;
3199    
3200 nigel 93
3201     /* ===================================================================*/
3202 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3203     32-byte bitmap of the permitted characters, except in the special case
3204     where there is only one such character. For negated classes, we build the
3205     map as usual, then invert it at the end. However, we use a different opcode
3206     so that data characters > 255 can be handled correctly.
3207 nigel 77
3208     If the class contains characters outside the 0-255 range, a different
3209     opcode is compiled. It may optionally have a bit map for characters < 256,
3210     but those above are explicitly listed afterwards. A flag byte tells
3211     whether the bitmap is present, and whether this is a negated class or not.
3212 ph10 345
3213 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3214     default (Perl) mode, it is treated as a data character. */
3215 ph10 345
3216 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3217 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3218     {
3219     *errorcodeptr = ERR64;
3220 ph10 345 goto FAILED;
3221 ph10 336 }
3222 ph10 345 goto NORMAL_CHAR;
3223 nigel 77
3224 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3225 nigel 77 previous = code;
3226    
3227     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3228     they are encountered at the top level, so we'll do that too. */
3229    
3230 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3231 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3232 ph10 295 check_posix_syntax(ptr, &tempptr))
3233 nigel 77 {
3234 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3235 nigel 77 goto FAILED;
3236     }
3237    
3238 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3239 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3240 ph10 205 skip them too. This makes for compatibility with Perl. */
3241 ph10 208
3242 ph10 205 negate_class = FALSE;
3243     for (;;)
3244 nigel 77 {
3245     c = *(++ptr);
3246 ph10 391 if (c == CHAR_BACKSLASH)
3247 ph10 205 {
3248 ph10 392 if (ptr[1] == CHAR_E)
3249 ph10 391 ptr++;
3250 ph10 392 else if (strncmp((const char *)ptr+1,
3251     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3252 ph10 391 ptr += 3;
3253 ph10 392 else
3254 ph10 391 break;
3255 ph10 205 }
3256 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3257 ph10 205 negate_class = TRUE;
3258     else break;
3259 ph10 208 }
3260 ph10 345
3261     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3262     an initial ']' is taken as a data character -- the code below handles
3263 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3264     [^] must match any character, so generate OP_ALLANY. */
3265 ph10 345
3266 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3267 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3268 ph10 341 {
3269     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3270     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3271     zerofirstbyte = firstbyte;
3272     break;
3273 ph10 345 }
3274 nigel 77
3275 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3276     negation flag at the end, so that support for characters > 255 works
3277 ph10 264 correctly (they are all included in the class). */
3278    
3279     should_flip_negation = FALSE;
3280    
3281 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3282 nigel 93 of just a single character (as long as it's < 256). However, for higher
3283     valued UTF-8 characters, we don't yet do any optimization. */
3284 nigel 77
3285     class_charcount = 0;
3286     class_lastchar = -1;
3287    
3288 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3289     temporary bit of memory, in case the class contains only 1 character (less
3290     than 256), because in that case the compiled code doesn't use the bit map.
3291     */
3292    
3293     memset(classbits, 0, 32 * sizeof(uschar));
3294    
3295 nigel 77 #ifdef SUPPORT_UTF8
3296     class_utf8 = FALSE; /* No chars >= 256 */
3297 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3298 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3299 nigel 77 #endif
3300    
3301     /* Process characters until ] is reached. By writing this as a "do" it
3302 nigel 93 means that an initial ] is taken as a data character. At the start of the
3303     loop, c contains the first byte of the character. */
3304 nigel 77
3305 nigel 93 if (c != 0) do
3306 nigel 77 {
3307 nigel 93 const uschar *oldptr;
3308    
3309 nigel 77 #ifdef SUPPORT_UTF8
3310     if (utf8 && c > 127)
3311     { /* Braces are required because the */
3312     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3313     }
3314 ph10 535
3315 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3316 ph10 309 data and reset the pointer. This is so that very large classes that
3317 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3318 ph10 309 (which is on the stack). */
3319    
3320 ph10 300 if (lengthptr != NULL)
3321     {
3322     *lengthptr += class_utf8data - class_utf8data_base;
3323 ph10 309 class_utf8data = class_utf8data_base;
3324     }
3325    
3326 nigel 77 #endif
3327    
3328     /* Inside \Q...\E everything is literal except \E */
3329    
3330     if (inescq)
3331     {
3332 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3333 nigel 77 {
3334 nigel 93 inescq = FALSE; /* Reset literal state */
3335     ptr++; /* Skip the 'E' */
3336     continue; /* Carry on with next */
3337 nigel 77 }
3338 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3339 nigel 77 }
3340    
3341     /* Handle POSIX class names. Perl allows a negation extension of the
3342     form [:^name:]. A square bracket that doesn't match the syntax is
3343     treated as a literal. We also recognize the POSIX constructions
3344     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3345     5.6 and 5.8 do. */
3346    
3347 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3348 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3349 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3350 nigel 77 {
3351     BOOL local_negate = FALSE;
3352 nigel 87 int posix_class, taboffset, tabopt;
3353 nigel 77 register const uschar *cbits = cd->cbits;
3354 nigel 87 uschar pbits[32];
3355 nigel 77
3356 ph10 391 if (ptr[1] != CHAR_COLON)
3357 nigel 77 {
3358     *errorcodeptr = ERR31;
3359     goto FAILED;
3360     }
3361    
3362     ptr += 2;
3363 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3364 nigel 77 {
3365     local_negate = TRUE;
3366 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3367 nigel 77 ptr++;
3368     }
3369    
3370 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3371 nigel 77 if (posix_class < 0)
3372     {
3373     *errorcodeptr = ERR30;
3374     goto FAILED;
3375     }
3376    
3377     /* If matching is caseless, upper and lower are converted to
3378     alpha. This relies on the fact that the class table starts with
3379     alpha, lower, upper as the first 3 entries. */
3380    
3381     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3382     posix_class = 0;
3383 ph10 535
3384     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3385 ph10 518 different escape sequences that use Unicode properties. */
3386 ph10 535
3387 ph10 518 #ifdef SUPPORT_UCP
3388     if ((options & PCRE_UCP) != 0)
3389     {
3390     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3391     if (posix_substitutes[pc] != NULL)
3392     {
3393 ph10 535 nestptr = tempptr + 1;
3394 ph10 518 ptr = posix_substitutes[pc] - 1;
3395 ph10 535 continue;
3396     }
3397     }
3398     #endif
3399 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3400     chunk of local store because we may be adding and subtracting from it,
3401     and we don't want to subtract bits that may be in the main map already.
3402     At the end we or the result into the bit map that is being built. */
3403 nigel 77
3404     posix_class *= 3;
3405 nigel 87
3406     /* Copy in the first table (always present) */
3407    
3408     memcpy(pbits, cbits + posix_class_maps[posix_class],
3409     32 * sizeof(uschar));
3410    
3411     /* If there is a second table, add or remove it as required. */
3412    
3413     taboffset = posix_class_maps[posix_class + 1];
3414     tabopt = posix_class_maps[posix_class + 2];
3415    
3416     if (taboffset >= 0)
3417 nigel 77 {
3418 nigel 87 if (tabopt >= 0)
3419     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3420 nigel 77 else
3421 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3422 nigel 77 }
3423    
3424 nigel 87 /* Now see if we need to remove any special characters. An option
3425     value of 1 removes vertical space and 2 removes underscore. */
3426    
3427     if (tabopt < 0) tabopt = -tabopt;
3428     if (tabopt == 1) pbits[1] &= ~0x3c;
3429     else if (tabopt == 2) pbits[11] &= 0x7f;
3430    
3431     /* Add the POSIX table or its complement into the main table that is
3432     being built and we are done. */
3433    
3434     if (local_negate)
3435     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3436     else
3437     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3438    
3439 nigel 77 ptr = tempptr + 1;
3440     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3441     continue; /* End of POSIX syntax handling */
3442     }
3443    
3444     /* Backslash may introduce a single character, or it may introduce one
3445 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3446 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3447     assume that other escapes have more than one character in them, so set
3448     class_charcount bigger than one. Unrecognized escapes fall through and
3449     are either treated as literal characters (by default), or are faulted if
3450     PCRE_EXTRA is set. */
3451 nigel 77
3452 ph10 391 if (c == CHAR_BACKSLASH)
3453 nigel 77 {
3454 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3455     if (*errorcodeptr != 0) goto FAILED;
3456 nigel 77
3457 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3458 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3459     {
3460 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3461 nigel 77 {
3462     ptr += 2; /* avoid empty string */
3463     }
3464     else inescq = TRUE;
3465     continue;
3466     }
3467 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3468 nigel 77
3469     if (c < 0)
3470     {
3471     register const uschar *cbits = cd->cbits;
3472     class_charcount += 2; /* Greater than 1 is what matters */
3473 nigel 93
3474 ph10 518 switch (-c)
3475 nigel 77 {
3476 ph10 518 #ifdef SUPPORT_UCP
3477     case ESC_du: /* These are the values given for \d etc */
3478     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3479     case ESC_wu: /* escape sequence with an appropriate \p */
3480     case ESC_WU: /* or \P to test Unicode properties instead */
3481     case ESC_su: /* of the default ASCII testing. */
3482     case ESC_SU:
3483     nestptr = ptr;
3484     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3485 ph10 535 class_charcount -= 2; /* Undo! */
3486 ph10 518 continue;
3487     #endif
3488 nigel 77 case ESC_d:
3489     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3490     continue;
3491    
3492     case ESC_D:
3493 ph10 286 should_flip_negation = TRUE;
3494 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3495     continue;
3496    
3497     case ESC_w:
3498     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3499     continue;
3500    
3501     case ESC_W:
3502 ph10 286 should_flip_negation = TRUE;
3503 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3504     continue;
3505    
3506     case ESC_s:
3507     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3508     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3509     continue;
3510    
3511     case ESC_S:
3512 ph10 286 should_flip_negation = TRUE;
3513 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3514     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3515     continue;
3516    
3517 ph10 518 case ESC_h:
3518 ph10 178 SETBIT(classbits, 0x09); /* HT */
3519     SETBIT(classbits, 0x20); /* SPACE */
3520 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3521 ph10 178 #ifdef SUPPORT_UTF8
3522     if (utf8)
3523 ph10 180 {
3524 ph10 178 class_utf8 = TRUE;
3525     *class_utf8data++ = XCL_SINGLE;
3526 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3527 ph10 178 *class_utf8data++ = XCL_SINGLE;
3528 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3529     *class_utf8data++ = XCL_RANGE;
3530     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3531     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3532 ph10 178 *class_utf8data++ = XCL_SINGLE;
3533 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3534 ph10 178 *class_utf8data++ = XCL_SINGLE;
3535 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3536 ph10 178 *class_utf8data++ = XCL_SINGLE;
3537 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3538     }
3539     #endif
3540     continue;
3541 nigel 93
3542 ph10 518 case ESC_H:
3543 ph10 178 for (c = 0; c < 32; c++)
3544     {
3545     int x = 0xff;
3546     switch (c)
3547 ph10 180 {
3548 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3549     case 0x20/8: x ^= 1 << (0x20%8); break;
3550     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3551     default: break;
3552     }
3553     classbits[c] |= x;
3554 ph10 180 }
3555    
3556 ph10 178 #ifdef SUPPORT_UTF8
3557     if (utf8)
3558 ph10 180 {
3559 ph10 178 class_utf8 = TRUE;
3560 ph10 180 *class_utf8data++ = XCL_RANGE;
3561     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3562     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3563     *class_utf8data++ = XCL_RANGE;
3564     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3565     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3566     *class_utf8data++ = XCL_RANGE;
3567     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3568     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3569     *class_utf8data++ = XCL_RANGE;
3570     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3571     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3572     *class_utf8data++ = XCL_RANGE;
3573     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3574     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3575     *class_utf8data++ = XCL_RANGE;
3576     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3577     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3578     *class_utf8data++ = XCL_RANGE;
3579     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3580     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3581     }
3582     #endif
3583     continue;
3584 ph10 178
3585 ph10 518 case ESC_v:
3586 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3587     SETBIT(classbits, 0x0b); /* VT */
3588 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3589     SETBIT(classbits, 0x0d); /* CR */
3590     SETBIT(classbits, 0x85); /* NEL */
3591 ph10 178 #ifdef SUPPORT_UTF8
3592     if (utf8)
3593 ph10 180 {
3594 ph10 178 class_utf8 = TRUE;
3595 ph10 180 *class_utf8data++ = XCL_RANGE;
3596     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3597     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3598     }
3599     #endif
3600     continue;
3601 ph10 178
3602 ph10 518 case ESC_V:
3603 ph10 178 for (c = 0; c < 32; c++)
3604     {
3605     int x = 0xff;
3606     switch (c)
3607 ph10 180 {
3608 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3609     x ^= 1 << (0x0b%8);
3610     x ^= 1 << (0x0c%8);
3611 ph10 180 x ^= 1 << (0x0d%8);
3612 ph10 178 break;
3613     case 0x85/8: x ^= 1 << (0x85%8); break;
3614     default: break;
3615     }
3616     classbits[c] |= x;
3617 ph10 180 }
3618    
3619 ph10 178 #ifdef SUPPORT_UTF8
3620     if (utf8)
3621 ph10 180 {
3622 ph10 178 class_utf8 = TRUE;
3623 ph10 180 *class_utf8data++ = XCL_RANGE;
3624     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3625     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3626     *class_utf8data++ = XCL_RANGE;
3627     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3628     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3629     }
3630     #endif
3631     continue;
3632 ph10 178
3633 nigel 77 #ifdef SUPPORT_UCP
3634 ph10 518 case ESC_p:
3635     case ESC_P:
3636     {
3637     BOOL negated;
3638     int pdata;
3639     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3640     if (ptype < 0) goto FAILED;
3641     class_utf8 = TRUE;
3642     *class_utf8data++ = ((-c == ESC_p) != negated)?
3643     XCL_PROP : XCL_NOTPROP;
3644     *class_utf8data++ = ptype;
3645     *class_utf8data++ = pdata;
3646     class_charcount -= 2; /* Not a < 256 character */
3647     continue;
3648     }
3649 nigel 77 #endif
3650 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3651     strict mode. By default, for compatibility with Perl, they are
3652     treated as literals. */
3653 nigel 77
3654 ph10 518 default:
3655     if ((options & PCRE_EXTRA) != 0)
3656     {
3657     *errorcodeptr = ERR7;
3658     goto FAILED;
3659     }
3660     class_charcount -= 2; /* Undo the default count from above */
3661     c = *ptr; /* Get the final character and fall through */
3662     break;
3663 nigel 93 }
3664 nigel 77 }
3665    
3666     /* Fall through if we have a single character (c >= 0). This may be
3667 nigel 93 greater than 256 in UTF-8 mode. */
3668 nigel 77
3669     } /* End of backslash handling */
3670    
3671     /* A single character may be followed by '-' to form a range. However,
3672     Perl does not permit ']' to be the end of the range. A '-' character
3673 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3674     entirely. The code for handling \Q and \E is messy. */
3675 nigel 77
3676 nigel 93 CHECK_RANGE:
3677 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3678 nigel 77 {
3679 nigel 93 inescq = FALSE;
3680     ptr += 2;
3681     }
3682    
3683     oldptr = ptr;
3684 ph10 231
3685 ph10 230 /* Remember \r or \n */
3686 ph10 231
3687 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3688 ph10 231
3689 ph10 230 /* Check for range */
3690 nigel 93
3691 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3692 nigel 93 {
3693 nigel 77 int d;
3694     ptr += 2;
3695 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3696 nigel 77
3697 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3698     mode. */
3699    
3700 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3701 nigel 93 {
3702     ptr += 2;
3703 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3704 ph10 391 { ptr += 2; continue; }
3705 nigel 93 inescq = TRUE;
3706     break;
3707     }
3708    
3709 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3710 nigel 93 {
3711     ptr = oldptr;
3712     goto LONE_SINGLE_CHARACTER;
3713     }
3714    
3715 nigel 77 #ifdef SUPPORT_UTF8
3716     if (utf8)
3717     { /* Braces are required because the */
3718     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3719     }
3720     else
3721     #endif
3722     d = *ptr; /* Not UTF-8 mode */
3723    
3724     /* The second part of a range can be a single-character escape, but
3725     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3726     in such circumstances. */
3727    
3728 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3729 nigel 77 {
3730 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3731     if (*errorcodeptr != 0) goto FAILED;
3732 nigel 77
3733 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3734 nigel 77
3735     if (d < 0)
3736     {
3737 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3738 nigel 77 {
3739 nigel 93 ptr = oldptr;
3740 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3741     }
3742     }
3743     }
3744    
3745 nigel 93 /* Check that the two values are in the correct order. Optimize
3746     one-character ranges */
3747 nigel 77
3748 nigel 93 if (d < c)
3749     {
3750     *errorcodeptr = ERR8;
3751     goto FAILED;
3752     }
3753    
3754 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3755    
3756 ph10 230 /* Remember \r or \n */
3757 ph10 231
3758 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3759 ph10 231
3760 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3761     matching, we have to use an XCLASS with extra data items. Caseless
3762     matching for characters > 127 is available only if UCP support is
3763     available. */
3764    
3765     #ifdef SUPPORT_UTF8
3766     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3767     {
3768     class_utf8 = TRUE;
3769    
3770     /* With UCP support, we can find the other case equivalents of
3771     the relevant characters. There may be several ranges. Optimize how
3772     they fit with the basic range. */
3773    
3774     #ifdef SUPPORT_UCP
3775     if ((options & PCRE_CASELESS) != 0)
3776     {
3777 nigel 93 unsigned int occ, ocd;
3778     unsigned int cc = c;
3779     unsigned int origd = d;
3780 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3781     {
3782 ph10 180 if (occ >= (unsigned int)c &&
3783     ocd <= (unsigned int)d)
3784 ph10 176 continue; /* Skip embedded ranges */
3785 nigel 77
3786 ph10 180 if (occ < (unsigned int)c &&
3787 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3788 nigel 77 { /* if there is overlap, */
3789     c = occ; /* noting that if occ < c */
3790     continue; /* we can't have ocd > d */
3791     } /* because a subrange is */
3792 ph10 180 if (ocd > (unsigned int)d &&
3793 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3794 nigel 77 { /* the basic range. */
3795     d = ocd;
3796     continue;
3797     }
3798    
3799     if (occ == ocd)
3800     {
3801     *class_utf8data++ = XCL_SINGLE;
3802     }
3803     else
3804     {
3805     *class_utf8data++ = XCL_RANGE;
3806     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3807     }
3808     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3809     }
3810     }
3811     #endif /* SUPPORT_UCP */
3812    
3813     /* Now record the original range, possibly modified for UCP caseless
3814     overlapping ranges. */
3815    
3816     *class_utf8data++ = XCL_RANGE;
3817     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3818     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3819    
3820     /* With UCP support, we are done. Without UCP support, there is no
3821     caseless matching for UTF-8 characters > 127; we can use the bit map
3822     for the smaller ones. */
3823    
3824     #ifdef SUPPORT_UCP
3825     continue; /* With next character in the class */
3826     #else
3827     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3828    
3829     /* Adjust upper limit and fall through to set up the map */
3830    
3831     d = 127;
3832    
3833     #endif /* SUPPORT_UCP */
3834     }
3835     #endif /* SUPPORT_UTF8 */
3836    
3837     /* We use the bit map for all cases when not in UTF-8 mode; else
3838     ranges that lie entirely within 0-127 when there is UCP support; else
3839     for partial ranges without UCP support. */
3840    
3841 nigel 93 class_charcount += d - c + 1;
3842     class_lastchar = d;
3843    
3844     /* We can save a bit of time by skipping this in the pre-compile. */
3845    
3846     if (lengthptr == NULL) for (; c <= d; c++)
3847 nigel 77 {
3848     classbits[c/8] |= (1 << (c&7));
3849     if ((options & PCRE_CASELESS) != 0)
3850     {
3851     int uc = cd->fcc[c]; /* flip case */
3852     classbits[uc/8] |= (1 << (uc&7));
3853     }
3854     }
3855    
3856     continue; /* Go get the next char in the class */
3857     }
3858    
3859     /* Handle a lone single character - we can get here for a normal
3860     non-escape char, or after \ that introduces a single character or for an
3861     apparent range that isn't. */
3862    
3863     LONE_SINGLE_CHARACTER:
3864 ph10 231
3865 nigel 77 /* Handle a character that cannot go in the bit map */
3866    
3867     #ifdef SUPPORT_UTF8
3868     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3869     {
3870     class_utf8 = TRUE;
3871     *class_utf8data++ = XCL_SINGLE;
3872     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3873    
3874     #ifdef SUPPORT_UCP
3875     if ((options & PCRE_CASELESS) != 0)
3876     {
3877 nigel 93 unsigned int othercase;
3878 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3879 nigel 77 {
3880     *class_utf8data++ = XCL_SINGLE;
3881     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3882     }
3883     }
3884     #endif /* SUPPORT_UCP */
3885    
3886     }
3887     else
3888     #endif /* SUPPORT_UTF8 */
3889    
3890     /* Handle a single-byte character */
3891     {