/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 773 - (hide annotations) (download)
Wed Nov 30 18:10:27 2011 UTC (2 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 255428 byte(s)
Expand compile workspace for very many forward references. This ups the limit 
by a factor of 100.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: sets bit (b % 8) of
byte (b / 8) in the byte array a. Both arguments and the whole expansion are
parenthesized so that expression arguments such as "map + n" or "c + 1" index
and shift as intended. Note that b is evaluated twice, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91 ph10 773 is 4 there is plenty of room for most patterns. However, the memory can get
92     filled up by repetitions of forward references, for example patterns like
93     /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
94     that the workspace is expanded using malloc() in this situation. The value
95     below is therefore a minimum, and we put a maximum on it for safety. The
96     minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
97     kicks in at the same number of forward references in all cases. */
98 nigel 77
99 ph10 773 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
100     #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
101 nigel 77
102 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
103 ph10 505 overrun before it actually does run off the end of the data block. */
104 nigel 93
105 ph10 773 #define WORK_SIZE_SAFETY_MARGIN (100)
106 ph10 505
107    
108 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
109     are simple data values; negative values are for special things like \d and so
110     on. Zero means further processing is needed (for things like \x), or the escape
111     is invalid. */
112    
113 ph10 391 #ifndef EBCDIC
114    
115     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
116 ph10 392 in UTF-8 mode. */
117 ph10 391
118 ph10 392 static const short int escapes[] = {
119 ph10 391 0, 0,
120     0, 0,
121 ph10 392 0, 0,
122     0, 0,
123     0, 0,
124 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
125 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
126 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
127 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
128     -ESC_B, -ESC_C,
129     -ESC_D, -ESC_E,
130     0, -ESC_G,
131     -ESC_H, 0,
132     0, -ESC_K,
133 ph10 391 0, 0,
134 ph10 514 -ESC_N, 0,
135 ph10 391 -ESC_P, -ESC_Q,
136     -ESC_R, -ESC_S,
137 ph10 392 0, 0,
138     -ESC_V, -ESC_W,
139     -ESC_X, 0,
140     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
141 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
142 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
143 ph10 391 CHAR_GRAVE_ACCENT, 7,
144 ph10 392 -ESC_b, 0,
145     -ESC_d, ESC_e,
146 ph10 391 ESC_f, 0,
147     -ESC_h, 0,
148 ph10 392 0, -ESC_k,
149 ph10 391 0, 0,
150     ESC_n, 0,
151 ph10 392 -ESC_p, 0,
152     ESC_r, -ESC_s,
153 ph10 391 ESC_tee, 0,
154 ph10 392 -ESC_v, -ESC_w,
155     0, 0,
156 ph10 391 -ESC_z
157 nigel 77 };
158    
159 ph10 392 #else
160 ph10 391
161     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
162    
163 nigel 77 static const short int escapes[] = {
164     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
165     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
166     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
167     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
168     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
169     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
170     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
171     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
172 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
173 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
174 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
175 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
176 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
177     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
178     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
179     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
180 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
181 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
182 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
183 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
184 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
185     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
186     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
187     };
188     #endif
189    
190    
191 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
192     searched linearly. Put all the names into a single string, in order to reduce
193 ph10 392 the number of relocations when a shared library is dynamically linked. The
194     string is built from string macros so that it works in UTF-8 mode on EBCDIC
195 ph10 391 platforms. */
196 ph10 210
/* One entry in the table of special "verbs" such as (*PRUNE). */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
202 ph10 210
203 ph10 240 static const char verbnames[] =
204 ph10 510 "\0" /* Empty name is a shorthand for MARK */
205 ph10 512 STRING_MARK0
206 ph10 391 STRING_ACCEPT0
207     STRING_COMMIT0
208     STRING_F0
209     STRING_FAIL0
210     STRING_PRUNE0
211     STRING_SKIP0
212     STRING_THEN;
213 ph10 240
214 ph10 327 static const verbitem verbs[] = {
215 ph10 510 { 0, -1, OP_MARK },
216 ph10 512 { 4, -1, OP_MARK },
217 ph10 510 { 6, OP_ACCEPT, -1 },
218     { 6, OP_COMMIT, -1 },
219     { 1, OP_FAIL, -1 },
220     { 4, OP_FAIL, -1 },
221     { 5, OP_PRUNE, OP_PRUNE_ARG },
222     { 4, OP_SKIP, OP_SKIP_ARG },
223     { 4, OP_THEN, OP_THEN_ARG }
224 ph10 210 };
225    
226 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
227 ph10 210
228    
229 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
230     now all in a single string, to reduce the number of relocations when a shared
231 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
232     length entry. The first three must be alpha, lower, upper, as this is assumed
233     for handling case independence. */
234 nigel 77
235 ph10 240 static const char posix_names[] =
236 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
237     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
238 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
239     STRING_word0 STRING_xdigit;
240 nigel 77
241     static const uschar posix_name_lengths[] = {
242     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
243    
244 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
245     base map, with an optional addition or removal of another map. Then, for some
246     classes, there is some additional tweaking: for [:blank:] the vertical space
247     characters are removed, and for [:alpha:] and [:alnum:] the underscore
248     character is removed. The triples in the table consist of the base map offset,
249     second map offset or -1 if no second map, and a non-negative value for map
250     addition or a negative value for map subtraction (if there are two maps). The
251     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
252     remove vertical space characters, 2 => remove underscore. */
253 nigel 77
254     static const int posix_class_maps[] = {
255 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
256     cbit_lower, -1, 0, /* lower */
257     cbit_upper, -1, 0, /* upper */
258     cbit_word, -1, 2, /* alnum - word without underscore */
259     cbit_print, cbit_cntrl, 0, /* ascii */
260     cbit_space, -1, 1, /* blank - a GNU extension */
261     cbit_cntrl, -1, 0, /* cntrl */
262     cbit_digit, -1, 0, /* digit */
263     cbit_graph, -1, 0, /* graph */
264     cbit_print, -1, 0, /* print */
265     cbit_punct, -1, 0, /* punct */
266     cbit_space, -1, 0, /* space */
267     cbit_word, -1, 0, /* word - a Perl extension */
268     cbit_xdigit,-1, 0 /* xdigit */
269 nigel 77 };
270    
271 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
272     substitutes must be in the order of the names, defined above, and there are
273 ph10 518 both positive and negative cases. NULL means no substitute. */
274 nigel 77
275 ph10 518 #ifdef SUPPORT_UCP
276     static const uschar *substitutes[] = {
277     (uschar *)"\\P{Nd}", /* \D */
278     (uschar *)"\\p{Nd}", /* \d */
279     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
280     (uschar *)"\\p{Xsp}", /* \s */
281     (uschar *)"\\P{Xwd}", /* \W */
282 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
283 ph10 518 };
284 ph10 535
285 ph10 518 static const uschar *posix_substitutes[] = {
286     (uschar *)"\\p{L}", /* alpha */
287 ph10 535 (uschar *)"\\p{Ll}", /* lower */
288     (uschar *)"\\p{Lu}", /* upper */
289     (uschar *)"\\p{Xan}", /* alnum */
290 ph10 518 NULL, /* ascii */
291     (uschar *)"\\h", /* blank */
292     NULL, /* cntrl */
293     (uschar *)"\\p{Nd}", /* digit */
294     NULL, /* graph */
295     NULL, /* print */
296     NULL, /* punct */
297     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
298     (uschar *)"\\p{Xwd}", /* word */
299 ph10 535 NULL, /* xdigit */
300 ph10 518 /* Negated cases */
301     (uschar *)"\\P{L}", /* ^alpha */
302 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
303     (uschar *)"\\P{Lu}", /* ^upper */
304     (uschar *)"\\P{Xan}", /* ^alnum */
305 ph10 518 NULL, /* ^ascii */
306     (uschar *)"\\H", /* ^blank */
307     NULL, /* ^cntrl */
308     (uschar *)"\\P{Nd}", /* ^digit */
309     NULL, /* ^graph */
310     NULL, /* ^print */
311     NULL, /* ^punct */
312     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
313     (uschar *)"\\P{Xwd}", /* ^word */
314 ph10 535 NULL /* ^xdigit */
315 ph10 518 };
316     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
317 ph10 535 #endif
318 ph10 518
/* Stringification helpers. STRING() turns its argument into a string literal
exactly as written; XSTRING() macro-expands its argument first, so that
XSTRING(MAX_NAME_SIZE) yields the numeric value as text rather than the macro
name. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
321    
322 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
323 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
324     they are documented. Always add a new error instead. Messages marked DEAD below
325 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
326     the number of relocations needed when a shared library is loaded dynamically,
327     it is now one long string. We cannot use a table of offsets, because the
328     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
329     simply count through to the one we want - this isn't a performance issue
330 ph10 507 because these strings are used only when there is a compilation error.
331 nigel 77
332 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
333     substring, so that the whole string ends with \0\0, which can be detected when
334 ph10 499 counting through. */
335    
336 ph10 240 static const char error_texts[] =
337     "no error\0"
338     "\\ at end of pattern\0"
339     "\\c at end of pattern\0"
340     "unrecognized character follows \\\0"
341     "numbers out of order in {} quantifier\0"
342 nigel 77 /* 5 */
343 ph10 240 "number too big in {} quantifier\0"
344     "missing terminating ] for character class\0"
345     "invalid escape sequence in character class\0"
346     "range out of order in character class\0"
347     "nothing to repeat\0"
348 nigel 77 /* 10 */
349 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
350     "internal error: unexpected repeat\0"
351 ph10 269 "unrecognized character after (? or (?-\0"
352 ph10 240 "POSIX named classes are supported only within a class\0"
353     "missing )\0"
354 nigel 77 /* 15 */
355 ph10 240 "reference to non-existent subpattern\0"
356     "erroffset passed as NULL\0"
357     "unknown option bit(s) set\0"
358     "missing ) after comment\0"
359     "parentheses nested too deeply\0" /** DEAD **/
360 nigel 77 /* 20 */
361 ph10 240 "regular expression is too large\0"
362     "failed to get memory\0"
363     "unmatched parentheses\0"
364     "internal error: code overflow\0"
365     "unrecognized character after (?<\0"
366 nigel 77 /* 25 */
367 ph10 240 "lookbehind assertion is not fixed length\0"
368     "malformed number or name after (?(\0"
369     "conditional group contains more than two branches\0"
370     "assertion expected after (?(\0"
371     "(?R or (?[+-]digits must be followed by )\0"
372 nigel 77 /* 30 */
373 ph10 240 "unknown POSIX class name\0"
374     "POSIX collating elements are not supported\0"
375     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
376     "spare error\0" /** DEAD **/
377     "character value in \\x{...} sequence is too large\0"
378 nigel 77 /* 35 */
379 ph10 240 "invalid condition (?(0)\0"
380     "\\C not allowed in lookbehind assertion\0"
381 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
382 ph10 240 "number after (?C is > 255\0"
383     "closing ) for (?C expected\0"
384 nigel 77 /* 40 */
385 ph10 240 "recursive call could loop indefinitely\0"
386     "unrecognized character after (?P\0"
387     "syntax error in subpattern name (missing terminator)\0"
388     "two named subpatterns have the same name\0"
389     "invalid UTF-8 string\0"
390 nigel 77 /* 45 */
391 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
392     "malformed \\P or \\p sequence\0"
393     "unknown property name after \\P or \\p\0"
394     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
395     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
396 nigel 91 /* 50 */
397 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
398     "octal value is greater than \\377 (not in UTF-8 mode)\0"
399     "internal error: overran compiling workspace\0"
400     "internal error: previously-checked referenced subpattern not found\0"
401     "DEFINE group contains more than one branch\0"
402 nigel 93 /* 55 */
403 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
404 ph10 240 "inconsistent NEWLINE options\0"
405 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
406     "a numbered reference must not be zero\0"
407 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
408 ph10 211 /* 60 */
409 ph10 240 "(*VERB) not recognized\0"
410 ph10 268 "number is too big\0"
411 ph10 272 "subpattern name expected\0"
412 ph10 336 "digit expected after (?+\0"
413 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
414     /* 65 */
415 ph10 510 "different names for subpatterns of the same number are not allowed\0"
416 ph10 512 "(*MARK) must have an argument\0"
417 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
418 ph10 579 "\\c must be followed by an ASCII character\0"
419 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
420 ph10 747 /* 70 */
421     "internal error: unknown opcode in find_fixedlength()\0"
422 ph10 758 "\\N is not supported in a class\0"
423 ph10 760 "too many forward references\0"
424 ph10 510 ;
425 nigel 77
426     /* Table to identify digits and hex digits. This is used when compiling
427     patterns. Note that the tables in chartables are dependent on the locale, and
428     may mark arbitrary characters as digits - but the PCRE compiling code expects
429     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
430     a private table here. It costs 256 bytes, but it is a lot faster than doing
431     character value tests (at least in some simple cases I timed), and in some
432     applications one wants PCRE to compile efficiently as well as match
433     efficiently.
434    
435     For convenience, we use the same bit definitions as in chartables:
436    
437     0x04 decimal digit
438     0x08 hexadecimal digit
439    
440     Then we can use ctype_digit and ctype_xdigit in the code. */
441    
442 ph10 392 #ifndef EBCDIC
443 ph10 391
444 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
445 ph10 391 UTF-8 mode. */
446    
/* Private digit table indexed by character value. 0x04 marks a decimal digit
and 0x08 a hexadecimal digit, so '0'-'9' hold 0x0c, 'A'-'F' and 'a'-'f' hold
0x08, and every other entry is zero. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
481    
482 ph10 392 #else
483 ph10 391
484     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
485    
486 nigel 77 static const unsigned char digitab[] =
487     {
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
499 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
500 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
504     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
507     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
508     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
509     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
510     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
511     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
512     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
513     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
515     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
518     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
519     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
520    
521     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
522     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
523     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
524     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
526     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
527     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
528     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
529     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
530     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
531     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
532     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
533 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
534 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
535     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
536     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
537     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
538     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
539     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
540     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
541     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
542     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
543     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
544     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
545     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
546     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
547     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
548     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
549     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
550     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
551     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
552     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
553     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
554     #endif
555    
556    
557     /* Definition to allow mutual recursion */
558    
559     static BOOL
560 ph10 642 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
561     int *, int *, branch_chain *, compile_data *, int *);
562 nigel 77
563    
564    
565     /*************************************************
566 ph10 240 * Find an error text *
567     *************************************************/
568    
569 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
570     some of the text is of unknown length, we can't use a table of offsets.
571     Instead, just count through the strings. This is not a performance issue
572 ph10 240 because it happens only when there has been a compilation error.
573    
574     Argument: the error number
575     Returns: pointer to the error string
576     */
577    
578     static const char *
579     find_error_text(int n)
580     {
581     const char *s = error_texts;
582 ph10 507 for (; n > 0; n--)
583 ph10 499 {
584     while (*s++ != 0) {};
585     if (*s == 0) return "Error text not found (please report)";
586 ph10 507 }
587 ph10 240 return s;
588     }
589    
590    
591     /*************************************************
592 ph10 773 * Expand the workspace *
593     *************************************************/
594    
595     /* This function is called during the second compiling phase, if the number of
596     forward references fills the existing workspace, which is originally a block on
597     the stack. A larger block is obtained from malloc() unless the ultimate limit
598     has been reached or the increase will be rather small.
599    
600     Argument: pointer to the compile data block
601     Returns: 0 if all went well, else an error number
602     */
603    
604     static int
605     expand_workspace(compile_data *cd)
606     {
607     uschar *newspace;
608     int newsize = cd->workspace_size * 2;
609    
610     if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
611     if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
612     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
613     return ERR72;
614    
615     newspace = (pcre_malloc)(newsize);
616     if (newspace == NULL) return ERR21;
617    
618     memcpy(newspace, cd->start_workspace, cd->workspace_size);
619     cd->hwm = (uschar *)newspace + (cd->hwm - cd->start_workspace);
620     if (cd->workspace_size > COMPILE_WORK_SIZE)
621     (pcre_free)((void *)cd->start_workspace);
622     cd->start_workspace = newspace;
623     cd->workspace_size = newsize;
624     return 0;
625     }
626    
627    
628    
629     /*************************************************
630 ph10 640 * Check for counted repeat *
631     *************************************************/
632    
633     /* This function is called when a '{' is encountered in a place where it might
634     start a quantifier. It looks ahead to see if it really is a quantifier or not.
635     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
636     where the ddds are digits.
637    
638     Arguments:
639     p pointer to the first char after '{'
640    
641     Returns: TRUE or FALSE
642     */
643    
644     static BOOL
645     is_counted_repeat(const uschar *p)
646     {
647     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
648     while ((digitab[*p] & ctype_digit) != 0) p++;
649     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
650    
651     if (*p++ != CHAR_COMMA) return FALSE;
652     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
653    
654     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
655     while ((digitab[*p] & ctype_digit) != 0) p++;
656    
657     return (*p == CHAR_RIGHT_CURLY_BRACKET);
658     }
659    
660    
661    
662     /*************************************************
663 nigel 77 * Handle escapes *
664     *************************************************/
665    
666     /* This function is called when a \ has been encountered. It either returns a
667     positive value for a simple escape such as \n, or a negative value which
668 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
669     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
670     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
671     ptr is pointing at the \. On exit, it is on the final character of the escape
672     sequence.
673 nigel 77
674     Arguments:
675     ptrptr points to the pattern position pointer
676     errorcodeptr points to the errorcode variable
677     bracount number of previous extracting brackets
678     options the options bits
679     isclass TRUE if inside a character class
680    
681     Returns: zero or positive => a data character
682     negative => a special escape sequence
683 ph10 213 on error, errorcodeptr is set
684 nigel 77 */
685    
686     static int
687     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
688     int options, BOOL isclass)
689     {
690 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
691     const uschar *ptr = *ptrptr + 1;
692 nigel 77 int c, i;
693    
694 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
695     ptr--; /* Set pointer back to the last byte */
696    
697 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
698    
699     if (c == 0) *errorcodeptr = ERR1;
700    
701 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
702     in a table. A non-zero result is something that can be returned immediately.
703 nigel 77 Otherwise further processing may be required. */
704    
705 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
706     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
707     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
708 nigel 77
709 ph10 97 #else /* EBCDIC coding */
710 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
711 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
712     #endif
713    
714     /* Escapes that need further processing, or are illegal. */
715    
716     else
717     {
718     const uschar *oldptr;
719 nigel 93 BOOL braced, negated;
720    
721 nigel 77 switch (c)
722     {
723     /* A number of Perl escapes are not handled by PCRE. We give an explicit
724     error. */
725    
726 ph10 391 case CHAR_l:
727     case CHAR_L:
728 zherczeg 744 *errorcodeptr = ERR37;
729     break;
730    
731 ph10 391 case CHAR_u:
732 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
733     {
734     /* In JavaScript, \u must be followed by four hexadecimal numbers.
735     Otherwise it is a lowercase u letter. */
736     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
737     && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
738     {
739     c = 0;
740     for (i = 0; i < 4; ++i)
741     {
742     register int cc = *(++ptr);
743     #ifndef EBCDIC /* ASCII/UTF-8 coding */
744     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
745     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
746     #else /* EBCDIC coding */
747     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
748     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
749     #endif
750     }
751     }
752     }
753     else
754     *errorcodeptr = ERR37;
755     break;
756    
757 ph10 391 case CHAR_U:
758 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
759     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
760 nigel 77 break;
761    
762 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
763 ph10 640 class, \g must be followed by one of a number of specific things:
764 ph10 345
765 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
766     backreference. If negative, it is a relative backreference. This is a Perl
767     5.10 feature.
768 ph10 345
769 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
770     is part of Perl's movement towards a unified syntax for back references. As
771     this is synonymous with \k{name}, we fudge it up by pretending it really
772     was \k.
773 ph10 345
774     (3) For Oniguruma compatibility we also support \g followed by a name or a
775     number either in angle brackets or in single quotes. However, these are
776     (possibly recursive) subroutine calls, _not_ backreferences. Just return
777 ph10 333 the -ESC_g code (cf \k). */
778 nigel 93
779 ph10 391 case CHAR_g:
780 ph10 640 if (isclass) break;
781 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
782 ph10 333 {
783     c = -ESC_g;
784 ph10 345 break;
785     }
786 ph10 333
787     /* Handle the Perl-compatible cases */
788 ph10 345
789 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
790 nigel 93 {
791 ph10 171 const uschar *p;
792 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
793     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
794     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
795 ph10 171 {
796     c = -ESC_k;
797     break;
798 ph10 172 }
799 nigel 93 braced = TRUE;
800     ptr++;
801     }
802     else braced = FALSE;
803    
804 ph10 391 if (ptr[1] == CHAR_MINUS)
805 nigel 93 {
806     negated = TRUE;
807     ptr++;
808     }
809     else negated = FALSE;
810    
811     c = 0;
812     while ((digitab[ptr[1]] & ctype_digit) != 0)
813 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
814 ph10 220
815 ph10 333 if (c < 0) /* Integer overflow */
816 ph10 213 {
817     *errorcodeptr = ERR61;
818     break;
819 ph10 220 }
820 ph10 345
821 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
822 nigel 93 {
823     *errorcodeptr = ERR57;
824 ph10 213 break;
825 nigel 93 }
826 ph10 345
827 ph10 333 if (c == 0)
828     {
829     *errorcodeptr = ERR58;
830     break;
831 ph10 345 }
832 nigel 93
833     if (negated)
834     {
835     if (c > bracount)
836     {
837     *errorcodeptr = ERR15;
838 ph10 213 break;
839 nigel 93 }
840     c = bracount - (c - 1);
841     }
842    
843     c = -(ESC_REF + c);
844     break;
845    
846 nigel 77 /* The handling of escape sequences consisting of a string of digits
847     starting with one that is not zero is not straightforward. By experiment,
848     the way Perl works seems to be as follows:
849    
850     Outside a character class, the digits are read as a decimal number. If the
851     number is less than 10, or if there are that many previous extracting
852     left brackets, then it is a back reference. Otherwise, up to three octal
853     digits are read to form an escaped byte. Thus \123 is likely to be octal
854     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
855     value is greater than 377, the least significant 8 bits are taken. Inside a
856     character class, \ followed by a digit is always an octal number. */
857    
858 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
859     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
860 nigel 77
861     if (!isclass)
862     {
863     oldptr = ptr;
864 ph10 391 c -= CHAR_0;
865 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
866 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
867 ph10 333 if (c < 0) /* Integer overflow */
868 ph10 213 {
869     *errorcodeptr = ERR61;
870 ph10 220 break;
871     }
872 nigel 77 if (c < 10 || c <= bracount)
873     {
874     c = -(ESC_REF + c);
875     break;
876     }
877     ptr = oldptr; /* Put the pointer back and fall through */
878     }
879    
880     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
881     generates a binary zero byte and treats the digit as a following literal.
882     Thus we have to pull back the pointer by one. */
883    
884 ph10 391 if ((c = *ptr) >= CHAR_8)
885 nigel 77 {
886     ptr--;
887     c = 0;
888     break;
889     }
890    
891     /* \0 always starts an octal number, but we may drop through to here with a
892 nigel 91 larger first octal digit. The original code used just to take the least
893     significant 8 bits of octal numbers (I think this is what early Perls used
894     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
895     than 3 octal digits. */
896 nigel 77
897 ph10 391 case CHAR_0:
898     c -= CHAR_0;
899     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
900     c = c * 8 + *(++ptr) - CHAR_0;
901 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
902 nigel 77 break;
903    
904 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
905     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
906     treated as a data character. */
907 nigel 77
908 ph10 391 case CHAR_x:
909 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
910     {
911     /* In JavaScript, \x must be followed by two hexadecimal numbers.
912     Otherwise it is a lowercase x letter. */
913     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
914     {
915     c = 0;
916     for (i = 0; i < 2; ++i)
917     {
918     register int cc = *(++ptr);
919     #ifndef EBCDIC /* ASCII/UTF-8 coding */
920     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
921     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
922     #else /* EBCDIC coding */
923     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
924     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
925     #endif
926     }
927     }
928     break;
929     }
930    
931 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
932 nigel 77 {
933     const uschar *pt = ptr + 2;
934 nigel 87 int count = 0;
935    
936 nigel 77 c = 0;
937     while ((digitab[*pt] & ctype_xdigit) != 0)
938     {
939 nigel 87 register int cc = *pt++;
940 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
941 nigel 77 count++;
942 nigel 87
943 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
944     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
945     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
946 ph10 97 #else /* EBCDIC coding */
947 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
948     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
949 nigel 77 #endif
950     }
951 nigel 87
952 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
953 nigel 77 {
954 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
955 nigel 77 ptr = pt;
956     break;
957     }
958 nigel 87
959 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
960     recognize this construct; fall through to the normal \x handling. */
961     }
962    
963 nigel 87 /* Read just a single-byte hex-defined char */
964 nigel 77
965     c = 0;
966     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
967     {
968 ph10 391 int cc; /* Some compilers don't like */
969     cc = *(++ptr); /* ++ in initializers */
970     #ifndef EBCDIC /* ASCII/UTF-8 coding */
971     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
972     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
973 ph10 97 #else /* EBCDIC coding */
974 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
975     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
976 nigel 77 #endif
977     }
978     break;
979    
980 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
981 ph10 574 An error is given if the byte following \c is not an ASCII character. This
982     coding is ASCII-specific, but then the whole concept of \cx is
983 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
984 nigel 77
985 ph10 391 case CHAR_c:
986 nigel 77 c = *(++ptr);
987     if (c == 0)
988     {
989     *errorcodeptr = ERR2;
990 ph10 213 break;
991 nigel 77 }
992 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
993     if (c > 127) /* Excludes all non-ASCII in either mode */
994     {
995     *errorcodeptr = ERR68;
996 ph10 579 break;
997     }
998 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
999 nigel 77 c ^= 0x40;
1000 ph10 574 #else /* EBCDIC coding */
1001 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1002 nigel 77 c ^= 0xC0;
1003     #endif
1004     break;
1005    
1006     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1007 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1008     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1009     odd, but there used to be some cases other than the default, and there may
1010     be again in future, so I haven't "optimized" it. */
1011 nigel 77
1012     default:
1013     if ((options & PCRE_EXTRA) != 0) switch(c)
1014     {
1015     default:
1016     *errorcodeptr = ERR3;
1017     break;
1018     }
1019     break;
1020     }
1021     }
1022 ph10 518
1023     /* Perl supports \N{name} for character names, as well as plain \N for "not
1024 ph10 654 newline". PCRE does not support \N{name}. However, it does support
1025 ph10 640 quantification such as \N{2,3}. */
1026 nigel 77
1027 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1028     !is_counted_repeat(ptr+2))
1029 ph10 518 *errorcodeptr = ERR37;
1030 ph10 514
1031 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
1032    
1033     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1034     c -= (ESC_DU - ESC_D);
1035    
1036     /* Set the pointer to the final character before returning. */
1037    
1038 nigel 77 *ptrptr = ptr;
1039     return c;
1040     }
1041    
1042    
1043    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This is compiled only when PCRE has Unicode
property support. On entry the pattern pointer is on the P or p; on exit it is
on the last character of the escape sequence.

The name is either a single character, or is enclosed in {} where it may be
preceded by ^ to request negation. A braced name longer than the local buffer
allows (31 characters) is treated as malformed.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
                 (with the error code set: ERR46 for a malformed sequence,
                 ERR47 for an unrecognized name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi, errcode;

errcode = ERR46;                 /* Malformed, until the name has been read */

ch = *(++ptr);
if (ch == 0) goto FAILED;

*negptr = FALSE;

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  /* There is just one following character */

  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* A name in {}, optionally preceded by ^ for negation. */

  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto FAILED;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  /* Leaving the loop without seeing '}' means the name was too long. */

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto FAILED;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

errcode = ERR47;                 /* From here on, failure means "unknown" */
lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

FAILED:
*errorcodeptr = errcode;
*ptrptr = ptr;
return -1;
}
#endif
1133    
1134    
1135    
1136    
1137     /*************************************************
1138     * Read repeat counts *
1139     *************************************************/
1140    
1141     /* Read an item of the form {n,m} and return the values. This is called only
1142     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1143     so the syntax is guaranteed to be correct, but we need to check the values.
1144    
1145     Arguments:
1146     p pointer to first char after '{'
1147     minp pointer to int for min
1148     maxp pointer to int for max
1149     returned as -1 if no max
1150     errorcodeptr points to error code variable
1151    
1152     Returns: pointer to '}' on success;
1153     current ptr on error, with errorcodeptr set non-zero
1154     */
1155    
1156     static const uschar *
1157     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1158     {
1159     int min = 0;
1160     int max = -1;
1161    
1162 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1163     an integer overflow. */
1164    
1165 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1166 nigel 81 if (min < 0 || min > 65535)
1167     {
1168     *errorcodeptr = ERR5;
1169     return p;
1170     }
1171 nigel 77
1172 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1173     Also, max must not be less than min. */
1174    
1175 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1176 nigel 77 {
1177 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1178 nigel 77 {
1179     max = 0;
1180 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1181 nigel 81 if (max < 0 || max > 65535)
1182     {
1183     *errorcodeptr = ERR5;
1184     return p;
1185     }
1186 nigel 77 if (max < min)
1187     {
1188     *errorcodeptr = ERR4;
1189     return p;
1190     }
1191     }
1192     }
1193    
1194 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1195     '}'. */
1196 nigel 77
1197 nigel 81 *minp = min;
1198     *maxp = max;
1199 nigel 77 return p;
1200     }
1201    
1202    
1203    
1204     /*************************************************
1205 ph10 408 * Subroutine for finding forward reference *
1206 nigel 91 *************************************************/
1207    
1208 ph10 408 /* This recursive function is called only from find_parens() below. The
1209     top-level call starts at the beginning of the pattern. All other calls must
1210     start at a parenthesis. It scans along a pattern's text looking for capturing
1211 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1212     name it is given, it returns its number. Alternatively, if the name is NULL, it
1213 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1214     track of subpatterns that reset the capturing group numbers - the (?| feature.
1215 nigel 91
1216 ph10 578 This function was originally called only from the second pass, in which we know
1217     that if (?< or (?' or (?P< is encountered, the name will be correctly
1218     terminated because that is checked in the first pass. There is now one call to
1219     this function in the first pass, to check for a recursive back reference by
1220     name (so that we can make the whole group atomic). In this case, we need check
1221 ph10 579 only up to the current position in the pattern, and that is still OK because
1222     and previous occurrences will have been checked. To make this work, the test
1223     for "end of pattern" is a check against cd->end_pattern in the main loop,
1224 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1225 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1226     processing items within the loop are OK, because afterwards the main loop will
1227 ph10 578 terminate.)
1228    
1229 nigel 91 Arguments:
1230 ph10 408 ptrptr address of the current character pointer (updated)
1231 ph10 345 cd compile background data
1232 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1233     lorn name length, or subpattern number if name is NULL
1234     xmode TRUE if we are in /x mode
1235 ph10 579 utf8 TRUE if we are in UTF-8 mode
1236 ph10 411 count pointer to the current capturing subpattern number (updated)
1237 nigel 91
1238     Returns: the number of the named subpattern, or -1 if not found
1239     */
1240    
1241     static int
1242 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1243 ph10 556 BOOL xmode, BOOL utf8, int *count)
1244 nigel 91 {
1245 ph10 408 uschar *ptr = *ptrptr;
1246     int start_count = *count;
1247     int hwm_count = start_count;
1248     BOOL dup_parens = FALSE;
1249 nigel 93
1250 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1251 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1252    
1253     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1254     {
1255 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1256 ph10 545
1257 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1258 ph10 545
1259 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1260 ph10 408
1261 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1262 ph10 408 {
1263     *count += 1;
1264     if (name == NULL && *count == lorn) return *count;
1265 ph10 411 ptr++;
1266 ph10 408 }
1267    
1268 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1269     where the parenthesis numbers are duplicated. */
1270    
1271     else if (ptr[2] == CHAR_VERTICAL_LINE)
1272     {
1273     ptr += 3;
1274     dup_parens = TRUE;
1275     }
1276 ph10 545
1277 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1278    
1279     else if (ptr[2] == CHAR_NUMBER_SIGN)
1280     {
1281     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1282     goto FAIL_EXIT;
1283 ph10 545 }
1284 ph10 544
1285 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1286     is processed as normal. If not, skip to the closing parenthesis of the
1287 ph10 544 condition (there can't be any nested parens). */
1288 ph10 411
1289 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1290     {
1291 ph10 411 ptr += 2;
1292 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1293     {
1294     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1295 ph10 411 if (*ptr != 0) ptr++;
1296 ph10 408 }
1297 ph10 411 }
1298    
1299 ph10 544 /* Start with (? but not a condition. */
1300 ph10 408
1301     else
1302 ph10 411 {
1303 ph10 408 ptr += 2;
1304     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1305    
1306     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1307 ph10 411
1308 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1309     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1310     {
1311     int term;
1312     const uschar *thisname;
1313     *count += 1;
1314     if (name == NULL && *count == lorn) return *count;
1315     term = *ptr++;
1316     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1317     thisname = ptr;
1318     while (*ptr != term) ptr++;
1319     if (name != NULL && lorn == ptr - thisname &&
1320     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1321     return *count;
1322 ph10 461 term++;
1323 ph10 411 }
1324 ph10 408 }
1325 ph10 411 }
1326 ph10 408
1327 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1328 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1329     first-pass call when this value is temporarily adjusted to stop at the current
1330 ph10 578 position. So DO NOT change this to a test for binary zero. */
1331 ph10 408
1332 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1333 nigel 91 {
1334 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1335    
1336 ph10 391 if (*ptr == CHAR_BACKSLASH)
1337 nigel 93 {
1338 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1339 ph10 391 if (*ptr == CHAR_Q) for (;;)
1340 nigel 93 {
1341 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1342 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1343 ph10 391 if (*(++ptr) == CHAR_E) break;
1344 nigel 93 }
1345     continue;
1346     }
1347    
1348 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1349     are handled for real. If the first character is '^', skip it. Also, if the
1350     first few characters (either before or after ^) are \Q\E or \E we skip them
1351 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1352 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1353 nigel 93
1354 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1355 nigel 93 {
1356 ph10 340 BOOL negate_class = FALSE;
1357     for (;;)
1358     {
1359 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1360 ph10 340 {
1361 ph10 438 if (ptr[2] == CHAR_E)
1362     ptr+= 2;
1363     else if (strncmp((const char *)ptr+2,
1364 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1365 ph10 438 ptr += 4;
1366 ph10 392 else
1367 ph10 391 break;
1368 ph10 340 }
1369 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1370 ph10 461 {
1371 ph10 340 negate_class = TRUE;
1372 ph10 438 ptr++;
1373 ph10 461 }
1374 ph10 340 else break;
1375     }
1376    
1377     /* If the next character is ']', it is a data character that must be
1378 ph10 341 skipped, except in JavaScript compatibility mode. */
1379 ph10 345
1380 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1381 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1382 ph10 345 ptr++;
1383    
1384 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1385 nigel 93 {
1386 ph10 220 if (*ptr == 0) return -1;
1387 ph10 391 if (*ptr == CHAR_BACKSLASH)
1388 nigel 93 {
1389 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1390 ph10 391 if (*ptr == CHAR_Q) for (;;)
1391 nigel 93 {
1392 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1393 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1394 ph10 391 if (*(++ptr) == CHAR_E) break;
1395 nigel 93 }
1396     continue;
1397     }
1398     }
1399     continue;
1400     }
1401    
1402     /* Skip comments in /x mode */
1403    
1404 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1405 nigel 93 {
1406 ph10 579 ptr++;
1407 ph10 556 while (*ptr != 0)
1408     {
1409     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1410     ptr++;
1411 ph10 579 #ifdef SUPPORT_UTF8
1412 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1413     #endif
1414     }
1415 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1416 nigel 93 continue;
1417     }
1418    
1419 ph10 408 /* Check for the special metacharacters */
1420 ph10 411
1421 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1422 nigel 93 {
1423 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1424 ph10 408 if (rc > 0) return rc;
1425     if (*ptr == 0) goto FAIL_EXIT;
1426 nigel 93 }
1427 ph10 411
1428 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1429     {
1430     if (dup_parens && *count < hwm_count) *count = hwm_count;
1431 ph10 545 goto FAIL_EXIT;
1432 ph10 408 }
1433 ph10 411
1434     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1435 ph10 408 {
1436     if (*count > hwm_count) hwm_count = *count;
1437     *count = start_count;
1438 ph10 411 }
1439 ph10 408 }
1440 nigel 93
1441 ph10 408 FAIL_EXIT:
1442     *ptrptr = ptr;
1443     return -1;
1444     }
1445 nigel 93
1446    
1447    
1448    
1449 ph10 408 /*************************************************
1450     * Find forward referenced subpattern *
1451     *************************************************/
1452 nigel 93
1453 ph10 408 /* This function scans along a pattern's text looking for capturing
1454     subpatterns, and counting them. If it finds a named pattern that matches the
1455     name it is given, it returns its number. Alternatively, if the name is NULL, it
1456     returns when it reaches a given numbered subpattern. This is used for forward
1457     references to subpatterns. We used to be able to start this scan from the
1458     current compiling point, using the current count value from cd->bracount, and
1459     do it all in a single loop, but the addition of the possibility of duplicate
1460     subpattern numbers means that we have to scan from the very start, in order to
1461     take account of such duplicates, and to use a recursive function to keep track
1462     of the different types of group.
1463    
1464     Arguments:
1465     cd compile background data
1466     name name to seek, or NULL if seeking a numbered subpattern
1467     lorn name length, or subpattern number if name is NULL
1468     xmode TRUE if we are in /x mode
1469 ph10 579 utf8 TRUE if we are in UTF-8 mode
1470 ph10 408
1471     Returns: the number of the found subpattern, or -1 if not found
1472     */
1473    
1474     static int
1475 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1476     BOOL utf8)
1477 ph10 408 {
1478     uschar *ptr = (uschar *)cd->start_pattern;
1479     int count = 0;
1480     int rc;
1481    
1482     /* If the pattern does not start with an opening parenthesis, the first call
1483     to find_parens_sub() will scan right to the end (if necessary). However, if it
1484     does start with a parenthesis, find_parens_sub() will return when it hits the
1485     matching closing parens. That is why we have to have a loop. */
1486    
1487 ph10 411 for (;;)
1488     {
1489 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1490 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1491     }
1492    
1493 ph10 408 return rc;
1494 nigel 91 }
1495    
1496    
1497    
1498 ph10 408
1499 nigel 91 /*************************************************
1500 nigel 77 * Find first significant op code *
1501     *************************************************/
1502    
1503     /* This is called by several functions that scan a compiled expression looking
1504     for a fixed first character, or an anchoring op code etc. It skips over things
1505 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1506     forward and all backward assertions, and also the \b assertion; for others it
1507     does not.
1508 nigel 77
1509     Arguments:
1510     code pointer to the start of the group
1511     skipassert TRUE if certain assertions are to be skipped
1512    
1513     Returns: pointer to the first significant opcode
1514     */
1515    
1516     static const uschar*
1517 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1518 nigel 77 {
1519     for (;;)
1520     {
1521     switch ((int)*code)
1522     {
1523     case OP_ASSERT_NOT:
1524     case OP_ASSERTBACK:
1525     case OP_ASSERTBACK_NOT:
1526     if (!skipassert) return code;
1527     do code += GET(code, 1); while (*code == OP_ALT);
1528     code += _pcre_OP_lengths[*code];
1529     break;
1530    
1531     case OP_WORD_BOUNDARY:
1532     case OP_NOT_WORD_BOUNDARY:
1533     if (!skipassert) return code;
1534     /* Fall through */
1535    
1536     case OP_CALLOUT:
1537     case OP_CREF:
1538 ph10 459 case OP_NCREF:
1539 nigel 93 case OP_RREF:
1540 ph10 459 case OP_NRREF:
1541 nigel 93 case OP_DEF:
1542 nigel 77 code += _pcre_OP_lengths[*code];
1543     break;
1544    
1545     default:
1546     return code;
1547     }
1548     }
1549     /* Control never reaches here */
1550     }
1551    
1552    
1553    
1554    
1555     /*************************************************
1556 ph10 454 * Find the fixed length of a branch *
1557 nigel 77 *************************************************/
1558    
1559 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1560 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1561 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1562 ph10 454 temporarily terminated with OP_END when this function is called.
1563 nigel 77
1564 ph10 461 This function is called when a backward assertion is encountered, so that if it
1565     fails, the error message can point to the correct place in the pattern.
1566 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1567 ph10 461 because they can be forward references. We solve this by remembering this case
1568 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1569    
1570 nigel 77 Arguments:
1571     code points to the start of the pattern (the bracket)
1572 ph10 604 utf8 TRUE in UTF-8 mode
1573 ph10 461 atend TRUE if called when the pattern is complete
1574     cd the "compile data" structure
1575 nigel 77
1576 ph10 461 Returns: the fixed length,
1577 ph10 454 or -1 if there is no fixed length,
1578 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1579 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1580 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1581 nigel 77 */
1582    
1583     static int
1584 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1585 nigel 77 {
1586     int length = -1;
1587    
1588     register int branchlength = 0;
1589     register uschar *cc = code + 1 + LINK_SIZE;
1590    
1591     /* Scan along the opcodes for this branch. If we get to the end of the
1592     branch, check the length against that of the other branches. */
1593    
1594     for (;;)
1595     {
1596     int d;
1597 ph10 454 uschar *ce, *cs;
1598 nigel 77 register int op = *cc;
1599     switch (op)
1600     {
1601 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1602     OP_BRA (normal non-capturing bracket) because the other variants of these
1603     opcodes are all concerned with unlimited repeated groups, which of course
1604 ph10 747 are not of fixed length. */
1605 ph10 604
1606 nigel 93 case OP_CBRA:
1607 nigel 77 case OP_BRA:
1608     case OP_ONCE:
1609 ph10 733 case OP_ONCE_NC:
1610 nigel 77 case OP_COND:
1611 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1612 nigel 77 if (d < 0) return d;
1613     branchlength += d;
1614     do cc += GET(cc, 1); while (*cc == OP_ALT);
1615     cc += 1 + LINK_SIZE;
1616     break;
1617    
1618 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1619     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1620     an ALT. If it is END it's the end of the outer call. All can be handled by
1621     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1622     because they all imply an unlimited repeat. */
1623 nigel 77
1624     case OP_ALT:
1625     case OP_KET:
1626     case OP_END:
1627 ph10 747 case OP_ACCEPT:
1628     case OP_ASSERT_ACCEPT:
1629 nigel 77 if (length < 0) length = branchlength;
1630     else if (length != branchlength) return -1;
1631     if (*cc != OP_ALT) return length;
1632     cc += 1 + LINK_SIZE;
1633     branchlength = 0;
1634     break;
1635 ph10 461
1636 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1637     be OK. If the subroutine is a forward reference, we can't deal with
1638     it until the end of the pattern, so return -3. */
1639 ph10 461
1640 ph10 454 case OP_RECURSE:
1641     if (!atend) return -3;
1642     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1643     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1644     if (cc > cs && cc < ce) return -1; /* Recursion */
1645 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1646 ph10 461 if (d < 0) return d;
1647 ph10 454 branchlength += d;
1648     cc += 1 + LINK_SIZE;
1649 ph10 461 break;
1650 nigel 77
1651     /* Skip over assertive subpatterns */
1652    
1653     case OP_ASSERT:
1654     case OP_ASSERT_NOT:
1655     case OP_ASSERTBACK:
1656     case OP_ASSERTBACK_NOT:
1657     do cc += GET(cc, 1); while (*cc == OP_ALT);
1658     /* Fall through */
1659    
1660     /* Skip over things that don't match chars */
1661    
1662 ph10 747 case OP_MARK:
1663     case OP_PRUNE_ARG:
1664     case OP_SKIP_ARG:
1665     case OP_THEN_ARG:
1666     cc += cc[1] + _pcre_OP_lengths[*cc];
1667     break;
1668    
1669 nigel 77 case OP_CALLOUT:
1670     case OP_CIRC:
1671 ph10 602 case OP_CIRCM:
1672 ph10 747 case OP_CLOSE:
1673     case OP_COMMIT:
1674     case OP_CREF:
1675     case OP_DEF:
1676 nigel 77 case OP_DOLL:
1677 ph10 602 case OP_DOLLM:
1678 ph10 747 case OP_EOD:
1679     case OP_EODN:
1680     case OP_FAIL:
1681     case OP_NCREF:
1682     case OP_NRREF:
1683 nigel 77 case OP_NOT_WORD_BOUNDARY:
1684 ph10 747 case OP_PRUNE:
1685     case OP_REVERSE:
1686     case OP_RREF:
1687     case OP_SET_SOM:
1688     case OP_SKIP:
1689     case OP_SOD:
1690     case OP_SOM:
1691     case OP_THEN:
1692 nigel 77 case OP_WORD_BOUNDARY:
1693     cc += _pcre_OP_lengths[*cc];
1694     break;
1695    
1696     /* Handle literal characters */
1697    
1698     case OP_CHAR:
1699 ph10 602 case OP_CHARI:
1700 nigel 91 case OP_NOT:
1701 ph10 604 case OP_NOTI:
1702 nigel 77 branchlength++;
1703     cc += 2;
1704     #ifdef SUPPORT_UTF8
1705 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1706 nigel 77 #endif
1707     break;
1708    
1709     /* Handle exact repetitions. The count is already in characters, but we
1710     need to skip over a multibyte character in UTF8 mode. */
1711    
1712     case OP_EXACT:
1713 ph10 747 case OP_EXACTI:
1714     case OP_NOTEXACT:
1715     case OP_NOTEXACTI:
1716 nigel 77 branchlength += GET2(cc,1);
1717     cc += 4;
1718     #ifdef SUPPORT_UTF8
1719 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1720 nigel 77 #endif
1721     break;
1722    
1723     case OP_TYPEEXACT:
1724     branchlength += GET2(cc,1);
1725 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1726 nigel 77 cc += 4;
1727     break;
1728    
1729     /* Handle single-char matchers */
1730    
1731     case OP_PROP:
1732     case OP_NOTPROP:
1733 nigel 87 cc += 2;
1734 nigel 77 /* Fall through */
1735    
1736 ph10 747 case OP_HSPACE:
1737     case OP_VSPACE:
1738     case OP_NOT_HSPACE:
1739     case OP_NOT_VSPACE:
1740 nigel 77 case OP_NOT_DIGIT:
1741     case OP_DIGIT:
1742     case OP_NOT_WHITESPACE:
1743     case OP_WHITESPACE:
1744     case OP_NOT_WORDCHAR:
1745     case OP_WORDCHAR:
1746     case OP_ANY:
1747 ph10 342 case OP_ALLANY:
1748 nigel 77 branchlength++;
1749     cc++;
1750     break;
1751    
1752 ph10 754 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1753     otherwise \C is coded as OP_ALLANY. */
1754 nigel 77
1755     case OP_ANYBYTE:
1756     return -2;
1757    
1758     /* Check a class for variable quantification */
1759    
1760     #ifdef SUPPORT_UTF8
1761     case OP_XCLASS:
1762     cc += GET(cc, 1) - 33;
1763     /* Fall through */
1764     #endif
1765    
1766     case OP_CLASS:
1767     case OP_NCLASS:
1768     cc += 33;
1769    
1770     switch (*cc)
1771     {
1772 ph10 747 case OP_CRPLUS:
1773     case OP_CRMINPLUS:
1774 nigel 77 case OP_CRSTAR:
1775     case OP_CRMINSTAR:
1776     case OP_CRQUERY:
1777     case OP_CRMINQUERY:
1778     return -1;
1779    
1780     case OP_CRRANGE:
1781     case OP_CRMINRANGE:
1782     if (GET2(cc,1) != GET2(cc,3)) return -1;
1783     branchlength += GET2(cc,1);
1784     cc += 5;
1785     break;
1786    
1787     default:
1788     branchlength++;
1789     }
1790     break;
1791    
1792     /* Anything else is variable length */
1793    
1794 ph10 747 case OP_ANYNL:
1795     case OP_BRAMINZERO:
1796     case OP_BRAPOS:
1797     case OP_BRAPOSZERO:
1798     case OP_BRAZERO:
1799     case OP_CBRAPOS:
1800     case OP_EXTUNI:
1801     case OP_KETRMAX:
1802     case OP_KETRMIN:
1803     case OP_KETRPOS:
1804     case OP_MINPLUS:
1805     case OP_MINPLUSI:
1806     case OP_MINQUERY:
1807     case OP_MINQUERYI:
1808     case OP_MINSTAR:
1809     case OP_MINSTARI:
1810     case OP_MINUPTO:
1811     case OP_MINUPTOI:
1812     case OP_NOTMINPLUS:
1813     case OP_NOTMINPLUSI:
1814     case OP_NOTMINQUERY:
1815     case OP_NOTMINQUERYI:
1816     case OP_NOTMINSTAR:
1817     case OP_NOTMINSTARI:
1818     case OP_NOTMINUPTO:
1819     case OP_NOTMINUPTOI:
1820     case OP_NOTPLUS:
1821     case OP_NOTPLUSI:
1822     case OP_NOTPOSPLUS:
1823     case OP_NOTPOSPLUSI:
1824     case OP_NOTPOSQUERY:
1825     case OP_NOTPOSQUERYI:
1826     case OP_NOTPOSSTAR:
1827     case OP_NOTPOSSTARI:
1828     case OP_NOTPOSUPTO:
1829     case OP_NOTPOSUPTOI:
1830     case OP_NOTQUERY:
1831     case OP_NOTQUERYI:
1832     case OP_NOTSTAR:
1833     case OP_NOTSTARI:
1834     case OP_NOTUPTO:
1835     case OP_NOTUPTOI:
1836     case OP_PLUS:
1837     case OP_PLUSI:
1838     case OP_POSPLUS:
1839     case OP_POSPLUSI:
1840     case OP_POSQUERY:
1841     case OP_POSQUERYI:
1842     case OP_POSSTAR:
1843     case OP_POSSTARI:
1844     case OP_POSUPTO:
1845     case OP_POSUPTOI:
1846     case OP_QUERY:
1847     case OP_QUERYI:
1848     case OP_REF:
1849     case OP_REFI:
1850     case OP_SBRA:
1851     case OP_SBRAPOS:
1852     case OP_SCBRA:
1853     case OP_SCBRAPOS:
1854     case OP_SCOND:
1855     case OP_SKIPZERO:
1856     case OP_STAR:
1857     case OP_STARI:
1858     case OP_TYPEMINPLUS:
1859     case OP_TYPEMINQUERY:
1860     case OP_TYPEMINSTAR:
1861     case OP_TYPEMINUPTO:
1862     case OP_TYPEPLUS:
1863     case OP_TYPEPOSPLUS:
1864     case OP_TYPEPOSQUERY:
1865     case OP_TYPEPOSSTAR:
1866     case OP_TYPEPOSUPTO:
1867     case OP_TYPEQUERY:
1868     case OP_TYPESTAR:
1869     case OP_TYPEUPTO:
1870     case OP_UPTO:
1871     case OP_UPTOI:
1872     return -1;
1873    
1874     /* Catch unrecognized opcodes so that when new ones are added they
1875     are not forgotten, as has happened in the past. */
1876    
1877 nigel 77 default:
1878 ph10 747 return -4;
1879 nigel 77 }
1880     }
1881     /* Control never gets here */
1882     }
1883    
1884    
1885    
1886    
1887     /*************************************************
1888 ph10 454 * Scan compiled regex for specific bracket *
1889 nigel 77 *************************************************/
1890    
1891     /* This little function scans through a compiled pattern until it finds a
1892 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1893 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1894     so that it can be called from pcre_study() when finding the minimum matching
1895 ph10 455 length.
1896 nigel 77
1897     Arguments:
1898     code points to start of expression
1899     utf8 TRUE in UTF-8 mode
1900 ph10 454 number the required bracket number or negative to find a lookbehind
1901 nigel 77
1902     Returns: pointer to the opcode for the bracket, or NULL if not found
1903     */
1904    
1905 ph10 455 const uschar *
1906     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1907 nigel 77 {
1908     for (;;)
1909     {
1910     register int c = *code;
1911 ph10 618
1912 nigel 77 if (c == OP_END) return NULL;
1913 nigel 91
1914     /* XCLASS is used for classes that cannot be represented just by a bit
1915     map. This includes negated single high-valued characters. The length in
1916     the table is zero; the actual length is stored in the compiled code. */
1917    
1918     if (c == OP_XCLASS) code += GET(code, 1);
1919 ph10 461
1920 ph10 454 /* Handle recursion */
1921 ph10 461
1922 ph10 454 else if (c == OP_REVERSE)
1923     {
1924 ph10 461 if (number < 0) return (uschar *)code;
1925 ph10 454 code += _pcre_OP_lengths[c];
1926     }
1927 nigel 91
1928 nigel 93 /* Handle capturing bracket */
1929 nigel 91
1930 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1931     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1932 nigel 77 {
1933 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1934 nigel 77 if (n == number) return (uschar *)code;
1935 nigel 93 code += _pcre_OP_lengths[c];
1936 nigel 77 }
1937 nigel 91
1938 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1939     repeated character types, we have to test for \p and \P, which have an extra
1940 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1941 ph10 510 must add in its length. */
1942 nigel 91
1943 nigel 77 else
1944     {
1945 ph10 218 switch(c)
1946     {
1947     case OP_TYPESTAR:
1948     case OP_TYPEMINSTAR:
1949     case OP_TYPEPLUS:
1950     case OP_TYPEMINPLUS:
1951     case OP_TYPEQUERY:
1952     case OP_TYPEMINQUERY:
1953     case OP_TYPEPOSSTAR:
1954     case OP_TYPEPOSPLUS:
1955     case OP_TYPEPOSQUERY:
1956     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1957 ph10 220 break;
1958 ph10 221
1959     case OP_TYPEUPTO:
1960     case OP_TYPEMINUPTO:
1961     case OP_TYPEEXACT:
1962     case OP_TYPEPOSUPTO:
1963     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1964     break;
1965 ph10 512
1966 ph10 510 case OP_MARK:
1967     case OP_PRUNE_ARG:
1968     case OP_SKIP_ARG:
1969     code += code[1];
1970 ph10 512 break;
1971 ph10 550
1972     case OP_THEN_ARG:
1973 ph10 716 code += code[1];
1974 ph10 550 break;
1975 ph10 220 }
1976    
1977 ph10 218 /* Add in the fixed length from the table */
1978 ph10 220
1979 nigel 77 code += _pcre_OP_lengths[c];
1980 ph10 220
1981 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1982     a multi-byte character. The length in the table is a minimum, so we have to
1983     arrange to skip the extra bytes. */
1984 ph10 220
1985 ph10 107 #ifdef SUPPORT_UTF8
1986 nigel 77 if (utf8) switch(c)
1987     {
1988     case OP_CHAR:
1989 ph10 602 case OP_CHARI:
1990 nigel 77 case OP_EXACT:
1991 ph10 602 case OP_EXACTI:
1992 nigel 77 case OP_UPTO:
1993 ph10 602 case OP_UPTOI:
1994 nigel 77 case OP_MINUPTO:
1995 ph10 602 case OP_MINUPTOI:
1996 nigel 93 case OP_POSUPTO:
1997 ph10 602 case OP_POSUPTOI:
1998 nigel 77 case OP_STAR:
1999 ph10 602 case OP_STARI:
2000 nigel 77 case OP_MINSTAR:
2001 ph10 602 case OP_MINSTARI:
2002 nigel 93 case OP_POSSTAR:
2003 ph10 602 case OP_POSSTARI:
2004 nigel 77 case OP_PLUS:
2005 ph10 602 case OP_PLUSI:
2006 nigel 77 case OP_MINPLUS:
2007 ph10 602 case OP_MINPLUSI:
2008 nigel 93 case OP_POSPLUS:
2009 ph10 602 case OP_POSPLUSI:
2010 nigel 77 case OP_QUERY:
2011 ph10 602 case OP_QUERYI:
2012 nigel 77 case OP_MINQUERY:
2013 ph10 602 case OP_MINQUERYI:
2014 nigel 93 case OP_POSQUERY:
2015 ph10 602 case OP_POSQUERYI:
2016 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2017 nigel 77 break;
2018     }
2019 ph10 369 #else
2020     (void)(utf8); /* Keep compiler happy by referencing function argument */
2021 ph10 111 #endif
2022 nigel 77 }
2023     }
2024     }
2025    
2026    
2027    
2028     /*************************************************
2029     * Scan compiled regex for recursion reference *
2030     *************************************************/
2031    
2032     /* This little function scans through a compiled pattern until it finds an
2033     instance of OP_RECURSE.
2034    
2035     Arguments:
2036     code points to start of expression
2037     utf8 TRUE in UTF-8 mode
2038    
2039     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2040     */
2041    
2042     static const uschar *
2043     find_recurse(const uschar *code, BOOL utf8)
2044     {
2045     for (;;)
2046     {
2047     register int c = *code;
2048     if (c == OP_END) return NULL;
2049 nigel 91 if (c == OP_RECURSE) return code;
2050 ph10 220
2051 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2052     map. This includes negated single high-valued characters. The length in
2053     the table is zero; the actual length is stored in the compiled code. */
2054    
2055     if (c == OP_XCLASS) code += GET(code, 1);
2056    
2057 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2058     repeated character types, we have to test for \p and \P, which have an extra
2059 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2060 ph10 510 must add in its length. */
2061 nigel 91
2062 nigel 77 else
2063     {
2064 ph10 218 switch(c)
2065     {
2066     case OP_TYPESTAR:
2067     case OP_TYPEMINSTAR:
2068     case OP_TYPEPLUS:
2069     case OP_TYPEMINPLUS:
2070     case OP_TYPEQUERY:
2071     case OP_TYPEMINQUERY:
2072     case OP_TYPEPOSSTAR:
2073     case OP_TYPEPOSPLUS:
2074     case OP_TYPEPOSQUERY:
2075     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2076 ph10 220 break;
2077 ph10 221
2078     case OP_TYPEPOSUPTO:
2079     case OP_TYPEUPTO:
2080     case OP_TYPEMINUPTO:
2081     case OP_TYPEEXACT:
2082     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2083     break;
2084 ph10 512
2085 ph10 510 case OP_MARK:
2086     case OP_PRUNE_ARG:
2087     case OP_SKIP_ARG:
2088     code += code[1];
2089 ph10 512 break;
2090 ph10 550
2091     case OP_THEN_ARG:
2092 ph10 716 code += code[1];
2093 ph10 550 break;
2094 ph10 220 }
2095    
2096 ph10 218 /* Add in the fixed length from the table */
2097    
2098 nigel 77 code += _pcre_OP_lengths[c];
2099 ph10 220
2100 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2101     by a multi-byte character. The length in the table is a minimum, so we have
2102     to arrange to skip the extra bytes. */
2103 ph10 220
2104 ph10 107 #ifdef SUPPORT_UTF8
2105 nigel 77 if (utf8) switch(c)
2106     {
2107     case OP_CHAR:
2108 ph10 602 case OP_CHARI:
2109 nigel 77 case OP_EXACT:
2110 ph10 602 case OP_EXACTI:
2111 nigel 77 case OP_UPTO:
2112 ph10 602 case OP_UPTOI:
2113 nigel 77 case OP_MINUPTO:
2114 ph10 602 case OP_MINUPTOI:
2115 nigel 93 case OP_POSUPTO:
2116 ph10 602 case OP_POSUPTOI:
2117 nigel 77 case OP_STAR:
2118 ph10 602 case OP_STARI:
2119 nigel 77 case OP_MINSTAR:
2120 ph10 602 case OP_MINSTARI:
2121 nigel 93 case OP_POSSTAR:
2122 ph10 602 case OP_POSSTARI:
2123 nigel 77 case OP_PLUS:
2124 ph10 602 case OP_PLUSI:
2125 nigel 77 case OP_MINPLUS:
2126 ph10 602 case OP_MINPLUSI:
2127 nigel 93 case OP_POSPLUS:
2128 ph10 602 case OP_POSPLUSI:
2129 nigel 77 case OP_QUERY:
2130 ph10 602 case OP_QUERYI:
2131 nigel 77 case OP_MINQUERY:
2132 ph10 602 case OP_MINQUERYI:
2133 nigel 93 case OP_POSQUERY:
2134 ph10 602 case OP_POSQUERYI:
2135 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2136 nigel 77 break;
2137     }
2138 ph10 369 #else
2139     (void)(utf8); /* Keep compiler happy by referencing function argument */
2140 ph10 111 #endif
2141 nigel 77 }
2142     }
2143     }
2144    
2145    
2146    
2147     /*************************************************
2148     * Scan compiled branch for non-emptiness *
2149     *************************************************/
2150    
2151     /* This function scans through a branch of a compiled pattern to see whether it
2152 nigel 93 can match the empty string or not. It is called from could_be_empty()
2153     below and from compile_branch() when checking for an unlimited repeat of a
2154     group that can match nothing. Note that first_significant_code() skips over
2155 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2156     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2157     bracket whose current branch will already have been scanned.
2158 nigel 77
2159     Arguments:
2160     code points to start of search
2161     endcode points to where to stop
2162     utf8 TRUE if in UTF8 mode
2163 ph10 503 cd contains pointers to tables etc.
2164 nigel 77
2165     Returns: TRUE if what is matched could be empty
2166     */
2167    
2168     static BOOL
2169 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2170     compile_data *cd)
2171 nigel 77 {
2172     register int c;
2173 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2174 nigel 77 code < endcode;
2175 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2176 nigel 77 {
2177     const uschar *ccode;
2178    
2179     c = *code;
2180 ph10 507
2181 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2182 ph10 282 first_significant_code() with a TRUE final argument. */
2183 ph10 286
2184 ph10 282 if (c == OP_ASSERT)
2185 ph10 286 {
2186 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2187     c = *code;
2188     continue;
2189 ph10 286 }
2190 ph10 172
2191 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2192 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2193     forward reference subroutine call, we can't. To detect forward reference
2194 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2195     called only when doing the real compile, not during the pre-compile that
2196 ph10 624 measures the size of the compiled pattern. */
2197 ph10 507
2198 ph10 503 if (c == OP_RECURSE)
2199     {
2200 ph10 624 const uschar *scode;
2201     BOOL empty_branch;
2202 ph10 654
2203 ph10 624 /* Test for forward reference */
2204 ph10 654
2205 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2206 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2207 ph10 624
2208     /* Not a forward reference, test for completed backward reference */
2209 ph10 654
2210 ph10 624 empty_branch = FALSE;
2211     scode = cd->start_code + GET(code, 1);
2212 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2213 ph10 654
2214 ph10 624 /* Completed backwards reference */
2215 ph10 654
2216 ph10 503 do
2217     {
2218 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2219     {
2220     empty_branch = TRUE;
2221 ph10 507 break;
2222     }
2223 ph10 503 scode += GET(scode, 1);
2224     }
2225     while (*scode == OP_ALT);
2226 ph10 654
2227 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2228 ph10 503 continue;
2229 ph10 507 }
2230 ph10 170
2231 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2232    
2233     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2234     c == OP_BRAPOSZERO)
2235     {
2236     code += _pcre_OP_lengths[c];
2237     do code += GET(code, 1); while (*code == OP_ALT);
2238     c = *code;
2239     continue;
2240     }
2241    
2242     /* A nested group that is already marked as "could be empty" can just be
2243     skipped. */
2244    
2245     if (c == OP_SBRA || c == OP_SBRAPOS ||
2246     c == OP_SCBRA || c == OP_SCBRAPOS)
2247     {
2248     do code += GET(code, 1); while (*code == OP_ALT);
2249     c = *code;
2250     continue;
2251     }
2252    
2253 ph10 170 /* For other groups, scan the branches. */
2254 ph10 172
2255 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2256     c == OP_CBRA || c == OP_CBRAPOS ||
2257 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2258     c == OP_COND)
2259 nigel 77 {
2260     BOOL empty_branch;
2261     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2262 ph10 406
2263     /* If a conditional group has only one branch, there is a second, implied,
2264 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2265     Otherwise, scan the individual branches of the group. */
2266 ph10 406
2267 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2268 nigel 77 code += GET(code, 1);
2269 ph10 395 else
2270 ph10 406 {
2271 ph10 395 empty_branch = FALSE;
2272     do
2273     {
2274 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2275 ph10 395 empty_branch = TRUE;
2276     code += GET(code, 1);
2277     }
2278     while (*code == OP_ALT);
2279     if (!empty_branch) return FALSE; /* All branches are non-empty */
2280 nigel 77 }
2281 ph10 406
2282 ph10 172 c = *code;
2283 nigel 93 continue;
2284 nigel 77 }
2285    
2286 nigel 93 /* Handle the other opcodes */
2287    
2288     switch (c)
2289 nigel 77 {
2290 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2291     cannot be represented just by a bit map. This includes negated single
2292     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2293 ph10 220 actual length is stored in the compiled code, so we must update "code"
2294 ph10 216 here. */
2295 nigel 77
2296     #ifdef SUPPORT_UTF8
2297     case OP_XCLASS:
2298 ph10 216 ccode = code += GET(code, 1);
2299 nigel 77 goto CHECK_CLASS_REPEAT;
2300     #endif
2301    
2302     case OP_CLASS:
2303     case OP_NCLASS:
2304     ccode = code + 33;
2305    
2306     #ifdef SUPPORT_UTF8
2307     CHECK_CLASS_REPEAT:
2308     #endif
2309    
2310     switch (*ccode)
2311     {
2312     case OP_CRSTAR: /* These could be empty; continue */
2313     case OP_CRMINSTAR:
2314     case OP_CRQUERY:
2315     case OP_CRMINQUERY:
2316     break;
2317    
2318     default: /* Non-repeat => class must match */
2319     case OP_CRPLUS: /* These repeats aren't empty */
2320     case OP_CRMINPLUS:
2321     return FALSE;
2322    
2323     case OP_CRRANGE:
2324     case OP_CRMINRANGE:
2325     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2326     break;
2327     }
2328     break;
2329    
2330     /* Opcodes that must match a character */
2331    
2332     case OP_PROP:
2333     case OP_NOTPROP:
2334     case OP_EXTUNI:
2335     case OP_NOT_DIGIT:
2336     case OP_DIGIT:
2337     case OP_NOT_WHITESPACE:
2338     case OP_WHITESPACE:
2339     case OP_NOT_WORDCHAR:
2340     case OP_WORDCHAR:
2341     case OP_ANY:
2342 ph10 345 case OP_ALLANY:
2343 nigel 77 case OP_ANYBYTE:
2344     case OP_CHAR:
2345 ph10 602 case OP_CHARI:
2346 nigel 77 case OP_NOT:
2347 ph10 602 case OP_NOTI:
2348 nigel 77 case OP_PLUS:
2349     case OP_MINPLUS:
2350 nigel 93 case OP_POSPLUS:
2351 nigel 77 case OP_EXACT:
2352     case OP_NOTPLUS:
2353     case OP_NOTMINPLUS:
2354 nigel 93 case OP_NOTPOSPLUS:
2355 nigel 77 case OP_NOTEXACT:
2356     case OP_TYPEPLUS:
2357     case OP_TYPEMINPLUS:
2358 nigel 93 case OP_TYPEPOSPLUS:
2359 nigel 77 case OP_TYPEEXACT:
2360     return FALSE;
2361 ph10 227
2362     /* These are going to continue, as they may be empty, but we have to
2363     fudge the length for the \p and \P cases. */
2364    
2365 ph10 224 case OP_TYPESTAR:
2366     case OP_TYPEMINSTAR:
2367     case OP_TYPEPOSSTAR:
2368     case OP_TYPEQUERY:
2369     case OP_TYPEMINQUERY:
2370     case OP_TYPEPOSQUERY:
2371     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2372 ph10 227 break;
2373    
2374 ph10 224 /* Same for these */
2375 ph10 227
2376 ph10 224 case OP_TYPEUPTO:
2377     case OP_TYPEMINUPTO:
2378     case OP_TYPEPOSUPTO:
2379     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2380     break;
2381 nigel 77
2382     /* End of branch */
2383    
2384     case OP_KET:
2385     case OP_KETRMAX:
2386     case OP_KETRMIN:
2387 ph10 604 case OP_KETRPOS:
2388 nigel 77 case OP_ALT:
2389     return TRUE;
2390    
2391 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2392     MINUPTO, and POSUPTO may be followed by a multibyte character */
2393 nigel 77
2394     #ifdef SUPPORT_UTF8
2395     case OP_STAR:
2396 ph10 602 case OP_STARI:
2397 nigel 77 case OP_MINSTAR:
2398 ph10 602 case OP_MINSTARI:
2399 nigel 93 case OP_POSSTAR:
2400 ph10 602 case OP_POSSTARI:
2401 nigel 77 case OP_QUERY:
2402 ph10 602 case OP_QUERYI:
2403 nigel 77 case OP_MINQUERY:
2404 ph10 602 case OP_MINQUERYI:
2405 nigel 93 case OP_POSQUERY:
2406 ph10 602 case OP_POSQUERYI:
2407 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2408     break;
2409 ph10 461
2410 nigel 77 case OP_UPTO:
2411 ph10 602 case OP_UPTOI:
2412 nigel 77 case OP_MINUPTO:
2413 ph10 602 case OP_MINUPTOI:
2414 nigel 93 case OP_POSUPTO:
2415 ph10 602 case OP_POSUPTOI:
2416 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2417 nigel 77 break;
2418     #endif
2419 ph10 503
2420 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2421     string. */
2422    
2423     case OP_MARK:
2424     case OP_PRUNE_ARG:
2425     case OP_SKIP_ARG:
2426     code += code[1];
2427 ph10 512 break;
2428 ph10 510
2429 ph10 550 case OP_THEN_ARG:
2430 ph10 716 code += code[1];
2431 ph10 550 break;
2432    
2433 ph10 503 /* None of the remaining opcodes are required to match a character. */
2434 ph10 507
2435 ph10 503 default:
2436 ph10 507 break;
2437 nigel 77 }
2438     }
2439    
2440     return TRUE;
2441     }
2442    
2443    
2444    
2445     /*************************************************
2446     * Scan compiled regex for non-emptiness *
2447     *************************************************/
2448    
2449     /* This function is called to check for left recursive calls. We want to check
2450     the current branch of the current pattern to see if it could match the empty
2451     string. If it could, we must look outwards for branches at other levels,
2452     stopping when we pass beyond the bracket which is the subject of the recursion.
2453 ph10 654 This function is called only during the real compile, not during the
2454 ph10 624 pre-compile.
2455 nigel 77
2456     Arguments:
2457     code points to start of the recursion
2458     endcode points to where to stop (current RECURSE item)
2459     bcptr points to the chain of current (unclosed) branch starts
2460     utf8 TRUE if in UTF-8 mode
2461 ph10 507 cd pointers to tables etc
2462 nigel 77
2463     Returns: TRUE if what is matched could be empty
2464     */
2465    
2466     static BOOL
2467     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2468 ph10 503 BOOL utf8, compile_data *cd)
2469 nigel 77 {
2470 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2471 nigel 77 {
2472 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2473 ph10 475 return FALSE;
2474 nigel 77 bcptr = bcptr->outer;
2475     }
2476     return TRUE;
2477     }
2478    
2479    
2480    
2481     /*************************************************
2482     * Check for POSIX class syntax *
2483     *************************************************/
2484    
2485     /* This function is called when the sequence "[:" or "[." or "[=" is
2486 ph10 295 encountered in a character class. It checks whether this is followed by a
2487 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2488 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2489 nigel 77
2490 ph10 298 Originally, this function only recognized a sequence of letters between the
2491     terminators, but it seems that Perl recognizes any sequence of characters,
2492     though of course unknown POSIX names are subsequently rejected. Perl gives an
2493     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2494     didn't consider this to be a POSIX class. Likewise for [:1234:].
2495 ph10 295
2496 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2497     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2498     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2499     below handles the special case of \], but does not try to do any other escape
2500     processing. This makes it different from Perl for cases such as [:l\ower:]
2501 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2502 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2503 ph10 295 I think.
2504    
2505 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2506     It seems that the appearance of a nested POSIX class supersedes an apparent
2507     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2508 ph10 691 a digit.
2509 ph10 640
2510 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2511     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2512     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2513 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2514 ph10 661 names.
2515    
2516 ph10 295 Arguments:
2517 nigel 77 ptr pointer to the initial [
2518     endptr where to return the end pointer
2519    
2520     Returns: TRUE or FALSE
2521     */
2522    
2523     static BOOL
2524 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2525 nigel 77 {
2526     int terminator; /* Don't combine these lines; the Solaris cc */
2527     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2528 ph10 295 for (++ptr; *ptr != 0; ptr++)
2529 nigel 77 {
2530 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2531     ptr++;
2532 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2533 ph10 640 else
2534 ph10 298 {
2535 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2536 ph10 295 {
2537     *endptr = ptr;
2538     return TRUE;
2539 ph10 298 }
2540 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2541     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2542     ptr[1] == CHAR_EQUALS_SIGN) &&
2543     check_posix_syntax(ptr, endptr))
2544 ph10 654 return FALSE;
2545 ph10 298 }
2546     }
2547 nigel 77 return FALSE;
2548     }
2549    
2550    
2551    
2552    
2553     /*************************************************
2554     * Check POSIX class name *
2555     *************************************************/
2556    
2557     /* This function is called to check the name given in a POSIX-style class entry
2558     such as [:alnum:].
2559    
2560     Arguments:
2561     ptr points to the first letter
2562     len the length of the name
2563    
2564     Returns: a value representing the name, or -1 if unknown
2565     */
2566    
2567     static int
2568     check_posix_name(const uschar *ptr, int len)
2569     {
2570 ph10 240 const char *pn = posix_names;
2571 nigel 77 register int yield = 0;
2572     while (posix_name_lengths[yield] != 0)
2573     {
2574     if (len == posix_name_lengths[yield] &&
2575 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2576 ph10 243 pn += posix_name_lengths[yield] + 1;
2577 nigel 77 yield++;
2578     }
2579     return -1;
2580     }
2581    
2582    
2583     /*************************************************
2584     * Adjust OP_RECURSE items in repeated group *
2585     *************************************************/
2586    
2587     /* OP_RECURSE items contain an offset from the start of the regex to the group
2588     that is referenced. This means that groups can be replicated for fixed
2589     repetition simply by copying (because the recursion is allowed to refer to
2590     earlier groups that are outside the current group). However, when a group is
2591 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2592     inserted before it, after it has been compiled. This means that any OP_RECURSE
2593     items within it that refer to the group itself or any contained groups have to
2594     have their offsets adjusted. That is one of the jobs of this function. Before it
2595     is called, the partially compiled regex must be temporarily terminated with
2596     OP_END.
2597 nigel 77
2598 nigel 93 This function has been extended with the possibility of forward references for
2599     recursions and subroutine calls. It must also check the list of such references
2600     for the group we are dealing with. If it finds that one of the recursions in
2601     the current group is on this list, it adjusts the offset in the list, not the
2602     value in the reference (which is a group number).
2603    
2604 nigel 77 Arguments:
2605     group points to the start of the group
2606     adjust the amount by which the group is to be moved
2607     utf8 TRUE in UTF-8 mode
2608     cd contains pointers to tables etc.
2609 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2610 nigel 77
2611     Returns: nothing
2612     */
2613    
2614     static void
2615 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2616     uschar *save_hwm)
2617 nigel 77 {
2618     uschar *ptr = group;
2619 ph10 224
2620 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2621     {
2622 nigel 93 int offset;
2623     uschar *hc;
2624    
2625     /* See if this recursion is on the forward reference list. If so, adjust the
2626     reference. */
2627 ph10 345
2628 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2629     {
2630     offset = GET(hc, 0);
2631     if (cd->start_code + offset == ptr + 1)
2632     {
2633     PUT(hc, 0, offset + adjust);
2634     break;
2635     }
2636     }
2637    
2638     /* Otherwise, adjust the recursion offset if it's after the start of this
2639     group. */
2640    
2641     if (hc >= cd->hwm)
2642     {
2643     offset = GET(ptr, 1);
2644     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2645     }
2646    
2647 nigel 77 ptr += 1 + LINK_SIZE;
2648     }
2649     }
2650    
2651    
2652    
2653     /*************************************************
2654     * Insert an automatic callout point *
2655     *************************************************/
2656    
2657     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2658     callout points before each pattern item.
2659    
2660     Arguments:
2661     code current code pointer
2662     ptr current pattern pointer
2663     cd pointers to tables etc
2664    
2665     Returns: new code pointer
2666     */
2667    
2668     static uschar *
2669     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2670     {
2671     *code++ = OP_CALLOUT;
2672     *code++ = 255;
2673 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2674     PUT(code, LINK_SIZE, 0); /* Default length */
2675 nigel 77 return code + 2*LINK_SIZE;
2676     }
2677    
2678    
2679    
2680     /*************************************************
2681     * Complete a callout item *
2682     *************************************************/
2683    
2684     /* A callout item contains the length of the next item in the pattern, which
2685     we can't fill in till after we have reached the relevant point. This is used
2686     for both automatic and manual callouts.
2687    
2688     Arguments:
2689     previous_callout points to previous callout item
2690     ptr current pattern pointer
2691     cd pointers to tables etc
2692    
2693     Returns: nothing
2694     */
2695    
2696     static void
2697     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2698     {
2699 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2700 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2701     }
2702    
2703    
2704    
2705     #ifdef SUPPORT_UCP
2706     /*************************************************
2707     * Get othercase range *
2708     *************************************************/
2709    
2710     /* This function is passed the start and end of a class range, in UTF-8 mode
2711     with UCP support. It searches up the characters, looking for internal ranges of
2712     characters in the "other" case. Each call returns the next one, updating the
2713     start address.
2714    
2715     Arguments:
2716     cptr points to starting character value; updated
2717     d end value
2718     ocptr where to put start of othercase range
2719     odptr where to put end of othercase range
2720    
2721     Yield: TRUE when range returned; FALSE when no more
2722     */
2723    
2724     static BOOL
2725 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2726     unsigned int *odptr)
2727 nigel 77 {
2728 nigel 93 unsigned int c, othercase, next;
2729 nigel 77
2730     for (c = *cptr; c <= d; c++)
2731 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2732 nigel 77
2733     if (c > d) return FALSE;
2734    
2735     *ocptr = othercase;
2736     next = othercase + 1;
2737    
2738     for (++c; c <= d; c++)
2739     {
2740 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2741 nigel 77 next++;
2742     }
2743    
2744     *odptr = next - 1;
2745     *cptr = c;
2746    
2747     return TRUE;
2748     }
2749 ph10 532
2750    
2751    
2752     /*************************************************
2753     * Check a character and a property *
2754     *************************************************/
2755    
2756     /* This function is called by check_auto_possessive() when a property item
2757     is adjacent to a fixed character.
2758    
2759     Arguments:
2760     c the character
2761     ptype the property type
2762     pdata the data for the type
2763     negated TRUE if it's a negated property (\P or \p{^)
2764 ph10 535
2765 ph10 532 Returns: TRUE if auto-possessifying is OK
2766 ph10 535 */
2767 ph10 532
2768     static BOOL
2769     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2770     {
2771     const ucd_record *prop = GET_UCD(c);
2772     switch(ptype)
2773     {
2774     case PT_LAMP:
2775     return (prop->chartype == ucp_Lu ||
2776     prop->chartype == ucp_Ll ||
2777     prop->chartype == ucp_Lt) == negated;
2778    
2779     case PT_GC:
2780     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2781    
2782     case PT_PC:
2783     return (pdata == prop->chartype) == negated;
2784    
2785     case PT_SC:
2786     return (pdata == prop->script) == negated;
2787    
2788     /* These are specials */
2789    
2790     case PT_ALNUM:
2791     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2792     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2793    
2794     case PT_SPACE: /* Perl space */
2795     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2796     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2797     == negated;
2798    
2799     case PT_PXSPACE: /* POSIX space */
2800     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2801     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2802     c == CHAR_FF || c == CHAR_CR)
2803     == negated;
2804    
2805     case PT_WORD:
2806     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2807     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2808     c == CHAR_UNDERSCORE) == negated;
2809     }
2810 ph10 535 return FALSE;
2811 ph10 532 }
2812 nigel 77 #endif /* SUPPORT_UCP */
2813    
2814    
2815 nigel 93
2816 nigel 77 /*************************************************
2817 nigel 93 * Check if auto-possessifying is possible *
2818     *************************************************/
2819    
2820     /* This function is called for unlimited repeats of certain items, to see
2821     whether the next thing could possibly match the repeated item. If not, it makes
2822     sense to automatically possessify the repeated item.
2823    
2824     Arguments:
2825 ph10 532 previous pointer to the repeated opcode
2826 nigel 93 utf8 TRUE in UTF-8 mode
2827     ptr next character in pattern
2828     options options bits
2829     cd contains pointers to tables etc.
2830    
2831     Returns: TRUE if possessifying is wanted
2832     */
2833    
2834     static BOOL
2835 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2836 ph10 532 int options, compile_data *cd)
2837 nigel 93 {
2838 ph10 532 int c, next;
2839     int op_code = *previous++;
2840 nigel 93
2841     /* Skip whitespace and comments in extended mode */
2842    
2843     if ((options & PCRE_EXTENDED) != 0)
2844     {
2845     for (;;)
2846     {
2847     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2848 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2849 nigel 93 {
2850 ph10 579 ptr++;
2851 ph10 556 while (*ptr != 0)
2852     {
2853 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2854 ph10 556 ptr++;
2855 ph10 579 #ifdef SUPPORT_UTF8
2856 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2857     #endif
2858     }
2859 nigel 93 }
2860     else break;
2861     }
2862     }
2863    
2864     /* If the next item is one that we can handle, get its value. A non-negative
2865     value is a character, a negative value is an escape value. */
2866    
2867 ph10 391 if (*ptr == CHAR_BACKSLASH)
2868 nigel 93 {
2869     int temperrorcode = 0;
2870     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2871     if (temperrorcode != 0) return FALSE;
2872     ptr++; /* Point after the escape sequence */
2873     }
2874    
2875     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2876     {
2877     #ifdef SUPPORT_UTF8
2878     if (utf8) { GETCHARINC(next, ptr); } else
2879     #endif
2880     next = *ptr++;
2881     }
2882    
2883     else return FALSE;
2884    
2885     /* Skip whitespace and comments in extended mode */
2886    
2887     if ((options & PCRE_EXTENDED) != 0)
2888     {
2889     for (;;)
2890     {
2891     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2892 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2893 nigel 93 {
2894 ph10 579 ptr++;
2895 ph10 556 while (*ptr != 0)
2896     {
2897 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2898 ph10 556 ptr++;
2899 ph10 579 #ifdef SUPPORT_UTF8
2900 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2901     #endif
2902     }
2903 nigel 93 }
2904     else break;
2905     }
2906     }
2907    
2908     /* If the next thing is itself optional, we have to give up. */
2909    
2910 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2911 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2912     return FALSE;
2913 nigel 93
2914 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2915     the next item is a character. */
2916 nigel 93
2917     if (next >= 0) switch(op_code)
2918     {
2919     case OP_CHAR:
2920 ph10 535 #ifdef SUPPORT_UTF8
2921 ph10 532 GETCHARTEST(c, previous);
2922 ph10 369 #else
2923 ph10 532 c = *previous;
2924 ph10 535 #endif
2925     return c != next;
2926 nigel 93
2927 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2928 nigel 93 Unicode property support, we can use it to test the other case of
2929     high-valued characters. */
2930    
2931 ph10 602 case OP_CHARI:
2932 ph10 535 #ifdef SUPPORT_UTF8
2933 ph10 532 GETCHARTEST(c, previous);
2934     #else
2935     c = *previous;
2936 ph10 535 #endif
2937 ph10 532 if (c == next) return FALSE;
2938 nigel 93 #ifdef SUPPORT_UTF8
2939     if (utf8)
2940     {
2941     unsigned int othercase;
2942     if (next < 128) othercase = cd->fcc[next]; else
2943     #ifdef SUPPORT_UCP
2944 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2945 nigel 93 #else
2946     othercase = NOTACHAR;
2947     #endif
2948 ph10 532 return (unsigned int)c != othercase;
2949 nigel 93 }
2950     else
2951     #endif /* SUPPORT_UTF8 */
2952 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2953 nigel 93
2954 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2955 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2956 ph10 602 an XCLASS instead. */
2957 nigel 93
2958     case OP_NOT:
2959 ph10 602 return (c = *previous) == next;
2960 ph10 604
2961     case OP_NOTI:
2962 ph10 532 if ((c = *previous) == next) return TRUE;
2963 nigel 93 #ifdef SUPPORT_UTF8
2964     if (utf8)
2965     {
2966     unsigned int othercase;
2967     if (next < 128) othercase = cd->fcc[next]; else
2968     #ifdef SUPPORT_UCP
2969 ph10 349 othercase = UCD_OTHERCASE(next);
2970 nigel 93 #else
2971     othercase = NOTACHAR;
2972     #endif
2973 ph10 532 return (unsigned int)c == othercase;
2974 nigel 93 }
2975     else
2976     #endif /* SUPPORT_UTF8 */
2977 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2978 nigel 93
2979 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2980     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2981    
2982 nigel 93 case OP_DIGIT:
2983     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2984    
2985     case OP_NOT_DIGIT:
2986     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2987    
2988     case OP_WHITESPACE:
2989     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2990    
2991     case OP_NOT_WHITESPACE:
2992     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2993    
2994     case OP_WORDCHAR:
2995     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2996    
2997     case OP_NOT_WORDCHAR:
2998     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2999    
3000 ph10 180 case OP_HSPACE:
3001     case OP_NOT_HSPACE:
3002     switch(next)
3003     {
3004     case 0x09:
3005     case 0x20:
3006     case 0xa0:
3007     case 0x1680:
3008     case 0x180e:
3009     case 0x2000:
3010     case 0x2001:
3011     case 0x2002:
3012     case 0x2003:
3013     case 0x2004:
3014     case 0x2005:
3015     case 0x2006:
3016     case 0x2007:
3017     case 0x2008:
3018     case 0x2009:
3019     case 0x200A:
3020     case 0x202f:
3021     case 0x205f:
3022     case 0x3000:
3023 ph10 528 return op_code == OP_NOT_HSPACE;
3024 ph10 180 default:
3025 ph10 528 return op_code != OP_NOT_HSPACE;
3026 ph10 180 }
3027    
3028 ph10 528 case OP_ANYNL:
3029 ph10 180 case OP_VSPACE:
3030     case OP_NOT_VSPACE:
3031     switch(next)
3032     {
3033     case 0x0a:
3034     case 0x0b:
3035     case 0x0c:
3036     case 0x0d:
3037     case 0x85:
3038     case 0x2028:
3039     case 0x2029:
3040 ph10 528 return op_code == OP_NOT_VSPACE;
3041 ph10 180 default:
3042 ph10 528 return op_code != OP_NOT_VSPACE;
3043 ph10 180 }
3044    
3045 ph10 532 #ifdef SUPPORT_UCP
3046     case OP_PROP:
3047     return check_char_prop(next, previous[0], previous[1], FALSE);
3048 ph10 535
3049 ph10 532 case OP_NOTPROP:
3050     return check_char_prop(next, previous[0], previous[1], TRUE);
3051     #endif
3052    
3053 nigel 93 default:
3054     return FALSE;
3055     }
3056    
3057    
3058 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3059     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3060     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3061     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3062 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3063 nigel 93
3064     switch(op_code)
3065     {
3066     case OP_CHAR:
3067 ph10 602 case OP_CHARI:
3068 ph10 535 #ifdef SUPPORT_UTF8
3069 ph10 532 GETCHARTEST(c, previous);
3070     #else
3071     c = *previous;
3072 ph10 535 #endif
3073 nigel 93 switch(-next)
3074     {
3075     case ESC_d:
3076 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3077 nigel 93
3078     case ESC_D:
3079 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3080 nigel 93
3081     case ESC_s:
3082 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3083 nigel 93
3084     case ESC_S:
3085 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3086 nigel 93
3087     case ESC_w:
3088 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3089 nigel 93
3090     case ESC_W:
3091 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3092 ph10 182
3093 ph10 180 case ESC_h:
3094     case ESC_H:
3095 ph10 532 switch(c)
3096 ph10 180 {
3097     case 0x09:
3098     case 0x20:
3099     case 0xa0:
3100     case 0x1680:
3101     case 0x180e:
3102     case 0x2000:
3103     case 0x2001:
3104     case 0x2002:
3105     case 0x2003:
3106     case 0x2004:
3107     case 0x2005:
3108     case 0x2006:
3109     case 0x2007:
3110     case 0x2008:
3111     case 0x2009:
3112     case 0x200A:
3113     case 0x202f:
3114     case 0x205f:
3115     case 0x3000:
3116     return -next != ESC_h;
3117     default:
3118     return -next == ESC_h;
3119 ph10 182 }
3120    
3121 ph10 180 case ESC_v:
3122     case ESC_V:
3123 ph10 532 switch(c)
3124 ph10 180 {
3125     case 0x0a:
3126     case 0x0b:
3127     case 0x0c:
3128     case 0x0d:
3129     case 0x85:
3130     case 0x2028:
3131     case 0x2029:
3132     return -next != ESC_v;
3133     default:
3134     return -next == ESC_v;
3135 ph10 182 }
3136 ph10 535
3137     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3138     their substitutions and process them. The result will always be either
3139 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3140 ph10 535
3141 ph10 532 #ifdef SUPPORT_UCP
3142     case ESC_du:
3143     case ESC_DU:
3144     case ESC_wu:
3145     case ESC_WU:
3146     case ESC_su:
3147     case ESC_SU:
3148     {
3149     int temperrorcode = 0;
3150     ptr = substitutes[-next - ESC_DU];
3151     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3152     if (temperrorcode != 0) return FALSE;
3153     ptr++; /* For compatibility */
3154     }
3155 ph10 535 /* Fall through */
3156 nigel 93
3157 ph10 532 case ESC_p:
3158     case ESC_P:
3159     {
3160     int ptype, pdata, errorcodeptr;
3161 ph10 535 BOOL negated;
3162    
3163 ph10 532 ptr--; /* Make ptr point at the p or P */
3164     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3165     if (ptype < 0) return FALSE;
3166     ptr++; /* Point past the final curly ket */
3167 ph10 535
3168 ph10 532 /* If the property item is optional, we have to give up. (When generated
3169     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3170     to the original \d etc. At this point, ptr will point to a zero byte. */
3171 ph10 535
3172 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3173     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3174     return FALSE;
3175 ph10 535
3176 ph10 532 /* Do the property check. */
3177 ph10 535
3178 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3179 ph10 535 }
3180 ph10 532 #endif
3181    
3182 nigel 93 default:
3183     return FALSE;
3184     }
3185    
3186 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3187     well. It means re-organizing the above code so as to get hold of the property
3188     values before switching on the op-code. However, I wonder how many patterns
3189     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3190     these op-codes are never generated.) */
3191    
3192 nigel 93 case OP_DIGIT:
3193 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3194 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3195 nigel 93
3196     case OP_NOT_DIGIT:
3197     return next == -ESC_d;
3198    
3199     case OP_WHITESPACE:
3200 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3201 nigel 93
3202     case OP_NOT_WHITESPACE:
3203 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3204 nigel 93
3205 ph10 180 case OP_HSPACE:
3206 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3207 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3208 ph10 180
3209     case OP_NOT_HSPACE:
3210     return next == -ESC_h;
3211 ph10 182
3212 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3213 ph10 535 case OP_ANYNL:
3214 ph10 182 case OP_VSPACE:
3215 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3216    
3217     case OP_NOT_VSPACE:
3218 ph10 528 return next == -ESC_v || next == -ESC_R;
3219 ph10 180
3220 nigel 93 case OP_WORDCHAR:
3221 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3222 ph10 528 next == -ESC_v || next == -ESC_R;
3223 nigel 93
3224     case OP_NOT_WORDCHAR:
3225     return next == -ESC_w || next == -ESC_d;
3226 ph10 182
3227 nigel 93 default:
3228     return FALSE;
3229     }
3230    
3231     /* Control does not reach here */
3232     }
3233    
3234    
3235    
3236     /*************************************************
3237 nigel 77 * Compile one branch *
3238     *************************************************/
3239    
3240 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3241 nigel 77 changed during the branch, the pointer is used to change the external options
3242 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3243     to find out the amount of memory needed, as well as during the real compile
3244     phase. The value of lengthptr distinguishes the two phases.
3245 nigel 77
3246     Arguments:
3247     optionsptr pointer to the option bits
3248     codeptr points to the pointer to the current code point
3249     ptrptr points to the current pattern pointer
3250     errorcodeptr points to error code variable
3251     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3252     reqbyteptr set to the last literal character required, else < 0
3253     bcptr points to current branch chain
3254 ph10 654 cond_depth conditional nesting depth
3255 nigel 77 cd contains pointers to tables etc.
3256 nigel 93 lengthptr NULL during the real compile phase
3257     points to length accumulator during pre-compile phase
3258 nigel 77
3259     Returns: TRUE on success
3260     FALSE, with *errorcodeptr set non-zero on error
3261     */
3262    
3263     static BOOL
3264 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3265     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3266 ph10 642 int cond_depth, compile_data *cd, int *lengthptr)
3267 nigel 77 {
3268     int repeat_type, op_type;
3269     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3270     int bravalue = 0;
3271     int greedy_default, greedy_non_default;
3272     int firstbyte, reqbyte;
3273     int zeroreqbyte, zerofirstbyte;
3274     int req_caseopt, reqvary, tempreqvary;
3275 ph10 635 int options = *optionsptr; /* May change dynamically */
3276 nigel 77 int after_manual_callout = 0;
3277 nigel 93 int length_prevgroup = 0;
3278 nigel 77 register int c;
3279     register uschar *code = *codeptr;
3280 nigel 93 uschar *last_code = code;
3281     uschar *orig_code = code;
3282 nigel 77 uschar *tempcode;
3283     BOOL inescq = FALSE;
3284     BOOL groupsetfirstbyte = FALSE;
3285     const uschar *ptr = *ptrptr;
3286     const uschar *tempptr;
3287 ph10 518 const uschar *nestptr = NULL;
3288 nigel 77 uschar *previous = NULL;
3289     uschar *previous_callout = NULL;
3290 nigel 93 uschar *save_hwm = NULL;
3291 nigel 77 uschar classbits[32];
3292    
3293 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3294 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3295 ph10 635 dynamically as we process the pattern. */
3296    
3297 nigel 77 #ifdef SUPPORT_UTF8
3298     BOOL class_utf8;
3299     BOOL utf8 = (options & PCRE_UTF8) != 0;
3300     uschar *class_utf8data;
3301 ph10 300 uschar *class_utf8data_base;
3302 nigel 77 uschar utf8_char[6];
3303     #else
3304     BOOL utf8 = FALSE;
3305     #endif
3306    
3307 ph10 475 #ifdef PCRE_DEBUG
3308 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3309     #endif
3310    
3311 nigel 77 /* Set up the default and non-default settings for greediness */
3312    
3313     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3314     greedy_non_default = greedy_default ^ 1;
3315    
3316     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3317     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3318     matches a non-fixed char first char; reqbyte just remains unset if we never
3319     find one.
3320    
3321     When we hit a repeat whose minimum is zero, we may have to adjust these values
3322     to take the zero repeat into account. This is implemented by setting them to
3323     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3324     item types that can be repeated set these backoff variables appropriately. */
3325    
3326     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3327    
3328     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3329     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3330     value > 255. It is added into the firstbyte or reqbyte variables to record the
3331     case status of the value. This is used only for ASCII characters. */
3332    
3333     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3334    
3335     /* Switch on next character until the end of the branch */
3336    
3337     for (;; ptr++)
3338     {
3339     BOOL negate_class;
3340 ph10 286 BOOL should_flip_negation;
3341 nigel 77 BOOL possessive_quantifier;
3342     BOOL is_quantifier;
3343 nigel 93 BOOL is_recurse;
3344 ph10 180 BOOL reset_bracount;
3345 nigel 77 int class_charcount;
3346     int class_lastchar;
3347     int newoptions;
3348     int recno;
3349 ph10 172 int refsign;
3350 nigel 77 int skipbytes;
3351     int subreqbyte;
3352     int subfirstbyte;
3353 nigel 93 int terminator;
3354 nigel 77 int mclength;
3355 ph10 733 int tempbracount;
3356 nigel 77 uschar mcbuffer[8];
3357    
3358 nigel 93 /* Get next byte in the pattern */
3359 nigel 77
3360     c = *ptr;
3361 ph10 345
3362 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3363 ph10 518 string. Nesting only happens one level deep. */
3364    
3365     if (c == 0 && nestptr != NULL)
3366     {
3367     ptr = nestptr;
3368     nestptr = NULL;
3369     c = *ptr;
3370     }
3371    
3372 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3373     previous cycle of this loop. */
3374    
3375     if (lengthptr != NULL)
3376     {
3377 ph10 475 #ifdef PCRE_DEBUG
3378 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3379     #endif
3380 ph10 773 if (code > cd->start_workspace + cd->workspace_size -
3381     WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3382 nigel 93 {
3383     *errorcodeptr = ERR52;
3384     goto FAILED;
3385     }
3386    
3387     /* There is at least one situation where code goes backwards: this is the
3388     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3389     the class is simply eliminated. However, it is created first, so we have to
3390     allow memory for it. Therefore, don't ever reduce the length at this point.
3391     */
3392    
3393     if (code < last_code) code = last_code;
3394 ph10 202
3395     /* Paranoid check for integer overflow */
3396    
3397     if (OFLOW_MAX - *lengthptr < code - last_code)
3398     {
3399     *errorcodeptr = ERR20;
3400     goto FAILED;
3401     }
3402    
3403 ph10 530 *lengthptr += (int)(code - last_code);
3404 ph10 751 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3405     c));
3406 nigel 93
3407     /* If "previous" is set and it is not at the start of the work space, move
3408     it back to there, in order to avoid filling up the work space. Otherwise,
3409     if "previous" is NULL, reset the current code pointer to the start. */
3410    
3411     if (previous != NULL)
3412     {
3413     if (previous > orig_code)
3414     {
3415     memmove(orig_code, previous, code - previous);
3416     code -= previous - orig_code;
3417     previous = orig_code;
3418     }
3419     }
3420     else code = orig_code;
3421    
3422     /* Remember where this code item starts so we can pick up the length
3423     next time round. */
3424    
3425     last_code = code;
3426     }
3427    
3428     /* In the real compile phase, just check the workspace used by the forward
3429     reference list. */
3430    
3431 ph10 773 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3432     WORK_SIZE_SAFETY_MARGIN)
3433 nigel 93 {
3434     *errorcodeptr = ERR52;
3435     goto FAILED;
3436     }
3437    
3438 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3439    
3440     if (inescq && c != 0)
3441     {
3442 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3443 nigel 77 {
3444     inescq = FALSE;
3445     ptr++;
3446     continue;
3447     }
3448     else
3449     {
3450     if (previous_callout != NULL)
3451     {
3452 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3453     complete_callout(previous_callout, ptr, cd);
3454 nigel 77 previous_callout = NULL;
3455     }
3456     if ((options & PCRE_AUTO_CALLOUT) != 0)
3457     {
3458     previous_callout = code;
3459     code = auto_callout(code, ptr, cd);
3460     }
3461     goto NORMAL_CHAR;
3462     }
3463     }
3464    
3465     /* Fill in length of a previous callout, except when the next thing is
3466     a quantifier. */
3467    
3468 ph10 392 is_quantifier =
3469 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3470     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3471 nigel 77
3472     if (!is_quantifier && previous_callout != NULL &&
3473     after_manual_callout-- <= 0)
3474     {
3475 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3476     complete_callout(previous_callout, ptr, cd);
3477 nigel 77 previous_callout = NULL;
3478     }
3479    
3480 ph10 635 /* In extended mode, skip white space and comments. */
3481 nigel 77
3482     if ((options & PCRE_EXTENDED) != 0)
3483     {
3484     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3485 ph10 391 if (c == CHAR_NUMBER_SIGN)
3486 nigel 77 {
3487 ph10 579 ptr++;
3488 ph10 556 while (*ptr != 0)
3489 nigel 91 {
3490 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3491 ph10 556 ptr++;
3492 ph10 579 #ifdef SUPPORT_UTF8
3493 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3494     #endif
3495 nigel 91 }
3496 nigel 93 if (*ptr != 0) continue;
3497    
3498 nigel 91 /* Else fall through to handle end of string */
3499     c = 0;
3500 nigel 77 }
3501     }
3502    
3503     /* No auto callout for quantifiers. */
3504    
3505     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3506     {
3507     previous_callout = code;
3508     code = auto_callout(code, ptr, cd);
3509     }
3510    
3511     switch(c)
3512     {
3513 nigel 93 /* ===================================================================*/
3514     case 0: /* The branch terminates at string end */
3515 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3516     case CHAR_RIGHT_PARENTHESIS:
3517 nigel 77 *firstbyteptr = firstbyte;
3518     *reqbyteptr = reqbyte;
3519     *codeptr = code;
3520     *ptrptr = ptr;
3521 nigel 93 if (lengthptr != NULL)
3522     {
3523 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3524     {
3525     *errorcodeptr = ERR20;
3526     goto FAILED;
3527     }
3528 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3529 nigel 93 DPRINTF((">> end branch\n"));
3530     }
3531 nigel 77 return TRUE;
3532    
3533 nigel 93
3534     /* ===================================================================*/
3535 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3536     the setting of any following char as a first character. */
3537    
3538 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3539 ph10 602 previous = NULL;
3540 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3541     {
3542     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3543 ph10 602 *code++ = OP_CIRCM;
3544 nigel 77 }
3545 ph10 602 else *code++ = OP_CIRC;
3546 nigel 77 break;
3547    
3548 ph10 391 case CHAR_DOLLAR_SIGN:
3549 nigel 77 previous = NULL;
3550 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3551 nigel 77 break;
3552    
3553     /* There can never be a first char if '.' is first, whatever happens about
3554     repeats. The value of reqbyte doesn't change either. */
3555    
3556 ph10 391 case CHAR_DOT:
3557 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3558     zerofirstbyte = firstbyte;
3559     zeroreqbyte = reqbyte;
3560     previous = code;
3561 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3562 nigel 77 break;
3563    
3564 nigel 93
3565     /* ===================================================================*/
3566 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3567     32-byte bitmap of the permitted characters, except in the special case
3568     where there is only one such character. For negated classes, we build the
3569     map as usual, then invert it at the end. However, we use a different opcode
3570     so that data characters > 255 can be handled correctly.
3571 nigel 77
3572     If the class contains characters outside the 0-255 range, a different
3573     opcode is compiled. It may optionally have a bit map for characters < 256,
3574     but those above are explicitly listed afterwards. A flag byte tells
3575     whether the bitmap is present, and whether this is a negated class or not.
3576 ph10 345
3577 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3578     default (Perl) mode, it is treated as a data character. */
3579 ph10 345
3580 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3581 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3582     {
3583     *errorcodeptr = ERR64;
3584 ph10 345 goto FAILED;
3585 ph10 336 }
3586 ph10 345 goto NORMAL_CHAR;
3587 nigel 77
3588 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3589 nigel 77 previous = code;
3590    
3591     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3592     they are encountered at the top level, so we'll do that too. */
3593    
3594 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3595 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3596 ph10 295 check_posix_syntax(ptr, &tempptr))
3597 nigel 77 {
3598 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3599 nigel 77 goto FAILED;
3600     }
3601    
3602 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3603 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3604 ph10 205 skip them too. This makes for compatibility with Perl. */
3605 ph10 208
3606 ph10 205 negate_class = FALSE;
3607     for (;;)
3608 nigel 77 {
3609     c = *(++ptr);
3610 ph10 391 if (c == CHAR_BACKSLASH)
3611 ph10 205 {
3612 ph10 392 if (ptr[1] == CHAR_E)
3613 ph10 391 ptr++;
3614 ph10 392 else if (strncmp((const char *)ptr+1,
3615     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3616 ph10 391 ptr += 3;
3617 ph10 392 else
3618 ph10 391 break;
3619 ph10 205 }
3620 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3621 ph10 205 negate_class = TRUE;
3622     else break;
3623 ph10 208 }
3624 ph10 345
3625     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3626     an initial ']' is taken as a data character -- the code below handles
3627 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3628     [^] must match any character, so generate OP_ALLANY. */
3629 ph10 345
3630 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3631 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3632 ph10 341 {
3633     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3634     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3635     zerofirstbyte = firstbyte;
3636     break;
3637 ph10 345 }
3638 nigel 77
3639 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3640     negation flag at the end, so that support for characters > 255 works
3641 ph10 264 correctly (they are all included in the class). */
3642    
3643     should_flip_negation = FALSE;
3644    
3645 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3646 nigel 93 of just a single character (as long as it's < 256). However, for higher
3647     valued UTF-8 characters, we don't yet do any optimization. */
3648 nigel 77
3649     class_charcount = 0;
3650     class_lastchar = -1;
3651    
3652 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3653     temporary bit of memory, in case the class contains only 1 character (less
3654     than 256), because in that case the compiled code doesn't use the bit map.
3655     */
3656    
3657     memset(classbits, 0, 32 * sizeof(uschar));
3658    
3659 nigel 77 #ifdef SUPPORT_UTF8
3660     class_utf8 = FALSE; /* No chars >= 256 */
3661 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3662 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3663 nigel 77 #endif
3664    
3665     /* Process characters until ] is reached. By writing this as a "do" it
3666 nigel 93 means that an initial ] is taken as a data character. At the start of the
3667     loop, c contains the first byte of the character. */
3668 nigel 77
3669 nigel 93 if (c != 0) do
3670 nigel 77 {
3671 nigel 93 const uschar *oldptr;
3672    
3673 nigel 77 #ifdef SUPPORT_UTF8
3674     if (utf8 && c > 127)
3675     { /* Braces are required because the */
3676     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3677     }
3678 ph10 535
3679 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3680 ph10 309 data and reset the pointer. This is so that very large classes that
3681 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3682 ph10 309 (which is on the stack). */
3683    
3684 ph10 300 if (lengthptr != NULL)
3685     {
3686     *lengthptr += class_utf8data - class_utf8data_base;
3687 ph10 309 class_utf8data = class_utf8data_base;
3688     }
3689    
3690 nigel 77 #endif
3691    
3692     /* Inside \Q...\E everything is literal except \E */
3693    
3694     if (inescq)
3695     {
3696 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3697 nigel 77 {
3698 nigel 93 inescq = FALSE; /* Reset literal state */
3699     ptr++; /* Skip the 'E' */
3700     continue; /* Carry on with next */
3701 nigel 77 }
3702 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3703 nigel 77 }
3704    
3705     /* Handle POSIX class names. Perl allows a negation extension of the
3706     form [:^name:]. A square bracket that doesn't match the syntax is
3707     treated as a literal. We also recognize the POSIX constructions
3708     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3709     5.6 and 5.8 do. */
3710    
3711 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3712 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3713 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3714 nigel 77 {
3715     BOOL local_negate = FALSE;
3716 nigel 87 int posix_class, taboffset, tabopt;
3717 nigel 77 register const uschar *cbits = cd->cbits;
3718 nigel 87 uschar pbits[32];
3719 nigel 77
3720 ph10 391 if (ptr[1] != CHAR_COLON)
3721 nigel 77 {
3722     *errorcodeptr = ERR31;
3723     goto FAILED;
3724     }
3725    
3726     ptr += 2;
3727 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3728 nigel 77 {
3729     local_negate = TRUE;
3730 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3731 nigel 77 ptr++;
3732     }
3733    
3734 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3735 nigel 77 if (posix_class < 0)
3736     {
3737     *errorcodeptr = ERR30;
3738     goto FAILED;
3739     }
3740    
3741     /* If matching is caseless, upper and lower are converted to
3742     alpha. This relies on the fact that the class table starts with
3743     alpha, lower, upper as the first 3 entries. */
3744    
3745     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3746     posix_class = 0;
3747 ph10 535
3748     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3749 ph10 518 different escape sequences that use Unicode properties. */
3750 ph10 535
3751 ph10 518 #ifdef SUPPORT_UCP
3752     if ((options & PCRE_UCP) != 0)
3753     {
3754     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3755     if (posix_substitutes[pc] != NULL)
3756     {
3757 ph10 535 nestptr = tempptr + 1;
3758 ph10 518 ptr = posix_substitutes[pc] - 1;
3759 ph10 535 continue;
3760     }
3761     }
3762     #endif
3763 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3764     chunk of local store because we may be adding and subtracting from it,
3765     and we don't want to subtract bits that may be in the main map already.
3766     At the end we or the result into the bit map that is being built. */
3767 nigel 77
3768     posix_class *= 3;
3769 nigel 87
3770     /* Copy in the first table (always present) */
3771    
3772     memcpy(pbits, cbits + posix_class_maps[posix_class],
3773     32 * sizeof(uschar));
3774    
3775     /* If there is a second table, add or remove it as required. */
3776    
3777     taboffset = posix_class_maps[posix_class + 1];
3778     tabopt = posix_class_maps[posix_class + 2];
3779    
3780     if (taboffset >= 0)
3781 nigel 77 {
3782 nigel 87 if (tabopt >= 0)
3783     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3784 nigel 77 else
3785 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3786 nigel 77 }
3787    
3788 nigel 87 /* Now see if we need to remove any special characters. An option
3789     value of 1 removes vertical space and 2 removes underscore. */
3790    
3791     if (tabopt < 0) tabopt = -tabopt;
3792     if (tabopt == 1) pbits[1] &= ~0x3c;
3793     else if (tabopt == 2) pbits[11] &= 0x7f;
3794    
3795     /* Add the POSIX table or its complement into the main table that is
3796     being built and we are done. */
3797    
3798     if (local_negate)
3799     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3800     else
3801     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3802    
3803 nigel 77 ptr = tempptr + 1;
3804     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3805     continue; /* End of POSIX syntax handling */
3806     }
3807    
3808     /* Backslash may introduce a single character, or it may introduce one
3809 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3810 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3811     assume that other escapes have more than one character in them, so set
3812     class_charcount bigger than one. Unrecognized escapes fall through and
3813     are either treated as literal characters (by default), or are faulted if
3814     PCRE_EXTRA is set. */
3815 nigel 77
3816 ph10 391 if (c == CHAR_BACKSLASH)
3817 nigel 77 {
3818 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3819     if (*errorcodeptr != 0) goto FAILED;
3820 nigel 77
3821 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3822 ph10 758 else if (-c == ESC_N) /* \N is not supported in a class */
3823     {
3824     *errorcodeptr = ERR71;
3825     goto FAILED;
3826     }
3827 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3828     {
3829 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3830 nigel 77 {
3831     ptr += 2; /* avoid empty string */
3832     }
3833     else inescq = TRUE;
3834     continue;
3835     }
3836 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3837 nigel 77
3838     if (c < 0)
3839     {
3840     register const uschar *cbits = cd->cbits;
3841     class_charcount += 2; /* Greater than 1 is what matters */
3842 nigel 93
3843 ph10 518 switch (-c)
3844 nigel 77 {
3845 ph10 518 #ifdef SUPPORT_UCP
3846     case ESC_du: /* These are the values given for \d etc */
3847     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3848     case ESC_wu: /* escape sequence with an appropriate \p */
3849     case ESC_WU: /* or \P to test Unicode properties instead */
3850     case ESC_su: /* of the default ASCII testing. */
3851     case ESC_SU:
3852     nestptr = ptr;
3853     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3854 ph10 535 class_charcount -= 2; /* Undo! */
3855 ph10 518 continue;
3856     #endif
3857 nigel 77 case ESC_d:
3858     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3859     continue;
3860    
3861     case ESC_D:
3862 ph10