/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 518 - (hide annotations) (download)
Tue May 18 15:47:01 2010 UTC (3 years ago) by ph10
File MIME type: text/plain
File size: 228686 byte(s)
Added PCRE_UCP and related stuff to make \w etc use Unicode properties.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps. The first argument is
the bitmap (an array of bytes, 8 bits per element) and the second is the bit
number to set. Both parameters are parenthesized in the expansion so that
expression arguments, for example SETBIT(map, c + 1), index and shift by the
value of the whole expression instead of being torn apart by operator
precedence. */

#define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* One entry per verb name. The three fields are, in order: the length of the
verb's name, the opcode to use when the verb is written without an argument
(-1 when an argument is mandatory), and the opcode to use when an argument is
present (-1 when an argument is not allowed). */

typedef struct verbitem {
  int len;
  int op;
  int op_arg;
} verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 518 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266     both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275     (uschar *)"\\p{Xwd}" /* \w */
276     };
277    
278     static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280     (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283     NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292     NULL, /* xdigit */
293     /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295     (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298     NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307     NULL /* ^xdigit */
308     };
/* Number of entries in posix_substitutes (positive plus negated cases). */
#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(posix_substitutes[0]))
310     #endif
311    
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 240 "repeating a DEFINE group is not allowed\0"
397     "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 518 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 510 ;
412 nigel 77
413     /* Table to identify digits and hex digits. This is used when compiling
414     patterns. Note that the tables in chartables are dependent on the locale, and
415     may mark arbitrary characters as digits - but the PCRE compiling code expects
416     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
417     a private table here. It costs 256 bytes, but it is a lot faster than doing
418     character value tests (at least in some simple cases I timed), and in some
419     applications one wants PCRE to compile efficiently as well as match
420     efficiently.
421    
422     For convenience, we use the same bit definitions as in chartables:
423    
424     0x04 decimal digit
425     0x08 hexadecimal digit
426    
427     Then we can use ctype_digit and ctype_xdigit in the code. */
428    
429 ph10 392 #ifndef EBCDIC
430 ph10 391
431 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
432 ph10 391 UTF-8 mode. */
433    
434 nigel 77 static const unsigned char digitab[] =
435     {
436     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
442     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
443     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
444     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
448     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
453     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
468    
469 ph10 392 #else
470 ph10 391
471     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
472    
473 nigel 77 static const unsigned char digitab[] =
474     {
475     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
476     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
486 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
487 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
491     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
492     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
496     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
499     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
505     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
506     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
507    
508     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
509     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
510     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
511     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
512     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
513     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
515     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
517     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
518     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
519     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
520 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
521 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
522     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
523     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
525     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
526     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
527     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
528     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
529     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
530     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
531     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
532     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
533     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
534     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
535     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
536     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
537     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
538     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
539     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
540     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
541     #endif
542    
543    
544     /* Definition to allow mutual recursion */
545    
546     static BOOL
547 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
548 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
549 nigel 77
550    
551    
552     /*************************************************
553 ph10 240 * Find an error text *
554     *************************************************/
555    
556 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
557     some of the text is of unknown length, we can't use a table of offsets.
558     Instead, just count through the strings. This is not a performance issue
559 ph10 240 because it happens only when there has been a compilation error.
560    
561     Argument: the error number
562     Returns: pointer to the error string
563     */
564    
565     static const char *
566     find_error_text(int n)
567     {
568     const char *s = error_texts;
569 ph10 507 for (; n > 0; n--)
570 ph10 499 {
571     while (*s++ != 0) {};
572     if (*s == 0) return "Error text not found (please report)";
573 ph10 507 }
574 ph10 240 return s;
575     }
576    
577    
578     /*************************************************
579 nigel 77 * Handle escapes *
580     *************************************************/
581    
582     /* This function is called when a \ has been encountered. It either returns a
583     positive value for a simple escape such as \n, or a negative value which
584 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
585     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
586     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
587     ptr is pointing at the \. On exit, it is on the final character of the escape
588     sequence.
589 nigel 77
590     Arguments:
591     ptrptr points to the pattern position pointer
592     errorcodeptr points to the errorcode variable
593     bracount number of previous extracting brackets
594     options the options bits
595     isclass TRUE if inside a character class
596    
597     Returns: zero or positive => a data character
598     negative => a special escape sequence
599 ph10 213 on error, errorcodeptr is set
600 nigel 77 */
601    
602     static int
603     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
604     int options, BOOL isclass)
605     {
606 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
607     const uschar *ptr = *ptrptr + 1;
608 nigel 77 int c, i;
609    
610 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
611     ptr--; /* Set pointer back to the last byte */
612    
613 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
614    
615     if (c == 0) *errorcodeptr = ERR1;
616    
617 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
618     in a table. A non-zero result is something that can be returned immediately.
619 nigel 77 Otherwise further processing may be required. */
620    
621 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
622     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
623     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
624 nigel 77
625 ph10 97 #else /* EBCDIC coding */
626 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
627 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
628     #endif
629    
630     /* Escapes that need further processing, or are illegal. */
631    
632     else
633     {
634     const uschar *oldptr;
635 nigel 93 BOOL braced, negated;
636    
637 nigel 77 switch (c)
638     {
639     /* A number of Perl escapes are not handled by PCRE. We give an explicit
640     error. */
641    
642 ph10 391 case CHAR_l:
643     case CHAR_L:
644     case CHAR_u:
645     case CHAR_U:
646 nigel 77 *errorcodeptr = ERR37;
647     break;
648    
649 ph10 333 /* \g must be followed by one of a number of specific things:
650 ph10 345
651 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
652     backreference. If negative, it is a relative backreference. This is a Perl
653     5.10 feature.
654 ph10 345
655 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
656     is part of Perl's movement towards a unified syntax for back references. As
657     this is synonymous with \k{name}, we fudge it up by pretending it really
658     was \k.
659 ph10 345
660     (3) For Oniguruma compatibility we also support \g followed by a name or a
661     number either in angle brackets or in single quotes. However, these are
662     (possibly recursive) subroutine calls, _not_ backreferences. Just return
663 ph10 333 the -ESC_g code (cf \k). */
664 nigel 93
665 ph10 391 case CHAR_g:
666     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
667 ph10 333 {
668     c = -ESC_g;
669 ph10 345 break;
670     }
671 ph10 333
672     /* Handle the Perl-compatible cases */
673 ph10 345
674 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
675 nigel 93 {
676 ph10 171 const uschar *p;
677 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
678     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
679     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
680 ph10 171 {
681     c = -ESC_k;
682     break;
683 ph10 172 }
684 nigel 93 braced = TRUE;
685     ptr++;
686     }
687     else braced = FALSE;
688    
689 ph10 391 if (ptr[1] == CHAR_MINUS)
690 nigel 93 {
691     negated = TRUE;
692     ptr++;
693     }
694     else negated = FALSE;
695    
696     c = 0;
697     while ((digitab[ptr[1]] & ctype_digit) != 0)
698 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
699 ph10 220
700 ph10 333 if (c < 0) /* Integer overflow */
701 ph10 213 {
702     *errorcodeptr = ERR61;
703     break;
704 ph10 220 }
705 ph10 345
706 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
707 nigel 93 {
708     *errorcodeptr = ERR57;
709 ph10 213 break;
710 nigel 93 }
711 ph10 345
712 ph10 333 if (c == 0)
713     {
714     *errorcodeptr = ERR58;
715     break;
716 ph10 345 }
717 nigel 93
718     if (negated)
719     {
720     if (c > bracount)
721     {
722     *errorcodeptr = ERR15;
723 ph10 213 break;
724 nigel 93 }
725     c = bracount - (c - 1);
726     }
727    
728     c = -(ESC_REF + c);
729     break;
730    
731 nigel 77 /* The handling of escape sequences consisting of a string of digits
732     starting with one that is not zero is not straightforward. By experiment,
733     the way Perl works seems to be as follows:
734    
735     Outside a character class, the digits are read as a decimal number. If the
736     number is less than 10, or if there are that many previous extracting
737     left brackets, then it is a back reference. Otherwise, up to three octal
738     digits are read to form an escaped byte. Thus \123 is likely to be octal
739     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
740     value is greater than 377, the least significant 8 bits are taken. Inside a
741     character class, \ followed by a digit is always an octal number. */
742    
743 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
744     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
745 nigel 77
746     if (!isclass)
747     {
748     oldptr = ptr;
749 ph10 391 c -= CHAR_0;
750 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
751 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
752 ph10 333 if (c < 0) /* Integer overflow */
753 ph10 213 {
754     *errorcodeptr = ERR61;
755 ph10 220 break;
756     }
757 nigel 77 if (c < 10 || c <= bracount)
758     {
759     c = -(ESC_REF + c);
760     break;
761     }
762     ptr = oldptr; /* Put the pointer back and fall through */
763     }
764    
765     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
766     generates a binary zero byte and treats the digit as a following literal.
767     Thus we have to pull back the pointer by one. */
768    
769 ph10 391 if ((c = *ptr) >= CHAR_8)
770 nigel 77 {
771     ptr--;
772     c = 0;
773     break;
774     }
775    
776     /* \0 always starts an octal number, but we may drop through to here with a
777 nigel 91 larger first octal digit. The original code used just to take the least
778     significant 8 bits of octal numbers (I think this is what early Perls used
779     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
780     than 3 octal digits. */
781 nigel 77
782 ph10 391 case CHAR_0:
783     c -= CHAR_0;
784     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
785     c = c * 8 + *(++ptr) - CHAR_0;
786 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
787 nigel 77 break;
788    
789 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
790     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
791     treated as a data character. */
792 nigel 77
793 ph10 391 case CHAR_x:
794     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
795 nigel 77 {
796     const uschar *pt = ptr + 2;
797 nigel 87 int count = 0;
798    
799 nigel 77 c = 0;
800     while ((digitab[*pt] & ctype_xdigit) != 0)
801     {
802 nigel 87 register int cc = *pt++;
803 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
804 nigel 77 count++;
805 nigel 87
806 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
807     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
808     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
809 ph10 97 #else /* EBCDIC coding */
810 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
811     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
812 nigel 77 #endif
813     }
814 nigel 87
815 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
816 nigel 77 {
817 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
818 nigel 77 ptr = pt;
819     break;
820     }
821 nigel 87
822 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
823     recognize this construct; fall through to the normal \x handling. */
824     }
825    
826 nigel 87 /* Read just a single-byte hex-defined char */
827 nigel 77
828     c = 0;
829     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
830     {
831 ph10 391 int cc; /* Some compilers don't like */
832     cc = *(++ptr); /* ++ in initializers */
833     #ifndef EBCDIC /* ASCII/UTF-8 coding */
834     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
835     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
836 ph10 97 #else /* EBCDIC coding */
837 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
838     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
839 nigel 77 #endif
840     }
841     break;
842    
843 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
844     This coding is ASCII-specific, but then the whole concept of \cx is
845     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
846 nigel 77
847 ph10 391 case CHAR_c:
848 nigel 77 c = *(++ptr);
849     if (c == 0)
850     {
851     *errorcodeptr = ERR2;
852 ph10 213 break;
853 nigel 77 }
854    
855 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
856     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
857 nigel 77 c ^= 0x40;
858 ph10 97 #else /* EBCDIC coding */
859 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
860 nigel 77 c ^= 0xC0;
861     #endif
862     break;
863    
864     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
865 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
866     otherwise, for Perl compatibility, it is a literal. This code looks a bit
867     odd, but there used to be some cases other than the default, and there may
868     be again in future, so I haven't "optimized" it. */
869 nigel 77
870     default:
871     if ((options & PCRE_EXTRA) != 0) switch(c)
872     {
873     default:
874     *errorcodeptr = ERR3;
875     break;
876     }
877     break;
878     }
879     }
880 ph10 518
881     /* Perl supports \N{name} for character names, as well as plain \N for "not
882 ph10 514 newline". PCRE does not support \N{name}. */
883 nigel 77
884 ph10 514 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
885 ph10 518 *errorcodeptr = ERR37;
886 ph10 514
887 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
888    
889     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
890     c -= (ESC_DU - ESC_D);
891    
892     /* Set the pointer to the final character before returning. */
893    
894 nigel 77 *ptrptr = ptr;
895     return c;
896     }
897    
898    
899    
900     #ifdef SUPPORT_UCP
901     /*************************************************
902     * Handle \P and \p *
903     *************************************************/
904    
905     /* This function is called after \P or \p has been encountered, provided that
906     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
907     pointing at the P or p. On exit, it is pointing at the final character of the
908     escape sequence.
909    
910     Argument:
911     ptrptr points to the pattern position pointer
912     negptr points to a boolean that is set TRUE for negation else FALSE
913 nigel 87 dptr points to an int that is set to the detailed property value
914 nigel 77 errorcodeptr points to the error code variable
915    
916 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
917 nigel 77 */
918    
919     static int
920 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
921 nigel 77 {
922     int c, i, bot, top;
923     const uschar *ptr = *ptrptr;
924 nigel 87 char name[32];
925 nigel 77
926     c = *(++ptr);
927     if (c == 0) goto ERROR_RETURN;
928    
929     *negptr = FALSE;
930    
931 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
932     negation. */
933 nigel 77
934 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
935 nigel 77 {
936 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
937 nigel 77 {
938     *negptr = TRUE;
939     ptr++;
940     }
941 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
942 nigel 77 {
943     c = *(++ptr);
944     if (c == 0) goto ERROR_RETURN;
945 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
946 nigel 77 name[i] = c;
947     }
948 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
949 nigel 77 name[i] = 0;
950     }
951    
952     /* Otherwise there is just one following character */
953    
954     else
955     {
956     name[0] = c;
957     name[1] = 0;
958     }
959    
960     *ptrptr = ptr;
961    
962     /* Search for a recognized property name using binary chop */
963    
964     bot = 0;
965     top = _pcre_utt_size;
966    
967     while (bot < top)
968     {
969 nigel 87 i = (bot + top) >> 1;
970 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
971 nigel 87 if (c == 0)
972     {
973     *dptr = _pcre_utt[i].value;
974     return _pcre_utt[i].type;
975     }
976 nigel 77 if (c > 0) bot = i + 1; else top = i;
977     }
978    
979     *errorcodeptr = ERR47;
980     *ptrptr = ptr;
981     return -1;
982    
983     ERROR_RETURN:
984     *errorcodeptr = ERR46;
985     *ptrptr = ptr;
986     return -1;
987     }
988     #endif
989    
990    
991    
992    
993     /*************************************************
994     * Check for counted repeat *
995     *************************************************/
996    
997     /* This function is called when a '{' is encountered in a place where it might
998     start a quantifier. It looks ahead to see if it really is a quantifier or not.
999     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1000     where the ddds are digits.
1001    
1002     Arguments:
1003     p pointer to the first char after '{'
1004    
1005     Returns: TRUE or FALSE
1006     */
1007    
1008     static BOOL
1009     is_counted_repeat(const uschar *p)
1010     {
1011     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1012     while ((digitab[*p] & ctype_digit) != 0) p++;
1013 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1014 nigel 77
1015 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
1016     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1017 nigel 77
1018     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1019     while ((digitab[*p] & ctype_digit) != 0) p++;
1020    
1021 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
1022 nigel 77 }
1023    
1024    
1025    
1026     /*************************************************
1027     * Read repeat counts *
1028     *************************************************/
1029    
1030     /* Read an item of the form {n,m} and return the values. This is called only
1031     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1032     so the syntax is guaranteed to be correct, but we need to check the values.
1033    
1034     Arguments:
1035     p pointer to first char after '{'
1036     minp pointer to int for min
1037     maxp pointer to int for max
1038     returned as -1 if no max
1039     errorcodeptr points to error code variable
1040    
1041     Returns: pointer to '}' on success;
1042     current ptr on error, with errorcodeptr set non-zero
1043     */
1044    
1045     static const uschar *
1046     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1047     {
1048     int min = 0;
1049     int max = -1;
1050    
1051 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1052     an integer overflow. */
1053    
1054 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1055 nigel 81 if (min < 0 || min > 65535)
1056     {
1057     *errorcodeptr = ERR5;
1058     return p;
1059     }
1060 nigel 77
1061 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1062     Also, max must not be less than min. */
1063    
1064 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1065 nigel 77 {
1066 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1067 nigel 77 {
1068     max = 0;
1069 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1070 nigel 81 if (max < 0 || max > 65535)
1071     {
1072     *errorcodeptr = ERR5;
1073     return p;
1074     }
1075 nigel 77 if (max < min)
1076     {
1077     *errorcodeptr = ERR4;
1078     return p;
1079     }
1080     }
1081     }
1082    
1083 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1084     '}'. */
1085 nigel 77
1086 nigel 81 *minp = min;
1087     *maxp = max;
1088 nigel 77 return p;
1089     }
1090    
1091    
1092    
1093     /*************************************************
1094 ph10 408 * Subroutine for finding forward reference *
1095 nigel 91 *************************************************/
1096    
1097 ph10 408 /* This recursive function is called only from find_parens() below. The
1098     top-level call starts at the beginning of the pattern. All other calls must
1099     start at a parenthesis. It scans along a pattern's text looking for capturing
1100 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1101     name it is given, it returns its number. Alternatively, if the name is NULL, it
1102 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1103     encountered, the name will be terminated by '>' because that is checked in the
1104 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1105 ph10 408 capturing group numbers - the (?| feature.
1106 nigel 91
1107     Arguments:
1108 ph10 408 ptrptr address of the current character pointer (updated)
1109 ph10 345 cd compile background data
1110 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1111     lorn name length, or subpattern number if name is NULL
1112     xmode TRUE if we are in /x mode
1113 ph10 411 count pointer to the current capturing subpattern number (updated)
1114 nigel 91
1115     Returns: the number of the named subpattern, or -1 if not found
1116     */
1117    
1118     static int
1119 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1120     BOOL xmode, int *count)
1121 nigel 91 {
1122 ph10 408 uschar *ptr = *ptrptr;
1123     int start_count = *count;
1124     int hwm_count = start_count;
1125     BOOL dup_parens = FALSE;
1126 nigel 93
1127 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1128 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1129    
1130     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1131     {
1132     if (ptr[1] == CHAR_QUESTION_MARK &&
1133 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1134 ph10 408 {
1135     ptr += 3;
1136 ph10 411 dup_parens = TRUE;
1137     }
1138 ph10 408
1139     /* Handle a normal, unnamed capturing parenthesis */
1140 ph10 411
1141 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1142     {
1143     *count += 1;
1144     if (name == NULL && *count == lorn) return *count;
1145 ph10 411 ptr++;
1146 ph10 408 }
1147    
1148     /* Handle a condition. If it is an assertion, just carry on so that it
1149     is processed as normal. If not, skip to the closing parenthesis of the
1150 ph10 411 condition (there can't be any nested parens. */
1151    
1152 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1153     {
1154 ph10 411 ptr += 2;
1155 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1156     {
1157     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1158 ph10 411 if (*ptr != 0) ptr++;
1159 ph10 408 }
1160 ph10 411 }
1161    
1162 ph10 408 /* We have either (? or (* and not a condition */
1163    
1164     else
1165 ph10 411 {
1166 ph10 408 ptr += 2;
1167     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1168    
1169     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1170 ph10 411
1171 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1172     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1173     {
1174     int term;
1175     const uschar *thisname;
1176     *count += 1;
1177     if (name == NULL && *count == lorn) return *count;
1178     term = *ptr++;
1179     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1180     thisname = ptr;
1181     while (*ptr != term) ptr++;
1182     if (name != NULL && lorn == ptr - thisname &&
1183     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1184     return *count;
1185 ph10 461 term++;
1186 ph10 411 }
1187 ph10 408 }
1188 ph10 411 }
1189 ph10 408
1190 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1191 ph10 408 bars. */
1192    
1193 nigel 91 for (; *ptr != 0; ptr++)
1194     {
1195 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1196    
1197 ph10 391 if (*ptr == CHAR_BACKSLASH)
1198 nigel 93 {
1199 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1200 ph10 391 if (*ptr == CHAR_Q) for (;;)
1201 nigel 93 {
1202 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1203 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1204 ph10 391 if (*(++ptr) == CHAR_E) break;
1205 nigel 93 }
1206     continue;
1207     }
1208    
1209 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1210     are handled for real. If the first character is '^', skip it. Also, if the
1211     first few characters (either before or after ^) are \Q\E or \E we skip them
1212 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1213 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1214 nigel 93
1215 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1216 nigel 93 {
1217 ph10 340 BOOL negate_class = FALSE;
1218     for (;;)
1219     {
1220 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1221 ph10 340 {
1222 ph10 438 if (ptr[2] == CHAR_E)
1223     ptr+= 2;
1224     else if (strncmp((const char *)ptr+2,
1225 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1226 ph10 438 ptr += 4;
1227 ph10 392 else
1228 ph10 391 break;
1229 ph10 340 }
1230 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1231 ph10 461 {
1232 ph10 340 negate_class = TRUE;
1233 ph10 438 ptr++;
1234 ph10 461 }
1235 ph10 340 else break;
1236     }
1237    
1238     /* If the next character is ']', it is a data character that must be
1239 ph10 341 skipped, except in JavaScript compatibility mode. */
1240 ph10 345
1241 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1242 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1243 ph10 345 ptr++;
1244    
1245 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1246 nigel 93 {
1247 ph10 220 if (*ptr == 0) return -1;
1248 ph10 391 if (*ptr == CHAR_BACKSLASH)
1249 nigel 93 {
1250 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1251 ph10 391 if (*ptr == CHAR_Q) for (;;)
1252 nigel 93 {
1253 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1254 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1255 ph10 391 if (*(++ptr) == CHAR_E) break;
1256 nigel 93 }
1257     continue;
1258     }
1259     }
1260     continue;
1261     }
1262    
1263     /* Skip comments in /x mode */
1264    
1265 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1266 nigel 93 {
1267 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1268 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1269 nigel 93 continue;
1270     }
1271    
1272 ph10 408 /* Check for the special metacharacters */
1273 ph10 411
1274 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1275 nigel 93 {
1276 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1277     if (rc > 0) return rc;
1278     if (*ptr == 0) goto FAIL_EXIT;
1279 nigel 93 }
1280 ph10 411
1281 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1282     {
1283     if (dup_parens && *count < hwm_count) *count = hwm_count;
1284     *ptrptr = ptr;
1285     return -1;
1286     }
1287 ph10 411
1288     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1289 ph10 408 {
1290     if (*count > hwm_count) hwm_count = *count;
1291     *count = start_count;
1292 ph10 411 }
1293 ph10 408 }
1294 nigel 93
1295 ph10 408 FAIL_EXIT:
1296     *ptrptr = ptr;
1297     return -1;
1298     }
1299 nigel 93
1300    
1301    
1302    
1303 ph10 408 /*************************************************
1304     * Find forward referenced subpattern *
1305     *************************************************/
1306 nigel 93
1307 ph10 408 /* This function scans along a pattern's text looking for capturing
1308     subpatterns, and counting them. If it finds a named pattern that matches the
1309     name it is given, it returns its number. Alternatively, if the name is NULL, it
1310     returns when it reaches a given numbered subpattern. This is used for forward
1311     references to subpatterns. We used to be able to start this scan from the
1312     current compiling point, using the current count value from cd->bracount, and
1313     do it all in a single loop, but the addition of the possibility of duplicate
1314     subpattern numbers means that we have to scan from the very start, in order to
1315     take account of such duplicates, and to use a recursive function to keep track
1316     of the different types of group.
1317    
1318     Arguments:
1319     cd compile background data
1320     name name to seek, or NULL if seeking a numbered subpattern
1321     lorn name length, or subpattern number if name is NULL
1322     xmode TRUE if we are in /x mode
1323    
1324     Returns: the number of the found subpattern, or -1 if not found
1325     */
1326    
1327     static int
1328     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1329     {
1330     uschar *ptr = (uschar *)cd->start_pattern;
1331     int count = 0;
1332     int rc;
1333    
1334     /* If the pattern does not start with an opening parenthesis, the first call
1335     to find_parens_sub() will scan right to the end (if necessary). However, if it
1336     does start with a parenthesis, find_parens_sub() will return when it hits the
1337     matching closing parens. That is why we have to have a loop. */
1338    
1339 ph10 411 for (;;)
1340     {
1341 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1342 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1343     }
1344    
1345 ph10 408 return rc;
1346 nigel 91 }
1347    
1348    
1349    
1350 ph10 408
1351 nigel 91 /*************************************************
1352 nigel 77 * Find first significant op code *
1353     *************************************************/
1354    
1355     /* This is called by several functions that scan a compiled expression looking
1356     for a fixed first character, or an anchoring op code etc. It skips over things
1357     that do not influence this. For some calls, a change of option is important.
1358     For some calls, it makes sense to skip negative forward and all backward
1359     assertions, and also the \b assertion; for others it does not.
1360    
1361     Arguments:
1362     code pointer to the start of the group
1363     options pointer to external options
1364     optbit the option bit whose changing is significant, or
1365     zero if none are
1366     skipassert TRUE if certain assertions are to be skipped
1367    
1368     Returns: pointer to the first significant opcode
1369     */
1370    
1371     static const uschar*
1372     first_significant_code(const uschar *code, int *options, int optbit,
1373     BOOL skipassert)
1374     {
1375     for (;;)
1376     {
1377     switch ((int)*code)
1378     {
1379     case OP_OPT:
1380     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1381     *options = (int)code[1];
1382     code += 2;
1383     break;
1384    
1385     case OP_ASSERT_NOT:
1386     case OP_ASSERTBACK:
1387     case OP_ASSERTBACK_NOT:
1388     if (!skipassert) return code;
1389     do code += GET(code, 1); while (*code == OP_ALT);
1390     code += _pcre_OP_lengths[*code];
1391     break;
1392    
1393     case OP_WORD_BOUNDARY:
1394     case OP_NOT_WORD_BOUNDARY:
1395     if (!skipassert) return code;
1396     /* Fall through */
1397    
1398     case OP_CALLOUT:
1399     case OP_CREF:
1400 ph10 459 case OP_NCREF:
1401 nigel 93 case OP_RREF:
1402 ph10 459 case OP_NRREF:
1403 nigel 93 case OP_DEF:
1404 nigel 77 code += _pcre_OP_lengths[*code];
1405     break;
1406    
1407     default:
1408     return code;
1409     }
1410     }
1411     /* Control never reaches here */
1412     }
1413    
1414    
1415    
1416    
1417     /*************************************************
1418 ph10 454 * Find the fixed length of a branch *
1419 nigel 77 *************************************************/
1420    
1421 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1422 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1423 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1424 ph10 454 temporarily terminated with OP_END when this function is called.
1425 nigel 77
1426 ph10 461 This function is called when a backward assertion is encountered, so that if it
1427     fails, the error message can point to the correct place in the pattern.
1428 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1429 ph10 461 because they can be forward references. We solve this by remembering this case
1430 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1431    
1432 nigel 77 Arguments:
1433     code points to the start of the pattern (the bracket)
1434     options the compiling options
1435 ph10 461 atend TRUE if called when the pattern is complete
1436     cd the "compile data" structure
1437 nigel 77
1438 ph10 461 Returns: the fixed length,
1439 ph10 454 or -1 if there is no fixed length,
1440 nigel 77 or -2 if \C was encountered
1441 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1442 nigel 77 */
1443    
1444     static int
1445 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1446 nigel 77 {
1447     int length = -1;
1448    
1449     register int branchlength = 0;
1450     register uschar *cc = code + 1 + LINK_SIZE;
1451    
1452     /* Scan along the opcodes for this branch. If we get to the end of the
1453     branch, check the length against that of the other branches. */
1454    
1455     for (;;)
1456     {
1457     int d;
1458 ph10 454 uschar *ce, *cs;
1459 nigel 77 register int op = *cc;
1460     switch (op)
1461     {
1462 nigel 93 case OP_CBRA:
1463 nigel 77 case OP_BRA:
1464     case OP_ONCE:
1465     case OP_COND:
1466 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1467 nigel 77 if (d < 0) return d;
1468     branchlength += d;
1469     do cc += GET(cc, 1); while (*cc == OP_ALT);
1470     cc += 1 + LINK_SIZE;
1471     break;
1472    
1473     /* Reached end of a branch; if it's a ket it is the end of a nested
1474     call. If it's ALT it is an alternation in a nested call. If it is
1475     END it's the end of the outer call. All can be handled by the same code. */
1476    
1477     case OP_ALT:
1478     case OP_KET:
1479     case OP_KETRMAX:
1480     case OP_KETRMIN:
1481     case OP_END:
1482     if (length < 0) length = branchlength;
1483     else if (length != branchlength) return -1;
1484     if (*cc != OP_ALT) return length;
1485     cc += 1 + LINK_SIZE;
1486     branchlength = 0;
1487     break;
1488 ph10 461
1489 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1490     be OK. If the subroutine is a forward reference, we can't deal with
1491     it until the end of the pattern, so return -3. */
1492 ph10 461
1493 ph10 454 case OP_RECURSE:
1494     if (!atend) return -3;
1495     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1496     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1497     if (cc > cs && cc < ce) return -1; /* Recursion */
1498     d = find_fixedlength(cs + 2, options, atend, cd);
1499 ph10 461 if (d < 0) return d;
1500 ph10 454 branchlength += d;
1501     cc += 1 + LINK_SIZE;
1502 ph10 461 break;
1503 nigel 77
1504     /* Skip over assertive subpatterns */
1505    
1506     case OP_ASSERT:
1507     case OP_ASSERT_NOT:
1508     case OP_ASSERTBACK:
1509     case OP_ASSERTBACK_NOT:
1510     do cc += GET(cc, 1); while (*cc == OP_ALT);
1511     /* Fall through */
1512    
1513     /* Skip over things that don't match chars */
1514    
1515     case OP_REVERSE:
1516     case OP_CREF:
1517 ph10 459 case OP_NCREF:
1518 nigel 93 case OP_RREF:
1519 ph10 459 case OP_NRREF:
1520 nigel 93 case OP_DEF:
1521 nigel 77 case OP_OPT:
1522     case OP_CALLOUT:
1523     case OP_SOD:
1524     case OP_SOM:
1525 ph10 500 case OP_SET_SOM:
1526 nigel 77 case OP_EOD:
1527     case OP_EODN:
1528     case OP_CIRC:
1529     case OP_DOLL:
1530     case OP_NOT_WORD_BOUNDARY:
1531     case OP_WORD_BOUNDARY:
1532     cc += _pcre_OP_lengths[*cc];
1533     break;
1534    
1535     /* Handle literal characters */
1536    
1537     case OP_CHAR:
1538     case OP_CHARNC:
1539 nigel 91 case OP_NOT:
1540 nigel 77 branchlength++;
1541     cc += 2;
1542     #ifdef SUPPORT_UTF8
1543 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1544 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1545 nigel 77 #endif
1546     break;
1547    
1548     /* Handle exact repetitions. The count is already in characters, but we
1549     need to skip over a multibyte character in UTF8 mode. */
1550    
1551     case OP_EXACT:
1552     branchlength += GET2(cc,1);
1553     cc += 4;
1554     #ifdef SUPPORT_UTF8
1555 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1556 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1557 nigel 77 #endif
1558     break;
1559    
1560     case OP_TYPEEXACT:
1561     branchlength += GET2(cc,1);
1562 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1563 nigel 77 cc += 4;
1564     break;
1565    
1566     /* Handle single-char matchers */
1567    
1568     case OP_PROP:
1569     case OP_NOTPROP:
1570 nigel 87 cc += 2;
1571 nigel 77 /* Fall through */
1572    
1573     case OP_NOT_DIGIT:
1574     case OP_DIGIT:
1575     case OP_NOT_WHITESPACE:
1576     case OP_WHITESPACE:
1577     case OP_NOT_WORDCHAR:
1578     case OP_WORDCHAR:
1579     case OP_ANY:
1580 ph10 342 case OP_ALLANY:
1581 nigel 77 branchlength++;
1582     cc++;
1583     break;
1584    
1585     /* The single-byte matcher isn't allowed */
1586    
1587     case OP_ANYBYTE:
1588     return -2;
1589    
1590     /* Check a class for variable quantification */
1591    
1592     #ifdef SUPPORT_UTF8
1593     case OP_XCLASS:
1594     cc += GET(cc, 1) - 33;
1595     /* Fall through */
1596     #endif
1597    
1598     case OP_CLASS:
1599     case OP_NCLASS:
1600     cc += 33;
1601    
1602     switch (*cc)
1603     {
1604     case OP_CRSTAR:
1605     case OP_CRMINSTAR:
1606     case OP_CRQUERY:
1607     case OP_CRMINQUERY:
1608     return -1;
1609    
1610     case OP_CRRANGE:
1611     case OP_CRMINRANGE:
1612     if (GET2(cc,1) != GET2(cc,3)) return -1;
1613     branchlength += GET2(cc,1);
1614     cc += 5;
1615     break;
1616    
1617     default:
1618     branchlength++;
1619     }
1620     break;
1621    
1622     /* Anything else is variable length */
1623    
1624     default:
1625     return -1;
1626     }
1627     }
1628     /* Control never gets here */
1629     }
1630    
1631    
1632    
1633    
1634     /*************************************************
1635 ph10 454 * Scan compiled regex for specific bracket *
1636 nigel 77 *************************************************/
1637    
1638     /* This little function scans through a compiled pattern until it finds a
1639 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1640 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1641     so that it can be called from pcre_study() when finding the minimum matching
1642 ph10 455 length.
1643 nigel 77
1644     Arguments:
1645     code points to start of expression
1646     utf8 TRUE in UTF-8 mode
1647 ph10 454 number the required bracket number or negative to find a lookbehind
1648 nigel 77
1649     Returns: pointer to the opcode for the bracket, or NULL if not found
1650     */
1651    
1652 ph10 455 const uschar *
1653     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1654 nigel 77 {
1655     for (;;)
1656     {
1657     register int c = *code;
1658     if (c == OP_END) return NULL;
1659 nigel 91
1660     /* XCLASS is used for classes that cannot be represented just by a bit
1661     map. This includes negated single high-valued characters. The length in
1662     the table is zero; the actual length is stored in the compiled code. */
1663    
1664     if (c == OP_XCLASS) code += GET(code, 1);
1665 ph10 461
1666 ph10 454 /* Handle recursion */
1667 ph10 461
1668 ph10 454 else if (c == OP_REVERSE)
1669     {
1670 ph10 461 if (number < 0) return (uschar *)code;
1671 ph10 454 code += _pcre_OP_lengths[c];
1672     }
1673 nigel 91
1674 nigel 93 /* Handle capturing bracket */
1675 nigel 91
1676 nigel 93 else if (c == OP_CBRA)
1677 nigel 77 {
1678 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1679 nigel 77 if (n == number) return (uschar *)code;
1680 nigel 93 code += _pcre_OP_lengths[c];
1681 nigel 77 }
1682 nigel 91
1683 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1684     repeated character types, we have to test for \p and \P, which have an extra
1685 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1686 ph10 510 must add in its length. */
1687 nigel 91
1688 nigel 77 else
1689     {
1690 ph10 218 switch(c)
1691     {
1692     case OP_TYPESTAR:
1693     case OP_TYPEMINSTAR:
1694     case OP_TYPEPLUS:
1695     case OP_TYPEMINPLUS:
1696     case OP_TYPEQUERY:
1697     case OP_TYPEMINQUERY:
1698     case OP_TYPEPOSSTAR:
1699     case OP_TYPEPOSPLUS:
1700     case OP_TYPEPOSQUERY:
1701     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1702 ph10 220 break;
1703 ph10 221
1704     case OP_TYPEUPTO:
1705     case OP_TYPEMINUPTO:
1706     case OP_TYPEEXACT:
1707     case OP_TYPEPOSUPTO:
1708     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1709     break;
1710 ph10 512
1711 ph10 510 case OP_MARK:
1712     case OP_PRUNE_ARG:
1713     case OP_SKIP_ARG:
1714     case OP_THEN_ARG:
1715     code += code[1];
1716 ph10 512 break;
1717 ph10 220 }
1718    
1719 ph10 218 /* Add in the fixed length from the table */
1720 ph10 220
1721 nigel 77 code += _pcre_OP_lengths[c];
1722 ph10 220
1723 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1724     a multi-byte character. The length in the table is a minimum, so we have to
1725     arrange to skip the extra bytes. */
1726 ph10 220
1727 ph10 107 #ifdef SUPPORT_UTF8
1728 nigel 77 if (utf8) switch(c)
1729     {
1730     case OP_CHAR:
1731     case OP_CHARNC:
1732     case OP_EXACT:
1733     case OP_UPTO:
1734     case OP_MINUPTO:
1735 nigel 93 case OP_POSUPTO:
1736 nigel 77 case OP_STAR:
1737     case OP_MINSTAR:
1738 nigel 93 case OP_POSSTAR:
1739 nigel 77 case OP_PLUS:
1740     case OP_MINPLUS:
1741 nigel 93 case OP_POSPLUS:
1742 nigel 77 case OP_QUERY:
1743     case OP_MINQUERY:
1744 nigel 93 case OP_POSQUERY:
1745     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1746 nigel 77 break;
1747     }
1748 ph10 369 #else
1749     (void)(utf8); /* Keep compiler happy by referencing function argument */
1750 ph10 111 #endif
1751 nigel 77 }
1752     }
1753     }
1754    
1755    
1756    
1757     /*************************************************
1758     * Scan compiled regex for recursion reference *
1759     *************************************************/
1760    
1761     /* This little function scans through a compiled pattern until it finds an
1762     instance of OP_RECURSE.
1763    
1764     Arguments:
1765     code points to start of expression
1766     utf8 TRUE in UTF-8 mode
1767    
1768     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1769     */
1770    
1771     static const uschar *
1772     find_recurse(const uschar *code, BOOL utf8)
1773     {
1774     for (;;)
1775     {
1776     register int c = *code;
1777     if (c == OP_END) return NULL;
1778 nigel 91 if (c == OP_RECURSE) return code;
1779 ph10 220
1780 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1781     map. This includes negated single high-valued characters. The length in
1782     the table is zero; the actual length is stored in the compiled code. */
1783    
1784     if (c == OP_XCLASS) code += GET(code, 1);
1785    
1786 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1787     repeated character types, we have to test for \p and \P, which have an extra
1788 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1789 ph10 510 must add in its length. */
1790 nigel 91
1791 nigel 77 else
1792     {
1793 ph10 218 switch(c)
1794     {
1795     case OP_TYPESTAR:
1796     case OP_TYPEMINSTAR:
1797     case OP_TYPEPLUS:
1798     case OP_TYPEMINPLUS:
1799     case OP_TYPEQUERY:
1800     case OP_TYPEMINQUERY:
1801     case OP_TYPEPOSSTAR:
1802     case OP_TYPEPOSPLUS:
1803     case OP_TYPEPOSQUERY:
1804     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1805 ph10 220 break;
1806 ph10 221
1807     case OP_TYPEPOSUPTO:
1808     case OP_TYPEUPTO:
1809     case OP_TYPEMINUPTO:
1810     case OP_TYPEEXACT:
1811     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1812     break;
1813 ph10 512
1814 ph10 510 case OP_MARK:
1815     case OP_PRUNE_ARG:
1816     case OP_SKIP_ARG:
1817     case OP_THEN_ARG:
1818     code += code[1];
1819 ph10 512 break;
1820 ph10 220 }
1821    
1822 ph10 218 /* Add in the fixed length from the table */
1823    
1824 nigel 77 code += _pcre_OP_lengths[c];
1825 ph10 220
1826 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1827     by a multi-byte character. The length in the table is a minimum, so we have
1828     to arrange to skip the extra bytes. */
1829 ph10 220
1830 ph10 107 #ifdef SUPPORT_UTF8
1831 nigel 77 if (utf8) switch(c)
1832     {
1833     case OP_CHAR:
1834     case OP_CHARNC:
1835     case OP_EXACT:
1836     case OP_UPTO:
1837     case OP_MINUPTO:
1838 nigel 93 case OP_POSUPTO:
1839 nigel 77 case OP_STAR:
1840     case OP_MINSTAR:
1841 nigel 93 case OP_POSSTAR:
1842 nigel 77 case OP_PLUS:
1843     case OP_MINPLUS:
1844 nigel 93 case OP_POSPLUS:
1845 nigel 77 case OP_QUERY:
1846     case OP_MINQUERY:
1847 nigel 93 case OP_POSQUERY:
1848     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1849 nigel 77 break;
1850     }
1851 ph10 369 #else
1852     (void)(utf8); /* Keep compiler happy by referencing function argument */
1853 ph10 111 #endif
1854 nigel 77 }
1855     }
1856     }
1857    
1858    
1859    
1860     /*************************************************
1861     * Scan compiled branch for non-emptiness *
1862     *************************************************/
1863    
1864     /* This function scans through a branch of a compiled pattern to see whether it
1865 nigel 93 can match the empty string or not. It is called from could_be_empty()
1866     below and from compile_branch() when checking for an unlimited repeat of a
1867     group that can match nothing. Note that first_significant_code() skips over
1868 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1869     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1870     bracket whose current branch will already have been scanned.
1871 nigel 77
1872     Arguments:
1873     code points to start of search
1874     endcode points to where to stop
1875     utf8 TRUE if in UTF8 mode
1876 ph10 503 cd contains pointers to tables etc.
1877 nigel 77
1878     Returns: TRUE if what is matched could be empty
1879     */
1880    
1881     static BOOL
1882 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1883     compile_data *cd)
1884 nigel 77 {
1885     register int c;
1886 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1887 nigel 77 code < endcode;
1888     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1889     {
1890     const uschar *ccode;
1891    
1892     c = *code;
1893 ph10 507
1894 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1895 ph10 282 first_significant_code() with a TRUE final argument. */
1896 ph10 286
1897 ph10 282 if (c == OP_ASSERT)
1898 ph10 286 {
1899 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1900     c = *code;
1901     continue;
1902 ph10 286 }
1903 ph10 172
1904 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1905 nigel 77
1906 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1907 ph10 170 {
1908 ph10 172 code += _pcre_OP_lengths[c];
1909 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1910     c = *code;
1911     continue;
1912     }
1913 ph10 507
1914 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
1915     implies a subroutine call, we can scan it. */
1916 ph10 507
1917 ph10 503 if (c == OP_RECURSE)
1918     {
1919 ph10 507 BOOL empty_branch = FALSE;
1920 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1921     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1922     do
1923     {
1924 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1925     {
1926     empty_branch = TRUE;
1927 ph10 507 break;
1928     }
1929 ph10 503 scode += GET(scode, 1);
1930     }
1931     while (*scode == OP_ALT);
1932 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1933 ph10 503 continue;
1934 ph10 507 }
1935 ph10 170
1936     /* For other groups, scan the branches. */
1937 ph10 172
1938 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1939 nigel 77 {
1940     BOOL empty_branch;
1941     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1942 ph10 406
1943     /* If a conditional group has only one branch, there is a second, implied,
1944 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1945     Otherwise, scan the individual branches of the group. */
1946 ph10 406
1947 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1948 nigel 77 code += GET(code, 1);
1949 ph10 395 else
1950 ph10 406 {
1951 ph10 395 empty_branch = FALSE;
1952     do
1953     {
1954 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1955 ph10 395 empty_branch = TRUE;
1956     code += GET(code, 1);
1957     }
1958     while (*code == OP_ALT);
1959     if (!empty_branch) return FALSE; /* All branches are non-empty */
1960 nigel 77 }
1961 ph10 406
1962 ph10 172 c = *code;
1963 nigel 93 continue;
1964 nigel 77 }
1965    
1966 nigel 93 /* Handle the other opcodes */
1967    
1968     switch (c)
1969 nigel 77 {
1970 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1971     cannot be represented just by a bit map. This includes negated single
1972     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1973 ph10 220 actual length is stored in the compiled code, so we must update "code"
1974 ph10 216 here. */
1975 nigel 77
1976     #ifdef SUPPORT_UTF8
1977     case OP_XCLASS:
1978 ph10 216 ccode = code += GET(code, 1);
1979 nigel 77 goto CHECK_CLASS_REPEAT;
1980     #endif
1981    
1982     case OP_CLASS:
1983     case OP_NCLASS:
1984     ccode = code + 33;
1985    
1986     #ifdef SUPPORT_UTF8
1987     CHECK_CLASS_REPEAT:
1988     #endif
1989    
1990     switch (*ccode)
1991     {
1992     case OP_CRSTAR: /* These could be empty; continue */
1993     case OP_CRMINSTAR:
1994     case OP_CRQUERY:
1995     case OP_CRMINQUERY:
1996     break;
1997    
1998     default: /* Non-repeat => class must match */
1999     case OP_CRPLUS: /* These repeats aren't empty */
2000     case OP_CRMINPLUS:
2001     return FALSE;
2002    
2003     case OP_CRRANGE:
2004     case OP_CRMINRANGE:
2005     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2006     break;
2007     }
2008     break;
2009    
2010     /* Opcodes that must match a character */
2011    
2012     case OP_PROP:
2013     case OP_NOTPROP:
2014     case OP_EXTUNI:
2015     case OP_NOT_DIGIT:
2016     case OP_DIGIT:
2017     case OP_NOT_WHITESPACE:
2018     case OP_WHITESPACE:
2019     case OP_NOT_WORDCHAR:
2020     case OP_WORDCHAR:
2021     case OP_ANY:
2022 ph10 345 case OP_ALLANY:
2023 nigel 77 case OP_ANYBYTE:
2024     case OP_CHAR:
2025     case OP_CHARNC:
2026     case OP_NOT:
2027     case OP_PLUS:
2028     case OP_MINPLUS:
2029 nigel 93 case OP_POSPLUS:
2030 nigel 77 case OP_EXACT:
2031     case OP_NOTPLUS:
2032     case OP_NOTMINPLUS:
2033 nigel 93 case OP_NOTPOSPLUS:
2034 nigel 77 case OP_NOTEXACT:
2035     case OP_TYPEPLUS:
2036     case OP_TYPEMINPLUS:
2037 nigel 93 case OP_TYPEPOSPLUS:
2038 nigel 77 case OP_TYPEEXACT:
2039     return FALSE;
2040 ph10 227
2041     /* These are going to continue, as they may be empty, but we have to
2042     fudge the length for the \p and \P cases. */
2043    
2044 ph10 224 case OP_TYPESTAR:
2045     case OP_TYPEMINSTAR:
2046     case OP_TYPEPOSSTAR:
2047     case OP_TYPEQUERY:
2048     case OP_TYPEMINQUERY:
2049     case OP_TYPEPOSQUERY:
2050     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2051 ph10 227 break;
2052    
2053 ph10 224 /* Same for these */
2054 ph10 227
2055 ph10 224 case OP_TYPEUPTO:
2056     case OP_TYPEMINUPTO:
2057     case OP_TYPEPOSUPTO:
2058     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2059     break;
2060 nigel 77
2061     /* End of branch */
2062    
2063     case OP_KET:
2064     case OP_KETRMAX:
2065     case OP_KETRMIN:
2066     case OP_ALT:
2067     return TRUE;
2068    
2069 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2070     MINUPTO, and POSUPTO may be followed by a multibyte character */
2071 nigel 77
2072     #ifdef SUPPORT_UTF8
2073     case OP_STAR:
2074     case OP_MINSTAR:
2075 nigel 93 case OP_POSSTAR:
2076 nigel 77 case OP_QUERY:
2077     case OP_MINQUERY:
2078 nigel 93 case OP_POSQUERY:
2079 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2080     break;
2081 ph10 461
2082 nigel 77 case OP_UPTO:
2083     case OP_MINUPTO:
2084 nigel 93 case OP_POSUPTO:
2085 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2086 nigel 77 break;
2087     #endif
2088 ph10 503
2089 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2090     string. */
2091    
2092     case OP_MARK:
2093     case OP_PRUNE_ARG:
2094     case OP_SKIP_ARG:
2095     case OP_THEN_ARG:
2096     code += code[1];
2097 ph10 512 break;
2098 ph10 510
2099 ph10 503 /* None of the remaining opcodes are required to match a character. */
2100 ph10 507
2101 ph10 503 default:
2102 ph10 507 break;
2103 nigel 77 }
2104     }
2105    
2106     return TRUE;
2107     }
2108    
2109    
2110    
2111     /*************************************************
2112     * Scan compiled regex for non-emptiness *
2113     *************************************************/
2114    
2115     /* This function is called to check for left recursive calls. We want to check
2116     the current branch of the current pattern to see if it could match the empty
2117     string. If it could, we must look outwards for branches at other levels,
2118     stopping when we pass beyond the bracket which is the subject of the recursion.
2119    
2120     Arguments:
2121     code points to start of the recursion
2122     endcode points to where to stop (current RECURSE item)
2123     bcptr points to the chain of current (unclosed) branch starts
2124     utf8 TRUE if in UTF-8 mode
2125 ph10 507 cd pointers to tables etc
2126 nigel 77
2127     Returns: TRUE if what is matched could be empty
2128     */
2129    
2130     static BOOL
2131     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2132 ph10 503 BOOL utf8, compile_data *cd)
2133 nigel 77 {
2134 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2135 nigel 77 {
2136 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2137 ph10 475 return FALSE;
2138 nigel 77 bcptr = bcptr->outer;
2139     }
2140     return TRUE;
2141     }
2142    
2143    
2144    
2145     /*************************************************
2146     * Check for POSIX class syntax *
2147     *************************************************/
2148    
2149     /* This function is called when the sequence "[:" or "[." or "[=" is
2150 ph10 295 encountered in a character class. It checks whether this is followed by a
2151 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2152 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2153 nigel 77
2154 ph10 298 Originally, this function only recognized a sequence of letters between the
2155     terminators, but it seems that Perl recognizes any sequence of characters,
2156     though of course unknown POSIX names are subsequently rejected. Perl gives an
2157     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2158     didn't consider this to be a POSIX class. Likewise for [:1234:].
2159 ph10 295
2160 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2161     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2162     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2163     below handles the special case of \], but does not try to do any other escape
2164     processing. This makes it different from Perl for cases such as [:l\ower:]
2165 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2166 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2167 ph10 295 I think.
2168    
2169     Arguments:
2170 nigel 77 ptr pointer to the initial [
2171     endptr where to return the end pointer
2172    
2173     Returns: TRUE or FALSE
2174     */
2175    
2176     static BOOL
2177 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2178 nigel 77 {
2179     int terminator; /* Don't combine these lines; the Solaris cc */
2180     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2181 ph10 295 for (++ptr; *ptr != 0; ptr++)
2182 nigel 77 {
2183 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2184 ph10 298 {
2185 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2186     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2187 ph10 295 {
2188     *endptr = ptr;
2189     return TRUE;
2190 ph10 298 }
2191     }
2192     }
2193 nigel 77 return FALSE;
2194     }
2195    
2196    
2197    
2198    
2199     /*************************************************
2200     * Check POSIX class name *
2201     *************************************************/
2202    
2203     /* This function is called to check the name given in a POSIX-style class entry
2204     such as [:alnum:].
2205    
2206     Arguments:
2207     ptr points to the first letter
2208     len the length of the name
2209    
2210     Returns: a value representing the name, or -1 if unknown
2211     */
2212    
2213     static int
2214     check_posix_name(const uschar *ptr, int len)
2215     {
2216 ph10 240 const char *pn = posix_names;
2217 nigel 77 register int yield = 0;
2218     while (posix_name_lengths[yield] != 0)
2219     {
2220     if (len == posix_name_lengths[yield] &&
2221 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2222 ph10 243 pn += posix_name_lengths[yield] + 1;
2223 nigel 77 yield++;
2224     }
2225     return -1;
2226     }
2227    
2228    
2229     /*************************************************
2230     * Adjust OP_RECURSE items in repeated group *
2231     *************************************************/
2232    
2233     /* OP_RECURSE items contain an offset from the start of the regex to the group
2234     that is referenced. This means that groups can be replicated for fixed
2235     repetition simply by copying (because the recursion is allowed to refer to
2236     earlier groups that are outside the current group). However, when a group is
2237 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2238     inserted before it, after it has been compiled. This means that any OP_RECURSE
2239     items within it that refer to the group itself or any contained groups have to
2240     have their offsets adjusted. That is one of the jobs of this function. Before it
2241     is called, the partially compiled regex must be temporarily terminated with
2242     OP_END.
2243 nigel 77
2244 nigel 93 This function has been extended with the possibility of forward references for
2245     recursions and subroutine calls. It must also check the list of such references
2246     for the group we are dealing with. If it finds that one of the recursions in
2247     the current group is on this list, it adjusts the offset in the list, not the
2248     value in the reference (which is a group number).
2249    
2250 nigel 77 Arguments:
2251     group points to the start of the group
2252     adjust the amount by which the group is to be moved
2253     utf8 TRUE in UTF-8 mode
2254     cd contains pointers to tables etc.
2255 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2256 nigel 77
2257     Returns: nothing
2258     */
2259    
2260     static void
2261 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2262     uschar *save_hwm)
2263 nigel 77 {
2264     uschar *ptr = group;
2265 ph10 224
2266 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2267     {
2268 nigel 93 int offset;
2269     uschar *hc;
2270    
2271     /* See if this recursion is on the forward reference list. If so, adjust the
2272     reference. */
2273 ph10 345
2274 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2275     {
2276     offset = GET(hc, 0);
2277     if (cd->start_code + offset == ptr + 1)
2278     {
2279     PUT(hc, 0, offset + adjust);
2280     break;
2281     }
2282     }
2283    
2284     /* Otherwise, adjust the recursion offset if it's after the start of this
2285     group. */
2286    
2287     if (hc >= cd->hwm)
2288     {
2289     offset = GET(ptr, 1);
2290     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2291     }
2292    
2293 nigel 77 ptr += 1 + LINK_SIZE;
2294     }
2295     }
2296    
2297    
2298    
2299     /*************************************************
2300     * Insert an automatic callout point *
2301     *************************************************/
2302    
2303     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2304     callout points before each pattern item.
2305    
2306     Arguments:
2307     code current code pointer
2308     ptr current pattern pointer
2309     cd pointers to tables etc
2310    
2311     Returns: new code pointer
2312     */
2313    
2314     static uschar *
2315     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2316     {
2317     *code++ = OP_CALLOUT;
2318     *code++ = 255;
2319     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2320     PUT(code, LINK_SIZE, 0); /* Default length */
2321     return code + 2*LINK_SIZE;
2322     }
2323    
2324    
2325    
2326     /*************************************************
2327     * Complete a callout item *
2328     *************************************************/
2329    
2330     /* A callout item contains the length of the next item in the pattern, which
2331     we can't fill in till after we have reached the relevant point. This is used
2332     for both automatic and manual callouts.
2333    
2334     Arguments:
2335     previous_callout points to previous callout item
2336     ptr current pattern pointer
2337     cd pointers to tables etc
2338    
2339     Returns: nothing
2340     */
2341    
2342     static void
2343     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2344     {
2345     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2346     PUT(previous_callout, 2 + LINK_SIZE, length);
2347     }
2348    
2349    
2350    
2351     #ifdef SUPPORT_UCP
2352     /*************************************************
2353     * Get othercase range *
2354     *************************************************/
2355    
2356     /* This function is passed the start and end of a class range, in UTF-8 mode
2357     with UCP support. It searches up the characters, looking for internal ranges of
2358     characters in the "other" case. Each call returns the next one, updating the
2359     start address.
2360    
2361     Arguments:
2362     cptr points to starting character value; updated
2363     d end value
2364     ocptr where to put start of othercase range
2365     odptr where to put end of othercase range
2366    
2367     Yield: TRUE when range returned; FALSE when no more
2368     */
2369    
2370     static BOOL
2371 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2372     unsigned int *odptr)
2373 nigel 77 {
2374 nigel 93 unsigned int c, othercase, next;
2375 nigel 77
2376     for (c = *cptr; c <= d; c++)
2377 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2378 nigel 77
2379     if (c > d) return FALSE;
2380    
2381     *ocptr = othercase;
2382     next = othercase + 1;
2383    
2384     for (++c; c <= d; c++)
2385     {
2386 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2387 nigel 77 next++;
2388     }
2389    
2390     *odptr = next - 1;
2391     *cptr = c;
2392    
2393     return TRUE;
2394     }
2395     #endif /* SUPPORT_UCP */
2396    
2397    
2398 nigel 93
2399 nigel 77 /*************************************************
2400 nigel 93 * Check if auto-possessifying is possible *
2401     *************************************************/
2402    
2403     /* This function is called for unlimited repeats of certain items, to see
2404     whether the next thing could possibly match the repeated item. If not, it makes
2405     sense to automatically possessify the repeated item.
2406    
2407     Arguments:
2408     op_code the repeated op code
2409     item data for this item, depends on the opcode
2410     utf8 TRUE in UTF-8 mode
2411     utf8_char used for utf8 character bytes, NULL if not relevant
2412     ptr next character in pattern
2413     options options bits
2414     cd contains pointers to tables etc.
2415    
2416     Returns: TRUE if possessifying is wanted
2417     */
2418    
2419     static BOOL
2420     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2421     const uschar *ptr, int options, compile_data *cd)
2422     {
2423     int next;
2424    
2425     /* Skip whitespace and comments in extended mode */
2426    
2427     if ((options & PCRE_EXTENDED) != 0)
2428     {
2429     for (;;)
2430     {
2431     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2432 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2433 nigel 93 {
2434     while (*(++ptr) != 0)
2435     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2436     }
2437     else break;
2438     }
2439     }
2440    
2441     /* If the next item is one that we can handle, get its value. A non-negative
2442     value is a character, a negative value is an escape value. */
2443    
2444 ph10 391 if (*ptr == CHAR_BACKSLASH)
2445 nigel 93 {
2446     int temperrorcode = 0;
2447     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2448     if (temperrorcode != 0) return FALSE;
2449     ptr++; /* Point after the escape sequence */
2450     }
2451    
2452     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2453     {
2454     #ifdef SUPPORT_UTF8
2455     if (utf8) { GETCHARINC(next, ptr); } else
2456     #endif
2457     next = *ptr++;
2458     }
2459    
2460     else return FALSE;
2461    
2462     /* Skip whitespace and comments in extended mode */
2463    
2464     if ((options & PCRE_EXTENDED) != 0)
2465     {
2466     for (;;)
2467     {
2468     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2469 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2470 nigel 93 {
2471     while (*(++ptr) != 0)
2472     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2473     }
2474     else break;
2475     }
2476     }
2477    
2478     /* If the next thing is itself optional, we have to give up. */
2479    
2480 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2481 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2482     return FALSE;
2483 nigel 93
2484     /* Now compare the next item with the previous opcode. If the previous is a
2485     positive single character match, "item" either contains the character or, if
2486     "item" is greater than 127 in utf8 mode, the character's bytes are in
2487     utf8_char. */
2488    
2489    
2490     /* Handle cases when the next item is a character. */
2491    
2492     if (next >= 0) switch(op_code)
2493     {
2494     case OP_CHAR:
2495     #ifdef SUPPORT_UTF8
2496     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2497 ph10 369 #else
2498     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2499 nigel 93 #endif
2500     return item != next;
2501    
2502     /* For CHARNC (caseless character) we must check the other case. If we have
2503     Unicode property support, we can use it to test the other case of
2504     high-valued characters. */
2505    
2506     case OP_CHARNC:
2507     #ifdef SUPPORT_UTF8
2508     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2509     #endif
2510     if (item == next) return FALSE;
2511     #ifdef SUPPORT_UTF8
2512     if (utf8)
2513     {
2514     unsigned int othercase;
2515     if (next < 128) othercase = cd->fcc[next]; else
2516     #ifdef SUPPORT_UCP
2517 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2518 nigel 93 #else
2519     othercase = NOTACHAR;
2520     #endif
2521     return (unsigned int)item != othercase;
2522     }
2523     else
2524     #endif /* SUPPORT_UTF8 */
2525     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2526    
2527     /* For OP_NOT, "item" must be a single-byte character. */
2528    
2529     case OP_NOT:
2530     if (item == next) return TRUE;
2531     if ((options & PCRE_CASELESS) == 0) return FALSE;
2532     #ifdef SUPPORT_UTF8
2533     if (utf8)
2534     {
2535     unsigned int othercase;
2536     if (next < 128) othercase = cd->fcc[next]; else
2537     #ifdef SUPPORT_UCP
2538 ph10 349 othercase = UCD_OTHERCASE(next);
2539 nigel 93 #else
2540     othercase = NOTACHAR;
2541     #endif
2542     return (unsigned int)item == othercase;
2543     }
2544     else
2545     #endif /* SUPPORT_UTF8 */
2546     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2547    
2548     case OP_DIGIT:
2549     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2550    
2551     case OP_NOT_DIGIT:
2552     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2553    
2554     case OP_WHITESPACE:
2555     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2556    
2557     case OP_NOT_WHITESPACE:
2558     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2559    
2560     case OP_WORDCHAR:
2561     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2562    
2563     case OP_NOT_WORDCHAR:
2564     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2565    
2566 ph10 180 case OP_HSPACE:
2567     case OP_NOT_HSPACE:
2568     switch(next)
2569     {
2570     case 0x09:
2571     case 0x20:
2572     case 0xa0:
2573     case 0x1680:
2574     case 0x180e:
2575     case 0x2000:
2576     case 0x2001:
2577     case 0x2002:
2578     case 0x2003:
2579     case 0x2004:
2580     case 0x2005:
2581     case 0x2006:
2582     case 0x2007:
2583     case 0x2008:
2584     case 0x2009:
2585     case 0x200A:
2586     case 0x202f:
2587     case 0x205f:
2588     case 0x3000:
2589     return op_code != OP_HSPACE;
2590     default:
2591     return op_code == OP_HSPACE;
2592     }
2593    
2594     case OP_VSPACE:
2595     case OP_NOT_VSPACE:
2596     switch(next)
2597     {
2598     case 0x0a:
2599     case 0x0b:
2600     case 0x0c:
2601     case 0x0d:
2602     case 0x85:
2603     case 0x2028:
2604     case 0x2029:
2605     return op_code != OP_VSPACE;
2606     default:
2607     return op_code == OP_VSPACE;
2608     }
2609    
2610 nigel 93 default:
2611     return FALSE;
2612     }
2613    
2614    
2615     /* Handle the case when the next item is \d, \s, etc. */
2616    
2617     switch(op_code)
2618     {
2619     case OP_CHAR:
2620     case OP_CHARNC:
2621     #ifdef SUPPORT_UTF8
2622     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2623     #endif
2624     switch(-next)
2625     {
2626     case ESC_d:
2627     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2628    
2629     case ESC_D:
2630     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2631    
2632     case ESC_s:
2633     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2634    
2635     case ESC_S:
2636     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2637    
2638     case ESC_w:
2639     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2640    
2641     case ESC_W:
2642     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2643 ph10 182
2644 ph10 180 case ESC_h:
2645     case ESC_H:
2646     switch(item)
2647     {
2648     case 0x09:
2649     case 0x20:
2650     case 0xa0:
2651     case 0x1680:
2652     case 0x180e:
2653     case 0x2000:
2654     case 0x2001:
2655     case 0x2002:
2656     case 0x2003:
2657     case 0x2004:
2658     case 0x2005:
2659     case 0x2006:
2660     case 0x2007:
2661     case 0x2008:
2662     case 0x2009:
2663     case 0x200A:
2664     case 0x202f:
2665     case 0x205f:
2666     case 0x3000:
2667     return -next != ESC_h;
2668     default:
2669     return -next == ESC_h;
2670 ph10 182 }
2671    
2672 ph10 180 case ESC_v:
2673     case ESC_V:
2674     switch(item)
2675     {
2676     case 0x0a:
2677     case 0x0b:
2678     case 0x0c:
2679     case 0x0d:
2680     case 0x85:
2681     case 0x2028:
2682     case 0x2029:
2683     return -next != ESC_v;
2684     default:
2685     return -next == ESC_v;
2686 ph10 182 }
2687 nigel 93
2688     default:
2689     return FALSE;
2690     }
2691    
2692     case OP_DIGIT:
2693 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2694     next == -ESC_h || next == -ESC_v;
2695 nigel 93
2696     case OP_NOT_DIGIT:
2697     return next == -ESC_d;
2698    
2699     case OP_WHITESPACE:
2700     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2701    
2702     case OP_NOT_WHITESPACE:
2703 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2704 nigel 93
2705 ph10 180 case OP_HSPACE:
2706     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2707    
2708     case OP_NOT_HSPACE:
2709     return next == -ESC_h;
2710 ph10 182
2711 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2712 ph10 182 case OP_VSPACE:
2713 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2714    
2715     case OP_NOT_VSPACE:
2716 ph10 182 return next == -ESC_v;
2717 ph10 180
2718 nigel 93 case OP_WORDCHAR:
2719 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2720 nigel 93
2721     case OP_NOT_WORDCHAR:
2722     return next == -ESC_w || next == -ESC_d;
2723 ph10 182
2724 nigel 93 default:
2725     return FALSE;
2726     }
2727    
2728     /* Control does not reach here */
2729     }
2730    
2731    
2732    
2733     /*************************************************
2734 nigel 77 * Compile one branch *
2735     *************************************************/
2736    
2737 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2738 nigel 77 changed during the branch, the pointer is used to change the external options
2739 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2740     to find out the amount of memory needed, as well as during the real compile
2741     phase. The value of lengthptr distinguishes the two phases.
2742 nigel 77
2743     Arguments:
2744     optionsptr pointer to the option bits
2745     codeptr points to the pointer to the current code point
2746     ptrptr points to the current pattern pointer
2747     errorcodeptr points to error code variable
2748     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2749     reqbyteptr set to the last literal character required, else < 0
2750     bcptr points to current branch chain
2751     cd contains pointers to tables etc.
2752 nigel 93 lengthptr NULL during the real compile phase
2753     points to length accumulator during pre-compile phase
2754 nigel 77
2755     Returns: TRUE on success
2756     FALSE, with *errorcodeptr set non-zero on error
2757     */
2758    
2759     static BOOL
2760 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2761     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2762     compile_data *cd, int *lengthptr)
2763 nigel 77 {
2764     int repeat_type, op_type;
2765     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2766     int bravalue = 0;
2767     int greedy_default, greedy_non_default;
2768     int firstbyte, reqbyte;
2769     int zeroreqbyte, zerofirstbyte;
2770     int req_caseopt, reqvary, tempreqvary;
2771     int options = *optionsptr;
2772     int after_manual_callout = 0;
2773 nigel 93 int length_prevgroup = 0;
2774 nigel 77 register int c;
2775     register uschar *code = *codeptr;
2776 nigel 93 uschar *last_code = code;
2777     uschar *orig_code = code;
2778 nigel 77 uschar *tempcode;
2779     BOOL inescq = FALSE;
2780     BOOL groupsetfirstbyte = FALSE;
2781     const uschar *ptr = *ptrptr;
2782     const uschar *tempptr;
2783 ph10 518 const uschar *nestptr = NULL;
2784 nigel 77 uschar *previous = NULL;
2785     uschar *previous_callout = NULL;
2786 nigel 93 uschar *save_hwm = NULL;
2787 nigel 77 uschar classbits[32];
2788    
2789     #ifdef SUPPORT_UTF8
2790     BOOL class_utf8;
2791     BOOL utf8 = (options & PCRE_UTF8) != 0;
2792     uschar *class_utf8data;
2793 ph10 300 uschar *class_utf8data_base;
2794 nigel 77 uschar utf8_char[6];
2795     #else
2796     BOOL utf8 = FALSE;
2797 nigel 93 uschar *utf8_char = NULL;
2798 nigel 77 #endif
2799    
2800 ph10 475 #ifdef PCRE_DEBUG
2801 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2802     #endif
2803    
2804 nigel 77 /* Set up the default and non-default settings for greediness */
2805    
2806     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2807     greedy_non_default = greedy_default ^ 1;
2808    
2809     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2810     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2811     matches a non-fixed char first char; reqbyte just remains unset if we never
2812     find one.
2813    
2814     When we hit a repeat whose minimum is zero, we may have to adjust these values
2815     to take the zero repeat into account. This is implemented by setting them to
2816     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2817     item types that can be repeated set these backoff variables appropriately. */
2818    
2819     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2820    
2821     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2822     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2823     value > 255. It is added into the firstbyte or reqbyte variables to record the
2824     case status of the value. This is used only for ASCII characters. */
2825    
2826     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2827    
2828     /* Switch on next character until the end of the branch */
2829    
2830     for (;; ptr++)
2831     {
2832     BOOL negate_class;
2833 ph10 286 BOOL should_flip_negation;
2834 nigel 77 BOOL possessive_quantifier;
2835     BOOL is_quantifier;
2836 nigel 93 BOOL is_recurse;
2837 ph10 180 BOOL reset_bracount;
2838 nigel 77 int class_charcount;
2839     int class_lastchar;
2840     int newoptions;
2841     int recno;
2842 ph10 172 int refsign;
2843 nigel 77 int skipbytes;
2844     int subreqbyte;
2845     int subfirstbyte;
2846 nigel 93 int terminator;
2847 nigel 77 int mclength;
2848     uschar mcbuffer[8];
2849    
2850 nigel 93 /* Get next byte in the pattern */
2851 nigel 77
2852     c = *ptr;
2853 ph10 345
2854 ph10 518 /* If we are at the end of a nested substitution, revert to the outer level
2855     string. Nesting only happens one level deep. */
2856    
2857     if (c == 0 && nestptr != NULL)
2858     {
2859     ptr = nestptr;
2860     nestptr = NULL;
2861     c = *ptr;
2862     }
2863    
2864 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2865     previous cycle of this loop. */
2866    
2867     if (lengthptr != NULL)
2868     {
2869 ph10 475 #ifdef PCRE_DEBUG
2870 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2871     #endif
2872 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2873 nigel 93 {
2874     *errorcodeptr = ERR52;
2875     goto FAILED;
2876     }
2877    
2878     /* There is at least one situation where code goes backwards: this is the
2879     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2880     the class is simply eliminated. However, it is created first, so we have to
2881     allow memory for it. Therefore, don't ever reduce the length at this point.
2882     */
2883    
2884     if (code < last_code) code = last_code;
2885 ph10 202
2886     /* Paranoid check for integer overflow */
2887    
2888     if (OFLOW_MAX - *lengthptr < code - last_code)
2889     {
2890     *errorcodeptr = ERR20;
2891     goto FAILED;
2892     }
2893    
2894 nigel 93 *lengthptr += code - last_code;
2895     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2896    
2897     /* If "previous" is set and it is not at the start of the work space, move
2898     it back to there, in order to avoid filling up the work space. Otherwise,
2899     if "previous" is NULL, reset the current code pointer to the start. */
2900    
2901     if (previous != NULL)
2902     {
2903     if (previous > orig_code)
2904     {
2905     memmove(orig_code, previous, code - previous);
2906     code -= previous - orig_code;
2907     previous = orig_code;
2908     }
2909     }
2910     else code = orig_code;
2911    
2912     /* Remember where this code item starts so we can pick up the length
2913     next time round. */
2914    
2915     last_code = code;
2916     }
2917    
2918     /* In the real compile phase, just check the workspace used by the forward
2919     reference list. */
2920    
2921 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2922 nigel 93 {
2923     *errorcodeptr = ERR52;
2924     goto FAILED;
2925     }
2926    
2927 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2928    
2929     if (inescq && c != 0)
2930     {
2931 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2932 nigel 77 {
2933     inescq = FALSE;
2934     ptr++;
2935     continue;
2936     }
2937     else
2938     {
2939     if (previous_callout != NULL)
2940     {
2941 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2942     complete_callout(previous_callout, ptr, cd);
2943 nigel 77 previous_callout = NULL;
2944     }
2945     if ((options & PCRE_AUTO_CALLOUT) != 0)
2946     {
2947     previous_callout = code;
2948     code = auto_callout(code, ptr, cd);
2949     }
2950     goto NORMAL_CHAR;
2951     }
2952     }
2953    
2954     /* Fill in length of a previous callout, except when the next thing is
2955     a quantifier. */
2956    
2957 ph10 392 is_quantifier =
2958 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2959     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2960 nigel 77
2961     if (!is_quantifier && previous_callout != NULL &&
2962     after_manual_callout-- <= 0)
2963     {
2964 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2965     complete_callout(previous_callout, ptr, cd);
2966 nigel 77 previous_callout = NULL;
2967     }
2968    
2969     /* In extended mode, skip white space and comments */
2970    
2971     if ((options & PCRE_EXTENDED) != 0)
2972     {
2973     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2974 ph10 391 if (c == CHAR_NUMBER_SIGN)
2975 nigel 77 {
2976 nigel 93 while (*(++ptr) != 0)
2977 nigel 91 {
2978 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2979 nigel 91 }
2980 nigel 93 if (*ptr != 0) continue;
2981    
2982 nigel 91 /* Else fall through to handle end of string */
2983     c = 0;
2984 nigel 77 }
2985     }
2986    
2987     /* No auto callout for quantifiers. */
2988    
2989     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2990     {
2991     previous_callout = code;
2992     code = auto_callout(code, ptr, cd);
2993     }
2994    
2995     switch(c)
2996     {
2997 nigel 93 /* ===================================================================*/
2998     case 0: /* The branch terminates at string end */
2999 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3000     case CHAR_RIGHT_PARENTHESIS:
3001 nigel 77 *firstbyteptr = firstbyte;
3002     *reqbyteptr = reqbyte;
3003     *codeptr = code;
3004     *ptrptr = ptr;
3005 nigel 93 if (lengthptr != NULL)
3006     {
3007 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3008     {
3009     *errorcodeptr = ERR20;
3010     goto FAILED;
3011     }
3012 nigel 93 *lengthptr += code - last_code; /* To include callout length */
3013     DPRINTF((">> end branch\n"));
3014     }
3015 nigel 77 return TRUE;
3016    
3017 nigel 93
3018     /* ===================================================================*/
3019 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3020     the setting of any following char as a first character. */
3021    
3022 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3023 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3024     {
3025     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3026     }
3027     previous = NULL;
3028     *code++ = OP_CIRC;
3029     break;
3030    
3031 ph10 391 case CHAR_DOLLAR_SIGN:
3032 nigel 77 previous = NULL;
3033     *code++ = OP_DOLL;
3034     break;
3035    
3036     /* There can never be a first char if '.' is first, whatever happens about
3037     repeats. The value of reqbyte doesn't change either. */
3038    
3039 ph10 391 case CHAR_DOT:
3040 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3041     zerofirstbyte = firstbyte;
3042     zeroreqbyte = reqbyte;
3043     previous = code;
3044 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3045 nigel 77 break;
3046    
3047 nigel 93
3048     /* ===================================================================*/
3049 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3050     32-byte bitmap of the permitted characters, except in the special case
3051     where there is only one such character. For negated classes, we build the
3052     map as usual, then invert it at the end. However, we use a different opcode
3053     so that data characters > 255 can be handled correctly.
3054 nigel 77
3055     If the class contains characters outside the 0-255 range, a different
3056     opcode is compiled. It may optionally have a bit map for characters < 256,
3057     but those above are explicitly listed afterwards. A flag byte tells
3058     whether the bitmap is present, and whether this is a negated class or not.
3059 ph10 345
3060 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3061     default (Perl) mode, it is treated as a data character. */
3062 ph10 345
3063 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3064 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3065     {
3066     *errorcodeptr = ERR64;
3067 ph10 345 goto FAILED;
3068 ph10 336 }
3069 ph10 345 goto NORMAL_CHAR;
3070 nigel 77
3071 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3072 nigel 77 previous = code;
3073    
3074     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3075     they are encountered at the top level, so we'll do that too. */
3076    
3077 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3078 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3079 ph10 295 check_posix_syntax(ptr, &tempptr))
3080 nigel 77 {
3081 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3082 nigel 77 goto FAILED;
3083     }
3084    
3085 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3086 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3087 ph10 205 skip them too. This makes for compatibility with Perl. */
3088 ph10 208
3089 ph10 205 negate_class = FALSE;
3090     for (;;)
3091 nigel 77 {
3092     c = *(++ptr);
3093 ph10 391 if (c == CHAR_BACKSLASH)
3094 ph10 205 {
3095 ph10 392 if (ptr[1] == CHAR_E)
3096 ph10 391 ptr++;
3097 ph10 392 else if (strncmp((const char *)ptr+1,
3098     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3099 ph10 391 ptr += 3;
3100 ph10 392 else
3101 ph10 391 break;
3102 ph10 205 }
3103 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3104 ph10 205 negate_class = TRUE;
3105     else break;
3106 ph10 208 }
3107 ph10 345
3108     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3109     an initial ']' is taken as a data character -- the code below handles
3110 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3111     [^] must match any character, so generate OP_ALLANY. */
3112 ph10 345
3113 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3114 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3115 ph10 341 {
3116     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3117     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3118     zerofirstbyte = firstbyte;
3119     break;
3120 ph10 345 }
3121 nigel 77
3122 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3123     negation flag at the end, so that support for characters > 255 works
3124 ph10 264 correctly (they are all included in the class). */
3125    
3126     should_flip_negation = FALSE;
3127    
3128 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3129 nigel 93 of just a single character (as long as it's < 256). However, for higher
3130     valued UTF-8 characters, we don't yet do any optimization. */
3131 nigel 77
3132     class_charcount = 0;
3133     class_lastchar = -1;
3134    
3135 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3136     temporary bit of memory, in case the class contains only 1 character (less
3137     than 256), because in that case the compiled code doesn't use the bit map.
3138     */
3139    
3140     memset(classbits, 0, 32 * sizeof(uschar));
3141    
3142 nigel 77 #ifdef SUPPORT_UTF8
3143     class_utf8 = FALSE; /* No chars >= 256 */
3144 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3145 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3146 nigel 77 #endif
3147    
3148     /* Process characters until ] is reached. By writing this as a "do" it
3149 nigel 93 means that an initial ] is taken as a data character. At the start of the
3150     loop, c contains the first byte of the character. */
3151 nigel 77
3152 nigel 93 if (c != 0) do
3153 nigel 77 {
3154 nigel 93 const uschar *oldptr;
3155    
3156 nigel 77 #ifdef SUPPORT_UTF8
3157     if (utf8 && c > 127)
3158     { /* Braces are required because the */
3159     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3160     }
3161 ph10 518
3162 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3163 ph10 309 data and reset the pointer. This is so that very large classes that
3164 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3165 ph10 309 (which is on the stack). */
3166    
3167 ph10 300 if (lengthptr != NULL)
3168     {
3169     *lengthptr += class_utf8data - class_utf8data_base;
3170 ph10 309 class_utf8data = class_utf8data_base;
3171     }
3172    
3173 nigel 77 #endif
3174    
3175     /* Inside \Q...\E everything is literal except \E */
3176    
3177     if (inescq)
3178     {
3179 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3180 nigel 77 {
3181 nigel 93 inescq = FALSE; /* Reset literal state */
3182     ptr++; /* Skip the 'E' */
3183     continue; /* Carry on with next */
3184 nigel 77 }
3185 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3186 nigel 77 }
3187    
3188     /* Handle POSIX class names. Perl allows a negation extension of the
3189     form [:^name:]. A square bracket that doesn't match the syntax is
3190     treated as a literal. We also recognize the POSIX constructions
3191     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3192     5.6 and 5.8 do. */
3193    
3194 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3195 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3196 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3197 nigel 77 {
3198     BOOL local_negate = FALSE;
3199 nigel 87 int posix_class, taboffset, tabopt;
3200 nigel 77 register const uschar *cbits = cd->cbits;
3201 nigel 87 uschar pbits[32];
3202 nigel 77
3203 ph10 391 if (ptr[1] != CHAR_COLON)
3204 nigel 77 {
3205     *errorcodeptr = ERR31;
3206     goto FAILED;
3207     }
3208    
3209     ptr += 2;
3210 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3211 nigel 77 {
3212     local_negate = TRUE;
3213 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3214 nigel 77 ptr++;
3215     }
3216    
3217     posix_class = check_posix_name(ptr, tempptr - ptr);
3218     if (posix_class < 0)
3219     {
3220     *errorcodeptr = ERR30;
3221     goto FAILED;
3222     }
3223    
3224     /* If matching is caseless, upper and lower are converted to
3225     alpha. This relies on the fact that the class table starts with
3226     alpha, lower, upper as the first 3 entries. */
3227    
3228     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3229     posix_class = 0;
3230 ph10 518
3231     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3232     different escape sequences that use Unicode properties. */
3233    
3234     #ifdef SUPPORT_UCP
3235     if ((options & PCRE_UCP) != 0)
3236     {
3237     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3238     if (posix_substitutes[pc] != NULL)
3239     {
3240     nestptr = tempptr + 1;
3241     ptr = posix_substitutes[pc] - 1;
3242     continue;
3243     }
3244     }
3245     #endif
3246     /* In the non-UCP case, we build the bit map for the POSIX class in a
3247     chunk of local store because we may be adding and subtracting from it,
3248     and we don't want to subtract bits that may be in the main map already.
3249     At the end we or the result into the bit map that is being built. */
3250 nigel 77
3251     posix_class *= 3;
3252 nigel 87
3253     /* Copy in the first table (always present) */
3254    
3255     memcpy(pbits, cbits + posix_class_maps[posix_class],
3256     32 * sizeof(uschar));
3257    
3258     /* If there is a second table, add or remove it as required. */
3259    
3260     taboffset = posix_class_maps[posix_class + 1];
3261     tabopt = posix_class_maps[posix_class + 2];
3262    
3263     if (taboffset >= 0)
3264 nigel 77 {
3265 nigel 87 if (tabopt >= 0)
3266     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3267 nigel 77 else
3268 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3269 nigel 77 }
3270    
3271 nigel 87 /* Now see if we need to remove any special characters. An option
3272     value of 1 removes vertical space and 2 removes underscore. */
3273    
3274     if (tabopt < 0) tabopt = -tabopt;
3275     if (tabopt == 1) pbits[1] &= ~0x3c;
3276     else if (tabopt == 2) pbits[11] &= 0x7f;
3277    
3278     /* Add the POSIX table or its complement into the main table that is
3279     being built and we are done. */
3280    
3281     if (local_negate)
3282     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3283     else
3284     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3285    
3286 nigel 77 ptr = tempptr + 1;
3287     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3288     continue; /* End of POSIX syntax handling */
3289     }
3290    
3291     /* Backslash may introduce a single character, or it may introduce one
3292 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3293 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3294     assume that other escapes have more than one character in them, so set
3295     class_charcount bigger than one. Unrecognized escapes fall through and
3296     are either treated as literal characters (by default), or are faulted if
3297     PCRE_EXTRA is set. */
3298 nigel 77
3299 ph10 391 if (c == CHAR_BACKSLASH)
3300 nigel 77 {
3301 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3302     if (*errorcodeptr != 0) goto FAILED;
3303 nigel 77
3304 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3305 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3306     {
3307 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3308 nigel 77 {
3309     ptr += 2; /* avoid empty string */
3310     }
3311     else inescq = TRUE;
3312     continue;
3313     }
3314 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3315 nigel 77
3316     if (c < 0)
3317     {
3318     register const uschar *cbits = cd->cbits;
3319     class_charcount += 2; /* Greater than 1 is what matters */
3320 nigel 93
3321 ph10 518 switch (-c)
3322 nigel 77 {
3323 ph10 518 #ifdef SUPPORT_UCP
3324     case ESC_du: /* These are the values given for \d etc */
3325     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3326     case ESC_wu: /* escape sequence with an appropriate \p */
3327     case ESC_WU: /* or \P to test Unicode properties instead */
3328     case ESC_su: /* of the default ASCII testing. */
3329     case ESC_SU:
3330     nestptr = ptr;
3331     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3332     class_charcount -= 2; /* Undo! */
3333     continue;
3334     #endif
3335 nigel 77 case ESC_d:
3336     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3337     continue;
3338    
3339     case ESC_D:
3340 ph10 286 should_flip_negation = TRUE;
3341 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3342     continue;
3343    
3344     case ESC_w:
3345     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3346     continue;
3347    
3348     case ESC_W:
3349 ph10 286 should_flip_negation = TRUE;
3350 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3351     continue;
3352    
3353     case ESC_s:
3354     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3355     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3356     continue;
3357    
3358     case ESC_S:
3359 ph10 286 should_flip_negation = TRUE;
3360 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3361     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3362     continue;
3363    
3364 ph10 518 case ESC_h:
3365 ph10 178 SETBIT(classbits, 0x09); /* HT */
3366     SETBIT(classbits, 0x20); /* SPACE */
3367 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3368 ph10 178 #ifdef SUPPORT_UTF8
3369     if (utf8)
3370 ph10 180 {
3371 ph10 178 class_utf8 = TRUE;
3372     *class_utf8data++ = XCL_SINGLE;
3373 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3374 ph10 178 *class_utf8data++ = XCL_SINGLE;
3375 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3376     *class_utf8data++ = XCL_RANGE;
3377     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3378     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3379 ph10 178 *class_utf8data++ = XCL_SINGLE;
3380 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3381 ph10 178 *class_utf8data++ = XCL_SINGLE;
3382 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3383 ph10 178 *class_utf8data++ = XCL_SINGLE;
3384 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3385     }
3386     #endif
3387     continue;
3388 nigel 93
3389 ph10 518 case ESC_H:
3390 ph10 178 for (c = 0; c < 32; c++)
3391     {
3392     int x = 0xff;
3393     switch (c)
3394 ph10 180 {
3395 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3396     case 0x20/8: x ^= 1 << (0x20%8); break;
3397     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3398     default: break;
3399     }
3400     classbits[c] |= x;
3401 ph10 180 }
3402    
3403 ph10 178 #ifdef SUPPORT_UTF8
3404     if (utf8)
3405 ph10 180 {
3406 ph10 178 class_utf8 = TRUE;
3407 ph10 180 *class_utf8data++ = XCL_RANGE;
3408     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3409     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3410     *class_utf8data++ = XCL_RANGE;
3411     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3412     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3413     *class_utf8data++ = XCL_RANGE;
3414     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3415     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3416     *class_utf8data++ = XCL_RANGE;
3417     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3418     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3419     *class_utf8data++ = XCL_RANGE;
3420     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3421     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3422     *class_utf8data++ = XCL_RANGE;
3423     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3424     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3425     *class_utf8data++ = XCL_RANGE;
3426     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3427     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3428     }
3429     #endif
3430     continue;
3431 ph10 178
3432 ph10 518 case ESC_v:
3433 ph10 178 SETBIT(classbits, 0x0a); /* LF */
3434     SETBIT(classbits, 0x0b); /* VT */
3435 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3436     SETBIT(classbits, 0x0d); /* CR */
3437     SETBIT(classbits, 0x85); /* NEL */
3438 ph10 178 #ifdef SUPPORT_UTF8
3439     if (utf8)
3440 ph10 180 {
3441 ph10 178 class_utf8 = TRUE;
3442 ph10 180 *class_utf8data++ = XCL_RANGE;
3443     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3444     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3445     }
3446     #endif
3447     continue;
3448 ph10 178
3449 ph10 518 case ESC_V:
3450 ph10 178 for (c = 0; c < 32; c++)
3451     {
3452     int x = 0xff;
3453     switch (c)
3454 ph10 180 {
3455 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3456     x ^= 1 << (0x0b%8);
3457     x ^= 1 << (0x0c%8);
3458 ph10 180 x ^= 1 << (0x0d%8);
3459 ph10 178 break;
3460     case 0x85/8: x ^= 1 << (0x85%8); break;
3461     default: break;
3462     }
3463     classbits[c] |= x;
3464 ph10 180 }
3465    
3466 ph10 178 #ifdef SUPPORT_UTF8
3467     if (utf8)
3468 ph10 180 {
3469 ph10 178 class_utf8 = TRUE;
3470 ph10 180 *class_utf8data++ = XCL_RANGE;
3471     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3472     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3473     *class_utf8data++ = XCL_RANGE;
3474     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3475     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3476     }
3477     #endif
3478     continue;
3479 ph10 178
3480 nigel 77 #ifdef SUPPORT_UCP
3481 ph10 518 case ESC_p:
3482     case ESC_P:
3483     {
3484     BOOL negated;
3485     int pdata;
3486     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3487     if (ptype < 0) goto FAILED;
3488     class_utf8 = TRUE;
3489     *class_utf8data++ = ((-c == ESC_p) != negated)?
3490     XCL_PROP : XCL_NOTPROP;
3491     *class_utf8data++ = ptype;
3492     *class_utf8data++ = pdata;
3493     class_charcount -= 2; /* Not a < 256 character */
3494     continue;
3495     }
3496 nigel 77 #endif
3497 ph10 518 /* Unrecognized escapes are faulted if PCRE is running in its
3498     strict mode. By default, for compatibility with Perl, they are
3499     treated as literals. */
3500 nigel 77
3501 ph10 518 default:
3502     if ((options & PCRE_EXTRA) != 0)
3503     {
3504     *errorcodeptr = ERR7;
3505     goto FAILED;
3506     }
3507     class_charcount -= 2; /* Undo the default count from above */
3508     c = *ptr; /* Get the final character and fall through */
3509     break;
3510 nigel 93 }
3511 nigel 77 }
3512    
3513     /* Fall through if we have a single character (c >= 0). This may be
3514 nigel 93 greater than 256 in UTF-8 mode. */
3515 nigel 77
3516     } /* End of backslash handling */
3517    
3518     /* A single character may be followed by '-' to form a range. However,
3519     Perl does not permit ']' to be the end of the range. A '-' character
3520 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3521     entirely. The code for handling \Q and \E is messy. */
3522 nigel 77
3523 nigel 93 CHECK_RANGE:
3524 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3525 nigel 77 {
3526 nigel 93 inescq = FALSE;
3527     ptr += 2;
3528     }
3529    
3530     oldptr = ptr;
3531 ph10 231
3532 ph10 230 /* Remember \r or \n */
3533 ph10 231
3534 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3535 ph10 231
3536 ph10 230 /* Check for range */
3537 nigel 93
3538 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3539 nigel 93 {
3540 nigel 77 int d;
3541     ptr += 2;
3542 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3543 nigel 77
3544 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3545     mode. */
3546    
3547 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3548 nigel 93 {
3549     ptr += 2;
3550 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3551 ph10 391 { ptr += 2; continue; }
3552 nigel 93 inescq = TRUE;
3553     break;
3554     }
3555    
3556 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3557 nigel 93 {
3558     ptr = oldptr;
3559     goto LONE_SINGLE_CHARACTER;
3560     }
3561    
3562 nigel 77 #ifdef SUPPORT_UTF8
3563     if (utf8)
3564     { /* Braces are required because the */
3565     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3566     }
3567     else
3568     #endif
3569     d = *ptr; /* Not UTF-8 mode */
3570    
3571     /* The second part of a range can be a single-character escape, but
3572     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3573     in such circumstances. */
3574    
3575 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3576 nigel 77 {
3577 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3578     if (*errorcodeptr != 0) goto FAILED;
3579 nigel 77
3580 ph10 514 /* \b is backspace; any other special means the '-' was literal */
3581 nigel 77
3582     if (d < 0)
3583     {
3584 ph10 514 if (d == -ESC_b) d = CHAR_BS; else
3585 nigel 77 {
3586 nigel 93 ptr = oldptr;
3587 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3588     }
3589     }
3590     }
3591    
3592 nigel 93 /* Check that the two values are in the correct order. Optimize
3593     one-character ranges */
3594 nigel 77
3595 nigel 93 if (d < c)
3596     {
3597     *errorcodeptr = ERR8;
3598     goto FAILED;
3599     }
3600    
3601 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3602    
3603 ph10 230 /* Remember \r or \n */
3604 ph10 231
3605 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3606 ph10 231
3607 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3608     matching, we have to use an XCLASS with extra data items. Caseless
3609     matching for characters > 127 is available only if UCP support is
3610     available. */
3611    
3612     #ifdef SUPPORT_UTF8
3613     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3614     {
3615     class_utf8 = TRUE;
3616    
3617     /* With UCP support, we can find the other case equivalents of
3618     the relevant characters. There may be several ranges. Optimize how
3619     they fit with the basic range. */
3620    
3621     #ifdef SUPPORT_UCP
3622     if ((options & PCRE_CASELESS) != 0)
3623     {
3624 nigel 93 unsigned int occ, ocd;
3625     unsigned int cc = c;
3626     unsigned int origd = d;
3627 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3628     {
3629 ph10 180 if (occ >= (unsigned int)c &&
3630     ocd <= (unsigned int)d)
3631 ph10 176 continue; /* Skip embedded ranges */
3632 nigel 77
3633 ph10 180 if (occ < (unsigned int)c &&
3634 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3635 nigel 77 { /* if there is overlap, */
3636     c = occ; /* noting that if occ < c */
3637     continue; /* we can't have ocd > d */
3638     } /* because a subrange is */
3639 ph10 180 if (ocd > (unsigned int)d &&
3640 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3641 nigel 77 { /* the basic range. */
3642     d = ocd;
3643     continue;
3644     }
3645    
3646     if (occ == ocd)
3647     {
3648     *class_utf8data++ = XCL_SINGLE;
3649     }
3650     else
3651     {
3652     *class_utf8data++ = XCL_RANGE;
3653     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3654     }
3655     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3656     }
3657     }
3658     #endif /* SUPPORT_UCP */
3659    
3660     /* Now record the original range, possibly modified for UCP caseless
3661     overlapping ranges. */
3662    
3663     *class_utf8data++ = XCL_RANGE;
3664     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3665     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3666    
3667     /* With UCP support, we are done. Without UCP support, there is no
3668     caseless matching for UTF-8 characters > 127; we can use the bit map
3669     for the smaller ones. */
3670    
3671     #ifdef SUPPORT_UCP
3672     continue; /* With next character in the class */
3673     #else
3674     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3675    
3676     /* Adjust upper limit and fall through to set up the map */
3677    
3678     d = 127;
3679    
3680     #endif /* SUPPORT_UCP */
3681     }
3682     #endif /* SUPPORT_UTF8 */
3683    
3684     /* We use the bit map for all cases when not in UTF-8 mode; else
3685     ranges that lie entirely within 0-127 when there is UCP support; else
3686     for partial ranges without UCP support. */
3687    
3688 nigel 93 class_charcount += d - c + 1;
3689     class_lastchar = d;
3690    
3691     /* We can save a bit of time by skipping this in the pre-compile. */
3692    
3693     if (lengthptr == NULL) for (; c <= d; c++)
3694 nigel 77 {
3695     classbits[c/8] |= (1 << (c&7));
3696     if ((options & PCRE_CASELESS) != 0)
3697     {
3698     int uc = cd->fcc[c]; /* flip case */
3699     classbits[uc/8] |= (1 << (uc&7));
3700     }
3701     }
3702    
3703     continue; /* Go get the next char in the class */
3704     }
3705    
3706     /* Handle a lone single character - we can get here for a normal
3707     non-escape char, or after \ that introduces a single character or for an
3708     apparent range that isn't. */
3709    
3710     LONE_SINGLE_CHARACTER:
3711 ph10 231
3712 nigel 77 /* Handle a character that cannot go in the bit map */
3713    
3714     #ifdef SUPPORT_UTF8
3715     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3716     {
3717     class_utf8 = TRUE;
3718     *class_utf8data++ = XCL_SINGLE;
3719     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3720    
3721     #ifdef SUPPORT_UCP
3722     if ((options & PCRE_CASELESS) != 0)
3723     {
3724 nigel 93 unsigned int othercase;
3725 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3726 nigel 77 {
3727     *class_utf8data++ = XCL_SINGLE;
3728     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3729     }
3730     }
3731     #endif /* SUPPORT_UCP */
3732    
3733     }
3734     else
3735     #endif /* SUPPORT_UTF8 */
3736    
3737     /* Handle a single-byte character */
3738     {
3739     classbits[c/8] |= (1 << (c&7));
3740     if ((options & PCRE_CASELESS) != 0)
3741     {
3742     c = cd->fcc[c]; /* flip case */
3743     classbits[c/8] |= (1 << (c&7));
3744     }
3745     class_charcount++;
3746     class_lastchar = c;
3747     }
3748     }
3749    
3750 ph10 518 /* Loop until ']' reached. This "while" is the end of the "do" far above.
3751     If we are at the end of an internal nested string, revert to the outer
3752     string. */
3753 nigel 77
3754 ph10 518 while (((c = *(++ptr)) != 0 ||
3755     (nestptr != NULL &&
3756     (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3757     (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3758 nigel 77
3759 ph10 518 /* Check for missing terminating ']' */
3760    
3761     if (c == 0)
3762 nigel 93 {
3763     *errorcodeptr = ERR6;
3764     goto FAILED;
3765     }
3766 ph10 231
3767 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3768 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3769     use of \p or \P, in other words, no use of any XCLASS features, we can
3770     optimize.
3771    
3772 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3773     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3774     operate on single-bytes only. This is an historical hangover. Maybe one day
3775     we can tidy these opcodes to handle multi-byte characters.
3776 nigel 77
3777     The optimization throws away the bit map. We turn the item into a
3778     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3779     that OP_NOT does not support multibyte characters. In the positive case, it
3780     can cause firstbyte to be set. Otherwise, there can be no first char if
3781     this item is first, whatever repeat count may follow. In the case of
3782     reqbyte, save the previous value for reinstating. */
3783 ph10 518
3784 nigel 77 #ifdef SUPPORT_UTF8
3785 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3786 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3787 nigel 77 #else
3788     if (class_charcount == 1)
3789     #endif
3790     {
3791     zeroreqbyte = reqbyte;
3792    
3793     /* The OP_NOT opcode works on one-byte characters only. */
3794    
3795     if (negate_class)
3796     {
3797     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3798     zerofirstbyte = firstbyte;
3799     *code++ = OP_NOT;
3800     *code++ = class_lastchar;
3801     break;
3802     }
3803    
3804     /* For a single, positive character, get the value into mcbuffer, and
3805     then we can handle this with the normal one-character code. */
3806    
3807     #ifdef SUPPORT_UTF8
3808     if (utf8 && class_lastchar > 127)
3809     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3810     else
3811     #endif
3812     {
3813     mcbuffer[0] = class_lastchar;
3814     mclength = 1;
3815     }
3816     goto ONE_CHAR;
3817     } /* End of 1-char optimization */
3818    
3819     /* The general case - not the one-char optimization. If this is the first
3820     thing in the branch, there can be no first char setting, whatever the
3821     repeat count. Any reqbyte setting must remain unchanged after any kind of
3822     repeat. */
3823    
3824     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3825     zerofirstbyte = firstbyte;
3826     zeroreqbyte = reqbyte;
3827    
3828     /* If there are characters with values > 255, we have to compile an
3829 ph10 286 extended class, with its own opcode, unless there was a negated special
3830 ph10 518 such as \S in the class, and PCRE_UCP is not set, because in that case all
3831     characters > 255 are in the class, so any that were explicitly given as
3832     well can be ignored. If (when there are explicit characters > 255 that must
3833     be listed) there are no characters < 256, we can omit the bitmap in the
3834     actual compiled code. */
3835 nigel 77
3836     #ifdef SUPPORT_UTF8
3837 ph10 518 if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
3838 nigel 77 {
3839     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3840     *code++ = OP_XCLASS;
3841     code += LINK_SIZE;
3842     *code = negate_class? XCL_NOT : 0;
3843    
3844 nigel 93 /* If the map is required, move up the extra data to make room for it;
3845     otherwise just move the code pointer to the end of the extra data. */
3846 nigel 77
3847     if (class_charcount > 0)
3848     {
3849     *code++ |= XCL_MAP;
3850 nigel 93 memmove(code + 32, code, class_utf8data - code);
3851 nigel 77 memcpy(code, classbits, 32);
3852 nigel 93 code = class_utf8data + 32;
3853 nigel 77 }
3854 nigel 93 else code = class_utf8data;
3855 nigel 77
3856     /* Now fill in the complete length of the item */
3857    
3858     PUT(previous, 1, code - previous);
3859     break; /* End of class handling */
3860     }
3861     #endif
3862    
3863 ph10 518 /* If there are no characters > 255, or they are all to be included or
3864     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
3865     whole class was negated and whether there were negative specials such as \S
3866     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
3867     negating it if necessary. */
3868 ph10 286
3869 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3870 nigel 77 if (negate_class)
3871     {
3872 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3873     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3874 nigel 77 }
3875     else
3876     {
3877     memcpy(code, classbits, 32);
3878     }
3879     code += 32;
3880     break;
3881    
3882 nigel 93
3883     /* ===================================================================*/
3884 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3885     has been tested above. */
3886    
3887 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3888 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3889     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3890     if (*errorcodeptr != 0) goto FAILED;
3891     goto REPEAT;
3892    
3893 ph10 391 case CHAR_ASTERISK:
3894 nigel 77 repeat_min = 0;
3895     repeat_max = -1;
3896     goto REPEAT;
3897    
3898 ph10 391 case CHAR_PLUS:
3899 nigel 77 repeat_min = 1;
3900     repeat_max = -1;
3901     goto REPEAT;
3902    
3903 ph10 391 case CHAR_QUESTION_MARK:
3904 nigel 77 repeat_min = 0;
3905     repeat_max = 1;
3906    
3907     REPEAT:
3908     if (previous == NULL)
3909     {
3910     *errorcodeptr = ERR9;
3911     goto FAILED;
3912     }
3913    
3914     if (repeat_min == 0)
3915     {
3916     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3917     reqbyte = zeroreqbyte; /* Ditto */
3918     }
3919    
3920     /* Remember whether this is a variable length repeat */
3921    
3922     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3923    
3924     op_type = 0; /* Default single-char op codes */
3925     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3926    
3927     /* Save start of previous item, in case we have to move it up to make space
3928     for an inserted OP_ONCE for the additional '+' extension. */
3929    
3930     tempcode = previous;
3931    
3932     /* If the next character is '+', we have a possessive quantifier. This
3933     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3934     If the next character is '?' this is a minimizing repeat, by default,
3935     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3936     repeat type to the non-default. */
3937    
3938 ph10 391 if (ptr[1] == CHAR_PLUS)