/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 758 - (hide annotations) (download)
Mon Nov 21 12:05:36 2011 UTC (17 months, 4 weeks ago) by ph10
File MIME type: text/plain
File size: 251043 byte(s)
Disallow \N in character classes, for Perl compatibility.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8 within that byte). The arguments and
the whole expansion are parenthesized so that an expression argument such as
"c + 1" selects the intended byte and bit instead of being re-associated by
operator precedence. Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234     static const uschar posix_name_lengths[] = {
235     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269     static const uschar *substitutes[] = {
270     (uschar *)"\\P{Nd}", /* \D */
271     (uschar *)"\\p{Nd}", /* \d */
272     (uschar *)"\\P{Xsp}", /* \S */ /* NOTE: Xsp is Perl space */
273     (uschar *)"\\p{Xsp}", /* \s */
274     (uschar *)"\\P{Xwd}", /* \W */
275 ph10 535 (uschar *)"\\p{Xwd}" /* \w */
276 ph10 518 };
277 ph10 535
278 ph10 518 static const uschar *posix_substitutes[] = {
279     (uschar *)"\\p{L}", /* alpha */
280 ph10 535 (uschar *)"\\p{Ll}", /* lower */
281     (uschar *)"\\p{Lu}", /* upper */
282     (uschar *)"\\p{Xan}", /* alnum */
283 ph10 518 NULL, /* ascii */
284     (uschar *)"\\h", /* blank */
285     NULL, /* cntrl */
286     (uschar *)"\\p{Nd}", /* digit */
287     NULL, /* graph */
288     NULL, /* print */
289     NULL, /* punct */
290     (uschar *)"\\p{Xps}", /* space */ /* NOTE: Xps is POSIX space */
291     (uschar *)"\\p{Xwd}", /* word */
292 ph10 535 NULL, /* xdigit */
293 ph10 518 /* Negated cases */
294     (uschar *)"\\P{L}", /* ^alpha */
295 ph10 535 (uschar *)"\\P{Ll}", /* ^lower */
296     (uschar *)"\\P{Lu}", /* ^upper */
297     (uschar *)"\\P{Xan}", /* ^alnum */
298 ph10 518 NULL, /* ^ascii */
299     (uschar *)"\\H", /* ^blank */
300     NULL, /* ^cntrl */
301     (uschar *)"\\P{Nd}", /* ^digit */
302     NULL, /* ^graph */
303     NULL, /* ^print */
304     NULL, /* ^punct */
305     (uschar *)"\\P{Xps}", /* ^space */ /* NOTE: Xps is POSIX space */
306     (uschar *)"\\P{Xwd}", /* ^word */
307 ph10 535 NULL /* ^xdigit */
308 ph10 518 };
309     #define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310 ph10 535 #endif
311 ph10 518
312 nigel 93 #define STRING(a) # a
313     #define XSTRING(s) STRING(s)
314    
315 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
316 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
317     they are documented. Always add a new error instead. Messages marked DEAD below
318 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
319     the number of relocations needed when a shared library is loaded dynamically,
320     it is now one long string. We cannot use a table of offsets, because the
321     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322     simply count through to the one we want - this isn't a performance issue
323 ph10 507 because these strings are used only when there is a compilation error.
324 nigel 77
325 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
326     substring, so that the whole string ends with \0\0, which can be detected when
327 ph10 499 counting through. */
328    
329 ph10 240 static const char error_texts[] =
330     "no error\0"
331     "\\ at end of pattern\0"
332     "\\c at end of pattern\0"
333     "unrecognized character follows \\\0"
334     "numbers out of order in {} quantifier\0"
335 nigel 77 /* 5 */
336 ph10 240 "number too big in {} quantifier\0"
337     "missing terminating ] for character class\0"
338     "invalid escape sequence in character class\0"
339     "range out of order in character class\0"
340     "nothing to repeat\0"
341 nigel 77 /* 10 */
342 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
343     "internal error: unexpected repeat\0"
344 ph10 269 "unrecognized character after (? or (?-\0"
345 ph10 240 "POSIX named classes are supported only within a class\0"
346     "missing )\0"
347 nigel 77 /* 15 */
348 ph10 240 "reference to non-existent subpattern\0"
349     "erroffset passed as NULL\0"
350     "unknown option bit(s) set\0"
351     "missing ) after comment\0"
352     "parentheses nested too deeply\0" /** DEAD **/
353 nigel 77 /* 20 */
354 ph10 240 "regular expression is too large\0"
355     "failed to get memory\0"
356     "unmatched parentheses\0"
357     "internal error: code overflow\0"
358     "unrecognized character after (?<\0"
359 nigel 77 /* 25 */
360 ph10 240 "lookbehind assertion is not fixed length\0"
361     "malformed number or name after (?(\0"
362     "conditional group contains more than two branches\0"
363     "assertion expected after (?(\0"
364     "(?R or (?[+-]digits must be followed by )\0"
365 nigel 77 /* 30 */
366 ph10 240 "unknown POSIX class name\0"
367     "POSIX collating elements are not supported\0"
368     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369     "spare error\0" /** DEAD **/
370     "character value in \\x{...} sequence is too large\0"
371 nigel 77 /* 35 */
372 ph10 240 "invalid condition (?(0)\0"
373     "\\C not allowed in lookbehind assertion\0"
374 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375 ph10 240 "number after (?C is > 255\0"
376     "closing ) for (?C expected\0"
377 nigel 77 /* 40 */
378 ph10 240 "recursive call could loop indefinitely\0"
379     "unrecognized character after (?P\0"
380     "syntax error in subpattern name (missing terminator)\0"
381     "two named subpatterns have the same name\0"
382     "invalid UTF-8 string\0"
383 nigel 77 /* 45 */
384 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
385     "malformed \\P or \\p sequence\0"
386     "unknown property name after \\P or \\p\0"
387     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389 nigel 91 /* 50 */
390 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
391     "octal value is greater than \\377 (not in UTF-8 mode)\0"
392     "internal error: overran compiling workspace\0"
393     "internal error: previously-checked referenced subpattern not found\0"
394     "DEFINE group contains more than one branch\0"
395 nigel 93 /* 55 */
396 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
397 ph10 240 "inconsistent NEWLINE options\0"
398 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399     "a numbered reference must not be zero\0"
400 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401 ph10 211 /* 60 */
402 ph10 240 "(*VERB) not recognized\0"
403 ph10 268 "number is too big\0"
404 ph10 272 "subpattern name expected\0"
405 ph10 336 "digit expected after (?+\0"
406 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
407     /* 65 */
408 ph10 510 "different names for subpatterns of the same number are not allowed\0"
409 ph10 512 "(*MARK) must have an argument\0"
410 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
411 ph10 579 "\\c must be followed by an ASCII character\0"
412 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
413 ph10 747 /* 70 */
414     "internal error: unknown opcode in find_fixedlength()\0"
415 ph10 758 "\\N is not supported in a class\0"
416 ph10 510 ;
417 nigel 77
418     /* Table to identify digits and hex digits. This is used when compiling
419     patterns. Note that the tables in chartables are dependent on the locale, and
420     may mark arbitrary characters as digits - but the PCRE compiling code expects
421     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
422     a private table here. It costs 256 bytes, but it is a lot faster than doing
423     character value tests (at least in some simple cases I timed), and in some
424     applications one wants PCRE to compile efficiently as well as match
425     efficiently.
426    
427     For convenience, we use the same bit definitions as in chartables:
428    
429     0x04 decimal digit
430     0x08 hexadecimal digit
431    
432     Then we can use ctype_digit and ctype_xdigit in the code. */
433    
434 ph10 392 #ifndef EBCDIC
435 ph10 391
436 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
437 ph10 391 UTF-8 mode. */
438    
439 nigel 77 static const unsigned char digitab[] =
440     {
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
444     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
447     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
448     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
449     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
450     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
451     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
452     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
453     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
454     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
455     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
456     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
458     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
466     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
467     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
470     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
471     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
472     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
473    
474 ph10 392 #else
475 ph10 391
476     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
477    
478 nigel 77 static const unsigned char digitab[] =
479     {
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
491 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
492 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
493     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
494     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
495     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
496     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
497     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
498     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
504     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
507     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
508     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
509     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
510     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
511     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
512    
513     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
514     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
515     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
516     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
518     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
519     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
521     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
522     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
523     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
525 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
526 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
527     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
528     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
529     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
530     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
531     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
532     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
533     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
534     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
535     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
536     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
537     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
538     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
539     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
540     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
541     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
542     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
543     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
544     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
545     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
546     #endif
547    
548    
549     /* Definition to allow mutual recursion */
550    
551     static BOOL
552 ph10 642 compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int,
553     int *, int *, branch_chain *, compile_data *, int *);
554 nigel 77
555    
556    
557     /*************************************************
558 ph10 240 * Find an error text *
559     *************************************************/
560    
561 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
562     some of the text is of unknown length, we can't use a table of offsets.
563     Instead, just count through the strings. This is not a performance issue
564 ph10 240 because it happens only when there has been a compilation error.
565    
566     Argument: the error number
567     Returns: pointer to the error string
568     */
569    
570     static const char *
571     find_error_text(int n)
572     {
573     const char *s = error_texts;
574 ph10 507 for (; n > 0; n--)
575 ph10 499 {
576     while (*s++ != 0) {};
577     if (*s == 0) return "Error text not found (please report)";
578 ph10 507 }
579 ph10 240 return s;
580     }
581    
582    
583     /*************************************************
584 ph10 640 * Check for counted repeat *
585     *************************************************/
586    
587     /* This function is called when a '{' is encountered in a place where it might
588     start a quantifier. It looks ahead to see if it really is a quantifier or not.
589     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
590     where the ddds are digits.
591    
592     Arguments:
593     p pointer to the first char after '{'
594    
595     Returns: TRUE or FALSE
596     */
597    
598     static BOOL
599     is_counted_repeat(const uschar *p)
600     {
601     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
602     while ((digitab[*p] & ctype_digit) != 0) p++;
603     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
604    
605     if (*p++ != CHAR_COMMA) return FALSE;
606     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
607    
608     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
609     while ((digitab[*p] & ctype_digit) != 0) p++;
610    
611     return (*p == CHAR_RIGHT_CURLY_BRACKET);
612     }
613    
614    
615    
616     /*************************************************
617 nigel 77 * Handle escapes *
618     *************************************************/
619    
620     /* This function is called when a \ has been encountered. It either returns a
621     positive value for a simple escape such as \n, or a negative value which
622 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
623     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
624     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
625     ptr is pointing at the \. On exit, it is on the final character of the escape
626     sequence.
627 nigel 77
628     Arguments:
629     ptrptr points to the pattern position pointer
630     errorcodeptr points to the errorcode variable
631     bracount number of previous extracting brackets
632     options the options bits
633     isclass TRUE if inside a character class
634    
635     Returns: zero or positive => a data character
636     negative => a special escape sequence
637 ph10 213 on error, errorcodeptr is set
638 nigel 77 */
639    
640     static int
641     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
642     int options, BOOL isclass)
643     {
644 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
645     const uschar *ptr = *ptrptr + 1;
646 nigel 77 int c, i;
647    
648 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
649     ptr--; /* Set pointer back to the last byte */
650    
651 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
652    
653     if (c == 0) *errorcodeptr = ERR1;
654    
655 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
656     in a table. A non-zero result is something that can be returned immediately.
657 nigel 77 Otherwise further processing may be required. */
658    
659 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
660     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
661     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
662 nigel 77
663 ph10 97 #else /* EBCDIC coding */
664 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
665 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
666     #endif
667    
668     /* Escapes that need further processing, or are illegal. */
669    
670     else
671     {
672     const uschar *oldptr;
673 nigel 93 BOOL braced, negated;
674    
675 nigel 77 switch (c)
676     {
677     /* A number of Perl escapes are not handled by PCRE. We give an explicit
678     error. */
679    
680 ph10 391 case CHAR_l:
681     case CHAR_L:
682 zherczeg 744 *errorcodeptr = ERR37;
683     break;
684    
685 ph10 391 case CHAR_u:
686 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
687     {
688     /* In JavaScript, \u must be followed by four hexadecimal numbers.
689     Otherwise it is a lowercase u letter. */
690     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
691     && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
692     {
693     c = 0;
694     for (i = 0; i < 4; ++i)
695     {
696     register int cc = *(++ptr);
697     #ifndef EBCDIC /* ASCII/UTF-8 coding */
698     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
699     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
700     #else /* EBCDIC coding */
701     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
702     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
703     #endif
704     }
705     }
706     }
707     else
708     *errorcodeptr = ERR37;
709     break;
710    
711 ph10 391 case CHAR_U:
712 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
713     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
714 nigel 77 break;
715    
716 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
717 ph10 640 class, \g must be followed by one of a number of specific things:
718 ph10 345
719 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
720     backreference. If negative, it is a relative backreference. This is a Perl
721     5.10 feature.
722 ph10 345
723 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
724     is part of Perl's movement towards a unified syntax for back references. As
725     this is synonymous with \k{name}, we fudge it up by pretending it really
726     was \k.
727 ph10 345
728     (3) For Oniguruma compatibility we also support \g followed by a name or a
729     number either in angle brackets or in single quotes. However, these are
730     (possibly recursive) subroutine calls, _not_ backreferences. Just return
731 ph10 333 the -ESC_g code (cf \k). */
732 nigel 93
733 ph10 391 case CHAR_g:
734 ph10 640 if (isclass) break;
735 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
736 ph10 333 {
737     c = -ESC_g;
738 ph10 345 break;
739     }
740 ph10 333
741     /* Handle the Perl-compatible cases */
742 ph10 345
743 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
744 nigel 93 {
745 ph10 171 const uschar *p;
746 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
747     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
748     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
749 ph10 171 {
750     c = -ESC_k;
751     break;
752 ph10 172 }
753 nigel 93 braced = TRUE;
754     ptr++;
755     }
756     else braced = FALSE;
757    
758 ph10 391 if (ptr[1] == CHAR_MINUS)
759 nigel 93 {
760     negated = TRUE;
761     ptr++;
762     }
763     else negated = FALSE;
764    
765     c = 0;
766     while ((digitab[ptr[1]] & ctype_digit) != 0)
767 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
768 ph10 220
769 ph10 333 if (c < 0) /* Integer overflow */
770 ph10 213 {
771     *errorcodeptr = ERR61;
772     break;
773 ph10 220 }
774 ph10 345
775 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
776 nigel 93 {
777     *errorcodeptr = ERR57;
778 ph10 213 break;
779 nigel 93 }
780 ph10 345
781 ph10 333 if (c == 0)
782     {
783     *errorcodeptr = ERR58;
784     break;
785 ph10 345 }
786 nigel 93
787     if (negated)
788     {
789     if (c > bracount)
790     {
791     *errorcodeptr = ERR15;
792 ph10 213 break;
793 nigel 93 }
794     c = bracount - (c - 1);
795     }
796    
797     c = -(ESC_REF + c);
798     break;
799    
800 nigel 77 /* The handling of escape sequences consisting of a string of digits
801     starting with one that is not zero is not straightforward. By experiment,
802     the way Perl works seems to be as follows:
803    
804     Outside a character class, the digits are read as a decimal number. If the
805     number is less than 10, or if there are that many previous extracting
806     left brackets, then it is a back reference. Otherwise, up to three octal
807     digits are read to form an escaped byte. Thus \123 is likely to be octal
808     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
809     value is greater than 377, the least significant 8 bits are taken. Inside a
810     character class, \ followed by a digit is always an octal number. */
811    
812 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
813     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
814 nigel 77
815     if (!isclass)
816     {
817     oldptr = ptr;
818 ph10 391 c -= CHAR_0;
819 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
820 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
821 ph10 333 if (c < 0) /* Integer overflow */
822 ph10 213 {
823     *errorcodeptr = ERR61;
824 ph10 220 break;
825     }
826 nigel 77 if (c < 10 || c <= bracount)
827     {
828     c = -(ESC_REF + c);
829     break;
830     }
831     ptr = oldptr; /* Put the pointer back and fall through */
832     }
833    
834     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
835     generates a binary zero byte and treats the digit as a following literal.
836     Thus we have to pull back the pointer by one. */
837    
838 ph10 391 if ((c = *ptr) >= CHAR_8)
839 nigel 77 {
840     ptr--;
841     c = 0;
842     break;
843     }
844    
845     /* \0 always starts an octal number, but we may drop through to here with a
846 nigel 91 larger first octal digit. The original code used just to take the least
847     significant 8 bits of octal numbers (I think this is what early Perls used
848     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
849     than 3 octal digits. */
850 nigel 77
851 ph10 391 case CHAR_0:
852     c -= CHAR_0;
853     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
854     c = c * 8 + *(++ptr) - CHAR_0;
855 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
856 nigel 77 break;
857    
858 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
859     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
860     treated as a data character. */
861 nigel 77
862 ph10 391 case CHAR_x:
863 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
864     {
865     /* In JavaScript, \x must be followed by two hexadecimal numbers.
866     Otherwise it is a lowercase x letter. */
867     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
868     {
869     c = 0;
870     for (i = 0; i < 2; ++i)
871     {
872     register int cc = *(++ptr);
873     #ifndef EBCDIC /* ASCII/UTF-8 coding */
874     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
875     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
876     #else /* EBCDIC coding */
877     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
878     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
879     #endif
880     }
881     }
882     break;
883     }
884    
885 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
886 nigel 77 {
887     const uschar *pt = ptr + 2;
888 nigel 87 int count = 0;
889    
890 nigel 77 c = 0;
891     while ((digitab[*pt] & ctype_xdigit) != 0)
892     {
893 nigel 87 register int cc = *pt++;
894 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
895 nigel 77 count++;
896 nigel 87
897 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
898     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
899     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
900 ph10 97 #else /* EBCDIC coding */
901 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
902     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
903 nigel 77 #endif
904     }
905 nigel 87
906 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
907 nigel 77 {
908 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
909 nigel 77 ptr = pt;
910     break;
911     }
912 nigel 87
913 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
914     recognize this construct; fall through to the normal \x handling. */
915     }
916    
917 nigel 87 /* Read just a single-byte hex-defined char */
918 nigel 77
919     c = 0;
920     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
921     {
922 ph10 391 int cc; /* Some compilers don't like */
923     cc = *(++ptr); /* ++ in initializers */
924     #ifndef EBCDIC /* ASCII/UTF-8 coding */
925     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
926     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
927 ph10 97 #else /* EBCDIC coding */
928 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
929     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
930 nigel 77 #endif
931     }
932     break;
933    
934 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
935 ph10 574 An error is given if the byte following \c is not an ASCII character. This
936     coding is ASCII-specific, but then the whole concept of \cx is
937 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
938 nigel 77
939 ph10 391 case CHAR_c:
940 nigel 77 c = *(++ptr);
941     if (c == 0)
942     {
943     *errorcodeptr = ERR2;
944 ph10 213 break;
945 nigel 77 }
946 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
947     if (c > 127) /* Excludes all non-ASCII in either mode */
948     {
949     *errorcodeptr = ERR68;
950 ph10 579 break;
951     }
952 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
953 nigel 77 c ^= 0x40;
954 ph10 574 #else /* EBCDIC coding */
955 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
956 nigel 77 c ^= 0xC0;
957     #endif
958     break;
959    
960     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
961 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
962     otherwise, for Perl compatibility, it is a literal. This code looks a bit
963     odd, but there used to be some cases other than the default, and there may
964     be again in future, so I haven't "optimized" it. */
965 nigel 77
966     default:
967     if ((options & PCRE_EXTRA) != 0) switch(c)
968     {
969     default:
970     *errorcodeptr = ERR3;
971     break;
972     }
973     break;
974     }
975     }
976 ph10 518
977     /* Perl supports \N{name} for character names, as well as plain \N for "not
978 ph10 654 newline". PCRE does not support \N{name}. However, it does support
979 ph10 640 quantification such as \N{2,3}. */
980 nigel 77
981 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
982     !is_counted_repeat(ptr+2))
983 ph10 518 *errorcodeptr = ERR37;
984 ph10 514
985 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
986    
987     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
988     c -= (ESC_DU - ESC_D);
989    
990     /* Set the pointer to the final character before returning. */
991    
992 nigel 77 *ptrptr = ptr;
993     return c;
994     }
995    
996    
997    
998     #ifdef SUPPORT_UCP
999     /*************************************************
1000     * Handle \P and \p *
1001     *************************************************/
1002    
1003     /* This function is called after \P or \p has been encountered, provided that
1004     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1005     pointing at the P or p. On exit, it is pointing at the final character of the
1006     escape sequence.
1007    
1008     Argument:
1009     ptrptr points to the pattern position pointer
1010     negptr points to a boolean that is set TRUE for negation else FALSE
1011 nigel 87 dptr points to an int that is set to the detailed property value
1012 nigel 77 errorcodeptr points to the error code variable
1013    
1014 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
1015 nigel 77 */
1016    
1017     static int
1018 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1019 nigel 77 {
1020     int c, i, bot, top;
1021     const uschar *ptr = *ptrptr;
1022 nigel 87 char name[32];
1023 nigel 77
1024     c = *(++ptr);
1025     if (c == 0) goto ERROR_RETURN;
1026    
1027     *negptr = FALSE;
1028    
1029 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1030     negation. */
1031 nigel 77
1032 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
1033 nigel 77 {
1034 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1035 nigel 77 {
1036     *negptr = TRUE;
1037     ptr++;
1038     }
1039 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
1040 nigel 77 {
1041     c = *(++ptr);
1042     if (c == 0) goto ERROR_RETURN;
1043 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1044 nigel 77 name[i] = c;
1045     }
1046 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1047 nigel 77 name[i] = 0;
1048     }
1049    
1050     /* Otherwise there is just one following character */
1051    
1052     else
1053     {
1054     name[0] = c;
1055     name[1] = 0;
1056     }
1057    
1058     *ptrptr = ptr;
1059    
1060     /* Search for a recognized property name using binary chop */
1061    
1062     bot = 0;
1063     top = _pcre_utt_size;
1064    
1065     while (bot < top)
1066     {
1067 nigel 87 i = (bot + top) >> 1;
1068 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
1069 nigel 87 if (c == 0)
1070     {
1071     *dptr = _pcre_utt[i].value;
1072     return _pcre_utt[i].type;
1073     }
1074 nigel 77 if (c > 0) bot = i + 1; else top = i;
1075     }
1076    
1077     *errorcodeptr = ERR47;
1078     *ptrptr = ptr;
1079     return -1;
1080    
1081     ERROR_RETURN:
1082     *errorcodeptr = ERR46;
1083     *ptrptr = ptr;
1084     return -1;
1085     }
1086     #endif
1087    
1088    
1089    
1090    
1091     /*************************************************
1092     * Read repeat counts *
1093     *************************************************/
1094    
1095     /* Read an item of the form {n,m} and return the values. This is called only
1096     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1097     so the syntax is guaranteed to be correct, but we need to check the values.
1098    
1099     Arguments:
1100     p pointer to first char after '{'
1101     minp pointer to int for min
1102     maxp pointer to int for max
1103     returned as -1 if no max
1104     errorcodeptr points to error code variable
1105    
1106     Returns: pointer to '}' on success;
1107     current ptr on error, with errorcodeptr set non-zero
1108     */
1109    
1110     static const uschar *
1111     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1112     {
1113     int min = 0;
1114     int max = -1;
1115    
1116 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1117     an integer overflow. */
1118    
1119 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1120 nigel 81 if (min < 0 || min > 65535)
1121     {
1122     *errorcodeptr = ERR5;
1123     return p;
1124     }
1125 nigel 77
1126 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1127     Also, max must not be less than min. */
1128    
1129 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1130 nigel 77 {
1131 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1132 nigel 77 {
1133     max = 0;
1134 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1135 nigel 81 if (max < 0 || max > 65535)
1136     {
1137     *errorcodeptr = ERR5;
1138     return p;
1139     }
1140 nigel 77 if (max < min)
1141     {
1142     *errorcodeptr = ERR4;
1143     return p;
1144     }
1145     }
1146     }
1147    
1148 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1149     '}'. */
1150 nigel 77
1151 nigel 81 *minp = min;
1152     *maxp = max;
1153 nigel 77 return p;
1154     }
1155    
1156    
1157    
1158     /*************************************************
1159 ph10 408 * Subroutine for finding forward reference *
1160 nigel 91 *************************************************/
1161    
1162 ph10 408 /* This recursive function is called only from find_parens() below. The
1163     top-level call starts at the beginning of the pattern. All other calls must
1164     start at a parenthesis. It scans along a pattern's text looking for capturing
1165 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1166     name it is given, it returns its number. Alternatively, if the name is NULL, it
1167 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1168     track of subpatterns that reset the capturing group numbers - the (?| feature.
1169 nigel 91
1170 ph10 578 This function was originally called only from the second pass, in which we know
1171     that if (?< or (?' or (?P< is encountered, the name will be correctly
1172     terminated because that is checked in the first pass. There is now one call to
1173     this function in the first pass, to check for a recursive back reference by
1174     name (so that we can make the whole group atomic). In this case, we need check
1175 ph10 579 only up to the current position in the pattern, and that is still OK because
1176     and previous occurrences will have been checked. To make this work, the test
1177     for "end of pattern" is a check against cd->end_pattern in the main loop,
1178 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1179 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1180     processing items within the loop are OK, because afterwards the main loop will
1181 ph10 578 terminate.)
1182    
1183 nigel 91 Arguments:
1184 ph10 408 ptrptr address of the current character pointer (updated)
1185 ph10 345 cd compile background data
1186 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1187     lorn name length, or subpattern number if name is NULL
1188     xmode TRUE if we are in /x mode
1189 ph10 579 utf8 TRUE if we are in UTF-8 mode
1190 ph10 411 count pointer to the current capturing subpattern number (updated)
1191 nigel 91
1192     Returns: the number of the named subpattern, or -1 if not found
1193     */
1194    
1195     static int
1196 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1197 ph10 556 BOOL xmode, BOOL utf8, int *count)
1198 nigel 91 {
1199 ph10 408 uschar *ptr = *ptrptr;
1200     int start_count = *count;
1201     int hwm_count = start_count;
1202     BOOL dup_parens = FALSE;
1203 nigel 93
1204 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1205 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1206    
1207     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1208     {
1209 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1210 ph10 545
1211 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1212 ph10 545
1213 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1214 ph10 408
1215 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1216 ph10 408 {
1217     *count += 1;
1218     if (name == NULL && *count == lorn) return *count;
1219 ph10 411 ptr++;
1220 ph10 408 }
1221    
1222 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1223     where the parenthesis numbers are duplicated. */
1224    
1225     else if (ptr[2] == CHAR_VERTICAL_LINE)
1226     {
1227     ptr += 3;
1228     dup_parens = TRUE;
1229     }
1230 ph10 545
1231 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1232    
1233     else if (ptr[2] == CHAR_NUMBER_SIGN)
1234     {
1235     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1236     goto FAIL_EXIT;
1237 ph10 545 }
1238 ph10 544
1239 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1240     is processed as normal. If not, skip to the closing parenthesis of the
1241 ph10 544 condition (there can't be any nested parens). */
1242 ph10 411
1243 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1244     {
1245 ph10 411 ptr += 2;
1246 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1247     {
1248     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1249 ph10 411 if (*ptr != 0) ptr++;
1250 ph10 408 }
1251 ph10 411 }
1252    
1253 ph10 544 /* Start with (? but not a condition. */
1254 ph10 408
1255     else
1256 ph10 411 {
1257 ph10 408 ptr += 2;
1258     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1259    
1260     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1261 ph10 411
1262 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1263     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1264     {
1265     int term;
1266     const uschar *thisname;
1267     *count += 1;
1268     if (name == NULL && *count == lorn) return *count;
1269     term = *ptr++;
1270     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1271     thisname = ptr;
1272     while (*ptr != term) ptr++;
1273     if (name != NULL && lorn == ptr - thisname &&
1274     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1275     return *count;
1276 ph10 461 term++;
1277 ph10 411 }
1278 ph10 408 }
1279 ph10 411 }
1280 ph10 408
1281 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1282 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1283     first-pass call when this value is temporarily adjusted to stop at the current
1284 ph10 578 position. So DO NOT change this to a test for binary zero. */
1285 ph10 408
1286 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1287 nigel 91 {
1288 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1289    
1290 ph10 391 if (*ptr == CHAR_BACKSLASH)
1291 nigel 93 {
1292 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1293 ph10 391 if (*ptr == CHAR_Q) for (;;)
1294 nigel 93 {
1295 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1296 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1297 ph10 391 if (*(++ptr) == CHAR_E) break;
1298 nigel 93 }
1299     continue;
1300     }
1301    
1302 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1303     are handled for real. If the first character is '^', skip it. Also, if the
1304     first few characters (either before or after ^) are \Q\E or \E we skip them
1305 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1306 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1307 nigel 93
1308 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1309 nigel 93 {
1310 ph10 340 BOOL negate_class = FALSE;
1311     for (;;)
1312     {
1313 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1314 ph10 340 {
1315 ph10 438 if (ptr[2] == CHAR_E)
1316     ptr+= 2;
1317     else if (strncmp((const char *)ptr+2,
1318 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1319 ph10 438 ptr += 4;
1320 ph10 392 else
1321 ph10 391 break;
1322 ph10 340 }
1323 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1324 ph10 461 {
1325 ph10 340 negate_class = TRUE;
1326 ph10 438 ptr++;
1327 ph10 461 }
1328 ph10 340 else break;
1329     }
1330    
1331     /* If the next character is ']', it is a data character that must be
1332 ph10 341 skipped, except in JavaScript compatibility mode. */
1333 ph10 345
1334 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1335 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1336 ph10 345 ptr++;
1337    
1338 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1339 nigel 93 {
1340 ph10 220 if (*ptr == 0) return -1;
1341 ph10 391 if (*ptr == CHAR_BACKSLASH)
1342 nigel 93 {
1343 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1344 ph10 391 if (*ptr == CHAR_Q) for (;;)
1345 nigel 93 {
1346 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1347 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1348 ph10 391 if (*(++ptr) == CHAR_E) break;
1349 nigel 93 }
1350     continue;
1351     }
1352     }
1353     continue;
1354     }
1355    
1356     /* Skip comments in /x mode */
1357    
1358 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1359 nigel 93 {
1360 ph10 579 ptr++;
1361 ph10 556 while (*ptr != 0)
1362     {
1363     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1364     ptr++;
1365 ph10 579 #ifdef SUPPORT_UTF8
1366 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1367     #endif
1368     }
1369 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1370 nigel 93 continue;
1371     }
1372    
1373 ph10 408 /* Check for the special metacharacters */
1374 ph10 411
1375 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1376 nigel 93 {
1377 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1378 ph10 408 if (rc > 0) return rc;
1379     if (*ptr == 0) goto FAIL_EXIT;
1380 nigel 93 }
1381 ph10 411
1382 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1383     {
1384     if (dup_parens && *count < hwm_count) *count = hwm_count;
1385 ph10 545 goto FAIL_EXIT;
1386 ph10 408 }
1387 ph10 411
1388     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1389 ph10 408 {
1390     if (*count > hwm_count) hwm_count = *count;
1391     *count = start_count;
1392 ph10 411 }
1393 ph10 408 }
1394 nigel 93
1395 ph10 408 FAIL_EXIT:
1396     *ptrptr = ptr;
1397     return -1;
1398     }
1399 nigel 93
1400    
1401    
1402    
1403 ph10 408 /*************************************************
1404     * Find forward referenced subpattern *
1405     *************************************************/
1406 nigel 93
1407 ph10 408 /* This function scans along a pattern's text looking for capturing
1408     subpatterns, and counting them. If it finds a named pattern that matches the
1409     name it is given, it returns its number. Alternatively, if the name is NULL, it
1410     returns when it reaches a given numbered subpattern. This is used for forward
1411     references to subpatterns. We used to be able to start this scan from the
1412     current compiling point, using the current count value from cd->bracount, and
1413     do it all in a single loop, but the addition of the possibility of duplicate
1414     subpattern numbers means that we have to scan from the very start, in order to
1415     take account of such duplicates, and to use a recursive function to keep track
1416     of the different types of group.
1417    
1418     Arguments:
1419     cd compile background data
1420     name name to seek, or NULL if seeking a numbered subpattern
1421     lorn name length, or subpattern number if name is NULL
1422     xmode TRUE if we are in /x mode
1423 ph10 579 utf8 TRUE if we are in UTF-8 mode
1424 ph10 408
1425     Returns: the number of the found subpattern, or -1 if not found
1426     */
1427    
1428     static int
1429 ph10 556 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1430     BOOL utf8)
1431 ph10 408 {
1432     uschar *ptr = (uschar *)cd->start_pattern;
1433     int count = 0;
1434     int rc;
1435    
1436     /* If the pattern does not start with an opening parenthesis, the first call
1437     to find_parens_sub() will scan right to the end (if necessary). However, if it
1438     does start with a parenthesis, find_parens_sub() will return when it hits the
1439     matching closing parens. That is why we have to have a loop. */
1440    
1441 ph10 411 for (;;)
1442     {
1443 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1444 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1445     }
1446    
1447 ph10 408 return rc;
1448 nigel 91 }
1449    
1450    
1451    
1452 ph10 408
1453 nigel 91 /*************************************************
1454 nigel 77 * Find first significant op code *
1455     *************************************************/
1456    
1457     /* This is called by several functions that scan a compiled expression looking
1458     for a fixed first character, or an anchoring op code etc. It skips over things
1459 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1460     forward and all backward assertions, and also the \b assertion; for others it
1461     does not.
1462 nigel 77
1463     Arguments:
1464     code pointer to the start of the group
1465     skipassert TRUE if certain assertions are to be skipped
1466    
1467     Returns: pointer to the first significant opcode
1468     */
1469    
1470     static const uschar*
1471 ph10 604 first_significant_code(const uschar *code, BOOL skipassert)
1472 nigel 77 {
1473     for (;;)
1474     {
1475     switch ((int)*code)
1476     {
1477     case OP_ASSERT_NOT:
1478     case OP_ASSERTBACK:
1479     case OP_ASSERTBACK_NOT:
1480     if (!skipassert) return code;
1481     do code += GET(code, 1); while (*code == OP_ALT);
1482     code += _pcre_OP_lengths[*code];
1483     break;
1484    
1485     case OP_WORD_BOUNDARY:
1486     case OP_NOT_WORD_BOUNDARY:
1487     if (!skipassert) return code;
1488     /* Fall through */
1489    
1490     case OP_CALLOUT:
1491     case OP_CREF:
1492 ph10 459 case OP_NCREF:
1493 nigel 93 case OP_RREF:
1494 ph10 459 case OP_NRREF:
1495 nigel 93 case OP_DEF:
1496 nigel 77 code += _pcre_OP_lengths[*code];
1497     break;
1498    
1499     default:
1500     return code;
1501     }
1502     }
1503     /* Control never reaches here */
1504     }
1505    
1506    
1507    
1508    
1509     /*************************************************
1510 ph10 454 * Find the fixed length of a branch *
1511 nigel 77 *************************************************/
1512    
1513 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1514 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1515 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1516 ph10 454 temporarily terminated with OP_END when this function is called.
1517 nigel 77
1518 ph10 461 This function is called when a backward assertion is encountered, so that if it
1519     fails, the error message can point to the correct place in the pattern.
1520 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1521 ph10 461 because they can be forward references. We solve this by remembering this case
1522 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1523    
1524 nigel 77 Arguments:
1525     code points to the start of the pattern (the bracket)
1526 ph10 604 utf8 TRUE in UTF-8 mode
1527 ph10 461 atend TRUE if called when the pattern is complete
1528     cd the "compile data" structure
1529 nigel 77
1530 ph10 461 Returns: the fixed length,
1531 ph10 454 or -1 if there is no fixed length,
1532 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1533 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1534 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1535 nigel 77 */
1536    
1537     static int
1538 ph10 604 find_fixedlength(uschar *code, BOOL utf8, BOOL atend, compile_data *cd)
1539 nigel 77 {
1540     int length = -1;
1541    
1542     register int branchlength = 0;
1543     register uschar *cc = code + 1 + LINK_SIZE;
1544    
1545     /* Scan along the opcodes for this branch. If we get to the end of the
1546     branch, check the length against that of the other branches. */
1547    
1548     for (;;)
1549     {
1550     int d;
1551 ph10 454 uschar *ce, *cs;
1552 nigel 77 register int op = *cc;
1553     switch (op)
1554     {
1555 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1556     OP_BRA (normal non-capturing bracket) because the other variants of these
1557     opcodes are all concerned with unlimited repeated groups, which of course
1558 ph10 747 are not of fixed length. */
1559 ph10 604
1560 nigel 93 case OP_CBRA:
1561 nigel 77 case OP_BRA:
1562     case OP_ONCE:
1563 ph10 733 case OP_ONCE_NC:
1564 nigel 77 case OP_COND:
1565 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1566 nigel 77 if (d < 0) return d;
1567     branchlength += d;
1568     do cc += GET(cc, 1); while (*cc == OP_ALT);
1569     cc += 1 + LINK_SIZE;
1570     break;
1571    
1572 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1573     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1574     an ALT. If it is END it's the end of the outer call. All can be handled by
1575     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1576     because they all imply an unlimited repeat. */
1577 nigel 77
1578     case OP_ALT:
1579     case OP_KET:
1580     case OP_END:
1581 ph10 747 case OP_ACCEPT:
1582     case OP_ASSERT_ACCEPT:
1583 nigel 77 if (length < 0) length = branchlength;
1584     else if (length != branchlength) return -1;
1585     if (*cc != OP_ALT) return length;
1586     cc += 1 + LINK_SIZE;
1587     branchlength = 0;
1588     break;
1589 ph10 461
1590 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1591     be OK. If the subroutine is a forward reference, we can't deal with
1592     it until the end of the pattern, so return -3. */
1593 ph10 461
1594 ph10 454 case OP_RECURSE:
1595     if (!atend) return -3;
1596     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1597     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1598     if (cc > cs && cc < ce) return -1; /* Recursion */
1599 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1600 ph10 461 if (d < 0) return d;
1601 ph10 454 branchlength += d;
1602     cc += 1 + LINK_SIZE;
1603 ph10 461 break;
1604 nigel 77
1605     /* Skip over assertive subpatterns */
1606    
1607     case OP_ASSERT:
1608     case OP_ASSERT_NOT:
1609     case OP_ASSERTBACK:
1610     case OP_ASSERTBACK_NOT:
1611     do cc += GET(cc, 1); while (*cc == OP_ALT);
1612     /* Fall through */
1613    
1614     /* Skip over things that don't match chars */
1615    
1616 ph10 747 case OP_MARK:
1617     case OP_PRUNE_ARG:
1618     case OP_SKIP_ARG:
1619     case OP_THEN_ARG:
1620     cc += cc[1] + _pcre_OP_lengths[*cc];
1621     break;
1622    
1623 nigel 77 case OP_CALLOUT:
1624     case OP_CIRC:
1625 ph10 602 case OP_CIRCM:
1626 ph10 747 case OP_CLOSE:
1627     case OP_COMMIT:
1628     case OP_CREF:
1629     case OP_DEF:
1630 nigel 77 case OP_DOLL:
1631 ph10 602 case OP_DOLLM:
1632 ph10 747 case OP_EOD:
1633     case OP_EODN:
1634     case OP_FAIL:
1635     case OP_NCREF:
1636     case OP_NRREF:
1637 nigel 77 case OP_NOT_WORD_BOUNDARY:
1638 ph10 747 case OP_PRUNE:
1639     case OP_REVERSE:
1640     case OP_RREF:
1641     case OP_SET_SOM:
1642     case OP_SKIP:
1643     case OP_SOD:
1644     case OP_SOM:
1645     case OP_THEN:
1646 nigel 77 case OP_WORD_BOUNDARY:
1647     cc += _pcre_OP_lengths[*cc];
1648     break;
1649    
1650     /* Handle literal characters */
1651    
1652     case OP_CHAR:
1653 ph10 602 case OP_CHARI:
1654 nigel 91 case OP_NOT:
1655 ph10 604 case OP_NOTI:
1656 nigel 77 branchlength++;
1657     cc += 2;
1658     #ifdef SUPPORT_UTF8
1659 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1660 nigel 77 #endif
1661     break;
1662    
1663     /* Handle exact repetitions. The count is already in characters, but we
1664     need to skip over a multibyte character in UTF8 mode. */
1665    
1666     case OP_EXACT:
1667 ph10 747 case OP_EXACTI:
1668     case OP_NOTEXACT:
1669     case OP_NOTEXACTI:
1670 nigel 77 branchlength += GET2(cc,1);
1671     cc += 4;
1672     #ifdef SUPPORT_UTF8
1673 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1674 nigel 77 #endif
1675     break;
1676    
1677     case OP_TYPEEXACT:
1678     branchlength += GET2(cc,1);
1679 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1680 nigel 77 cc += 4;
1681     break;
1682    
1683     /* Handle single-char matchers */
1684    
1685     case OP_PROP:
1686     case OP_NOTPROP:
1687 nigel 87 cc += 2;
1688 nigel 77 /* Fall through */
1689    
1690 ph10 747 case OP_HSPACE:
1691     case OP_VSPACE:
1692     case OP_NOT_HSPACE:
1693     case OP_NOT_VSPACE:
1694 nigel 77 case OP_NOT_DIGIT:
1695     case OP_DIGIT:
1696     case OP_NOT_WHITESPACE:
1697     case OP_WHITESPACE:
1698     case OP_NOT_WORDCHAR:
1699     case OP_WORDCHAR:
1700     case OP_ANY:
1701 ph10 342 case OP_ALLANY:
1702 nigel 77 branchlength++;
1703     cc++;
1704     break;
1705    
1706 ph10 754 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1707     otherwise \C is coded as OP_ALLANY. */
1708 nigel 77
1709     case OP_ANYBYTE:
1710     return -2;
1711    
1712     /* Check a class for variable quantification */
1713    
1714     #ifdef SUPPORT_UTF8
1715     case OP_XCLASS:
1716     cc += GET(cc, 1) - 33;
1717     /* Fall through */
1718     #endif
1719    
1720     case OP_CLASS:
1721     case OP_NCLASS:
1722     cc += 33;
1723    
1724     switch (*cc)
1725     {
1726 ph10 747 case OP_CRPLUS:
1727     case OP_CRMINPLUS:
1728 nigel 77 case OP_CRSTAR:
1729     case OP_CRMINSTAR:
1730     case OP_CRQUERY:
1731     case OP_CRMINQUERY:
1732     return -1;
1733    
1734     case OP_CRRANGE:
1735     case OP_CRMINRANGE:
1736     if (GET2(cc,1) != GET2(cc,3)) return -1;
1737     branchlength += GET2(cc,1);
1738     cc += 5;
1739     break;
1740    
1741     default:
1742     branchlength++;
1743     }
1744     break;
1745    
1746     /* Anything else is variable length */
1747    
1748 ph10 747 case OP_ANYNL:
1749     case OP_BRAMINZERO:
1750     case OP_BRAPOS:
1751     case OP_BRAPOSZERO:
1752     case OP_BRAZERO:
1753     case OP_CBRAPOS:
1754     case OP_EXTUNI:
1755     case OP_KETRMAX:
1756     case OP_KETRMIN:
1757     case OP_KETRPOS:
1758     case OP_MINPLUS:
1759     case OP_MINPLUSI:
1760     case OP_MINQUERY:
1761     case OP_MINQUERYI:
1762     case OP_MINSTAR:
1763     case OP_MINSTARI:
1764     case OP_MINUPTO:
1765     case OP_MINUPTOI:
1766     case OP_NOTMINPLUS:
1767     case OP_NOTMINPLUSI:
1768     case OP_NOTMINQUERY:
1769     case OP_NOTMINQUERYI:
1770     case OP_NOTMINSTAR:
1771     case OP_NOTMINSTARI:
1772     case OP_NOTMINUPTO:
1773     case OP_NOTMINUPTOI:
1774     case OP_NOTPLUS:
1775     case OP_NOTPLUSI:
1776     case OP_NOTPOSPLUS:
1777     case OP_NOTPOSPLUSI:
1778     case OP_NOTPOSQUERY:
1779     case OP_NOTPOSQUERYI:
1780     case OP_NOTPOSSTAR:
1781     case OP_NOTPOSSTARI:
1782     case OP_NOTPOSUPTO:
1783     case OP_NOTPOSUPTOI:
1784     case OP_NOTQUERY:
1785     case OP_NOTQUERYI:
1786     case OP_NOTSTAR:
1787     case OP_NOTSTARI:
1788     case OP_NOTUPTO:
1789     case OP_NOTUPTOI:
1790     case OP_PLUS:
1791     case OP_PLUSI:
1792     case OP_POSPLUS:
1793     case OP_POSPLUSI:
1794     case OP_POSQUERY:
1795     case OP_POSQUERYI:
1796     case OP_POSSTAR:
1797     case OP_POSSTARI:
1798     case OP_POSUPTO:
1799     case OP_POSUPTOI:
1800     case OP_QUERY:
1801     case OP_QUERYI:
1802     case OP_REF:
1803     case OP_REFI:
1804     case OP_SBRA:
1805     case OP_SBRAPOS:
1806     case OP_SCBRA:
1807     case OP_SCBRAPOS:
1808     case OP_SCOND:
1809     case OP_SKIPZERO:
1810     case OP_STAR:
1811     case OP_STARI:
1812     case OP_TYPEMINPLUS:
1813     case OP_TYPEMINQUERY:
1814     case OP_TYPEMINSTAR:
1815     case OP_TYPEMINUPTO:
1816     case OP_TYPEPLUS:
1817     case OP_TYPEPOSPLUS:
1818     case OP_TYPEPOSQUERY:
1819     case OP_TYPEPOSSTAR:
1820     case OP_TYPEPOSUPTO:
1821     case OP_TYPEQUERY:
1822     case OP_TYPESTAR:
1823     case OP_TYPEUPTO:
1824     case OP_UPTO:
1825     case OP_UPTOI:
1826     return -1;
1827    
1828     /* Catch unrecognized opcodes so that when new ones are added they
1829     are not forgotten, as has happened in the past. */
1830    
1831 nigel 77 default:
1832 ph10 747 return -4;
1833 nigel 77 }
1834     }
1835     /* Control never gets here */
1836     }
1837    
1838    
1839    
1840    
1841     /*************************************************
1842 ph10 454 * Scan compiled regex for specific bracket *
1843 nigel 77 *************************************************/
1844    
1845     /* This little function scans through a compiled pattern until it finds a
1846 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1847 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1848     so that it can be called from pcre_study() when finding the minimum matching
1849 ph10 455 length.
1850 nigel 77
1851     Arguments:
1852     code points to start of expression
1853     utf8 TRUE in UTF-8 mode
1854 ph10 454 number the required bracket number or negative to find a lookbehind
1855 nigel 77
1856     Returns: pointer to the opcode for the bracket, or NULL if not found
1857     */
1858    
1859 ph10 455 const uschar *
1860     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1861 nigel 77 {
1862     for (;;)
1863     {
1864     register int c = *code;
1865 ph10 618
1866 nigel 77 if (c == OP_END) return NULL;
1867 nigel 91
1868     /* XCLASS is used for classes that cannot be represented just by a bit
1869     map. This includes negated single high-valued characters. The length in
1870     the table is zero; the actual length is stored in the compiled code. */
1871    
1872     if (c == OP_XCLASS) code += GET(code, 1);
1873 ph10 461
1874 ph10 454 /* Handle recursion */
1875 ph10 461
1876 ph10 454 else if (c == OP_REVERSE)
1877     {
1878 ph10 461 if (number < 0) return (uschar *)code;
1879 ph10 454 code += _pcre_OP_lengths[c];
1880     }
1881 nigel 91
1882 nigel 93 /* Handle capturing bracket */
1883 nigel 91
1884 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1885     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1886 nigel 77 {
1887 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1888 nigel 77 if (n == number) return (uschar *)code;
1889 nigel 93 code += _pcre_OP_lengths[c];
1890 nigel 77 }
1891 nigel 91
1892 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1893     repeated character types, we have to test for \p and \P, which have an extra
1894 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1895 ph10 510 must add in its length. */
1896 nigel 91
1897 nigel 77 else
1898     {
1899 ph10 218 switch(c)
1900     {
1901     case OP_TYPESTAR:
1902     case OP_TYPEMINSTAR:
1903     case OP_TYPEPLUS:
1904     case OP_TYPEMINPLUS:
1905     case OP_TYPEQUERY:
1906     case OP_TYPEMINQUERY:
1907     case OP_TYPEPOSSTAR:
1908     case OP_TYPEPOSPLUS:
1909     case OP_TYPEPOSQUERY:
1910     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1911 ph10 220 break;
1912 ph10 221
1913     case OP_TYPEUPTO:
1914     case OP_TYPEMINUPTO:
1915     case OP_TYPEEXACT:
1916     case OP_TYPEPOSUPTO:
1917     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1918     break;
1919 ph10 512
1920 ph10 510 case OP_MARK:
1921     case OP_PRUNE_ARG:
1922     case OP_SKIP_ARG:
1923     code += code[1];
1924 ph10 512 break;
1925 ph10 550
1926     case OP_THEN_ARG:
1927 ph10 716 code += code[1];
1928 ph10 550 break;
1929 ph10 220 }
1930    
1931 ph10 218 /* Add in the fixed length from the table */
1932 ph10 220
1933 nigel 77 code += _pcre_OP_lengths[c];
1934 ph10 220
1935 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1936     a multi-byte character. The length in the table is a minimum, so we have to
1937     arrange to skip the extra bytes. */
1938 ph10 220
1939 ph10 107 #ifdef SUPPORT_UTF8
1940 nigel 77 if (utf8) switch(c)
1941     {
1942     case OP_CHAR:
1943 ph10 602 case OP_CHARI:
1944 nigel 77 case OP_EXACT:
1945 ph10 602 case OP_EXACTI:
1946 nigel 77 case OP_UPTO:
1947 ph10 602 case OP_UPTOI:
1948 nigel 77 case OP_MINUPTO:
1949 ph10 602 case OP_MINUPTOI:
1950 nigel 93 case OP_POSUPTO:
1951 ph10 602 case OP_POSUPTOI:
1952 nigel 77 case OP_STAR:
1953 ph10 602 case OP_STARI:
1954 nigel 77 case OP_MINSTAR:
1955 ph10 602 case OP_MINSTARI:
1956 nigel 93 case OP_POSSTAR:
1957 ph10 602 case OP_POSSTARI:
1958 nigel 77 case OP_PLUS:
1959 ph10 602 case OP_PLUSI:
1960 nigel 77 case OP_MINPLUS:
1961 ph10 602 case OP_MINPLUSI:
1962 nigel 93 case OP_POSPLUS:
1963 ph10 602 case OP_POSPLUSI:
1964 nigel 77 case OP_QUERY:
1965 ph10 602 case OP_QUERYI:
1966 nigel 77 case OP_MINQUERY:
1967 ph10 602 case OP_MINQUERYI:
1968 nigel 93 case OP_POSQUERY:
1969 ph10 602 case OP_POSQUERYI:
1970 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1971 nigel 77 break;
1972     }
1973 ph10 369 #else
1974     (void)(utf8); /* Keep compiler happy by referencing function argument */
1975 ph10 111 #endif
1976 nigel 77 }
1977     }
1978     }
1979    
1980    
1981    
1982     /*************************************************
1983     * Scan compiled regex for recursion reference *
1984     *************************************************/
1985    
1986     /* This little function scans through a compiled pattern until it finds an
1987     instance of OP_RECURSE.
1988    
1989     Arguments:
1990     code points to start of expression
1991     utf8 TRUE in UTF-8 mode
1992    
1993     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1994     */
1995    
1996     static const uschar *
1997     find_recurse(const uschar *code, BOOL utf8)
1998     {
1999     for (;;)
2000     {
2001     register int c = *code;
2002     if (c == OP_END) return NULL;
2003 nigel 91 if (c == OP_RECURSE) return code;
2004 ph10 220
2005 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2006     map. This includes negated single high-valued characters. The length in
2007     the table is zero; the actual length is stored in the compiled code. */
2008    
2009     if (c == OP_XCLASS) code += GET(code, 1);
2010    
2011 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2012     repeated character types, we have to test for \p and \P, which have an extra
2013 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2014 ph10 510 must add in its length. */
2015 nigel 91
2016 nigel 77 else
2017     {
2018 ph10 218 switch(c)
2019     {
2020     case OP_TYPESTAR:
2021     case OP_TYPEMINSTAR:
2022     case OP_TYPEPLUS:
2023     case OP_TYPEMINPLUS:
2024     case OP_TYPEQUERY:
2025     case OP_TYPEMINQUERY:
2026     case OP_TYPEPOSSTAR:
2027     case OP_TYPEPOSPLUS:
2028     case OP_TYPEPOSQUERY:
2029     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2030 ph10 220 break;
2031 ph10 221
2032     case OP_TYPEPOSUPTO:
2033     case OP_TYPEUPTO:
2034     case OP_TYPEMINUPTO:
2035     case OP_TYPEEXACT:
2036     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2037     break;
2038 ph10 512
2039 ph10 510 case OP_MARK:
2040     case OP_PRUNE_ARG:
2041     case OP_SKIP_ARG:
2042     code += code[1];
2043 ph10 512 break;
2044 ph10 550
2045     case OP_THEN_ARG:
2046 ph10 716 code += code[1];
2047 ph10 550 break;
2048 ph10 220 }
2049    
2050 ph10 218 /* Add in the fixed length from the table */
2051    
2052 nigel 77 code += _pcre_OP_lengths[c];
2053 ph10 220
2054 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2055     by a multi-byte character. The length in the table is a minimum, so we have
2056     to arrange to skip the extra bytes. */
2057 ph10 220
2058 ph10 107 #ifdef SUPPORT_UTF8
2059 nigel 77 if (utf8) switch(c)
2060     {
2061     case OP_CHAR:
2062 ph10 602 case OP_CHARI:
2063 nigel 77 case OP_EXACT:
2064 ph10 602 case OP_EXACTI:
2065 nigel 77 case OP_UPTO:
2066 ph10 602 case OP_UPTOI:
2067 nigel 77 case OP_MINUPTO:
2068 ph10 602 case OP_MINUPTOI:
2069 nigel 93 case OP_POSUPTO:
2070 ph10 602 case OP_POSUPTOI:
2071 nigel 77 case OP_STAR:
2072 ph10 602 case OP_STARI:
2073 nigel 77 case OP_MINSTAR:
2074 ph10 602 case OP_MINSTARI:
2075 nigel 93 case OP_POSSTAR:
2076 ph10 602 case OP_POSSTARI:
2077 nigel 77 case OP_PLUS:
2078 ph10 602 case OP_PLUSI:
2079 nigel 77 case OP_MINPLUS:
2080 ph10 602 case OP_MINPLUSI:
2081 nigel 93 case OP_POSPLUS:
2082 ph10 602 case OP_POSPLUSI:
2083 nigel 77 case OP_QUERY:
2084 ph10 602 case OP_QUERYI:
2085 nigel 77 case OP_MINQUERY:
2086 ph10 602 case OP_MINQUERYI:
2087 nigel 93 case OP_POSQUERY:
2088 ph10 602 case OP_POSQUERYI:
2089 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2090 nigel 77 break;
2091     }
2092 ph10 369 #else
2093     (void)(utf8); /* Keep compiler happy by referencing function argument */
2094 ph10 111 #endif
2095 nigel 77 }
2096     }
2097     }
2098    
2099    
2100    
2101     /*************************************************
2102     * Scan compiled branch for non-emptiness *
2103     *************************************************/
2104    
2105     /* This function scans through a branch of a compiled pattern to see whether it
2106 nigel 93 can match the empty string or not. It is called from could_be_empty()
2107     below and from compile_branch() when checking for an unlimited repeat of a
2108     group that can match nothing. Note that first_significant_code() skips over
2109 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2110     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2111     bracket whose current branch will already have been scanned.
2112 nigel 77
2113     Arguments:
2114     code points to start of search
2115     endcode points to where to stop
2116     utf8 TRUE if in UTF8 mode
2117 ph10 503 cd contains pointers to tables etc.
2118 nigel 77
2119     Returns: TRUE if what is matched could be empty
2120     */
2121    
2122     static BOOL
2123 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
2124     compile_data *cd)
2125 nigel 77 {
2126     register int c;
2127 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2128 nigel 77 code < endcode;
2129 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2130 nigel 77 {
2131     const uschar *ccode;
2132    
2133     c = *code;
2134 ph10 507
2135 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2136 ph10 282 first_significant_code() with a TRUE final argument. */
2137 ph10 286
2138 ph10 282 if (c == OP_ASSERT)
2139 ph10 286 {
2140 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2141     c = *code;
2142     continue;
2143 ph10 286 }
2144 ph10 172
2145 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2146 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2147     forward reference subroutine call, we can't. To detect forward reference
2148 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2149     called only when doing the real compile, not during the pre-compile that
2150 ph10 624 measures the size of the compiled pattern. */
2151 ph10 507
2152 ph10 503 if (c == OP_RECURSE)
2153     {
2154 ph10 624 const uschar *scode;
2155     BOOL empty_branch;
2156 ph10 654
2157 ph10 624 /* Test for forward reference */
2158 ph10 654
2159 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2160 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2161 ph10 624
2162     /* Not a forward reference, test for completed backward reference */
2163 ph10 654
2164 ph10 624 empty_branch = FALSE;
2165     scode = cd->start_code + GET(code, 1);
2166 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2167 ph10 654
2168 ph10 624 /* Completed backwards reference */
2169 ph10 654
2170 ph10 503 do
2171     {
2172 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2173     {
2174     empty_branch = TRUE;
2175 ph10 507 break;
2176     }
2177 ph10 503 scode += GET(scode, 1);
2178     }
2179     while (*scode == OP_ALT);
2180 ph10 654
2181 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2182 ph10 503 continue;
2183 ph10 507 }
2184 ph10 170
2185 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2186    
2187     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2188     c == OP_BRAPOSZERO)
2189     {
2190     code += _pcre_OP_lengths[c];
2191     do code += GET(code, 1); while (*code == OP_ALT);
2192     c = *code;
2193     continue;
2194     }
2195    
2196     /* A nested group that is already marked as "could be empty" can just be
2197     skipped. */
2198    
2199     if (c == OP_SBRA || c == OP_SBRAPOS ||
2200     c == OP_SCBRA || c == OP_SCBRAPOS)
2201     {
2202     do code += GET(code, 1); while (*code == OP_ALT);
2203     c = *code;
2204     continue;
2205     }
2206    
2207 ph10 170 /* For other groups, scan the branches. */
2208 ph10 172
2209 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2210     c == OP_CBRA || c == OP_CBRAPOS ||
2211 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2212     c == OP_COND)
2213 nigel 77 {
2214     BOOL empty_branch;
2215     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2216 ph10 406
2217     /* If a conditional group has only one branch, there is a second, implied,
2218 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2219     Otherwise, scan the individual branches of the group. */
2220 ph10 406
2221 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2222 nigel 77 code += GET(code, 1);
2223 ph10 395 else
2224 ph10 406 {
2225 ph10 395 empty_branch = FALSE;
2226     do
2227     {
2228 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2229 ph10 395 empty_branch = TRUE;
2230     code += GET(code, 1);
2231     }
2232     while (*code == OP_ALT);
2233     if (!empty_branch) return FALSE; /* All branches are non-empty */
2234 nigel 77 }
2235 ph10 406
2236 ph10 172 c = *code;
2237 nigel 93 continue;
2238 nigel 77 }
2239    
2240 nigel 93 /* Handle the other opcodes */
2241    
2242     switch (c)
2243 nigel 77 {
2244 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2245     cannot be represented just by a bit map. This includes negated single
2246     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2247 ph10 220 actual length is stored in the compiled code, so we must update "code"
2248 ph10 216 here. */
2249 nigel 77
2250     #ifdef SUPPORT_UTF8
2251     case OP_XCLASS:
2252 ph10 216 ccode = code += GET(code, 1);
2253 nigel 77 goto CHECK_CLASS_REPEAT;
2254     #endif
2255    
2256     case OP_CLASS:
2257     case OP_NCLASS:
2258     ccode = code + 33;
2259    
2260     #ifdef SUPPORT_UTF8
2261     CHECK_CLASS_REPEAT:
2262     #endif
2263    
2264     switch (*ccode)
2265     {
2266     case OP_CRSTAR: /* These could be empty; continue */
2267     case OP_CRMINSTAR:
2268     case OP_CRQUERY:
2269     case OP_CRMINQUERY:
2270     break;
2271    
2272     default: /* Non-repeat => class must match */
2273     case OP_CRPLUS: /* These repeats aren't empty */
2274     case OP_CRMINPLUS:
2275     return FALSE;
2276    
2277     case OP_CRRANGE:
2278     case OP_CRMINRANGE:
2279     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2280     break;
2281     }
2282     break;
2283    
2284     /* Opcodes that must match a character */
2285    
2286     case OP_PROP:
2287     case OP_NOTPROP:
2288     case OP_EXTUNI:
2289     case OP_NOT_DIGIT:
2290     case OP_DIGIT:
2291     case OP_NOT_WHITESPACE:
2292     case OP_WHITESPACE:
2293     case OP_NOT_WORDCHAR:
2294     case OP_WORDCHAR:
2295     case OP_ANY:
2296 ph10 345 case OP_ALLANY:
2297 nigel 77 case OP_ANYBYTE:
2298     case OP_CHAR:
2299 ph10 602 case OP_CHARI:
2300 nigel 77 case OP_NOT:
2301 ph10 602 case OP_NOTI:
2302 nigel 77 case OP_PLUS:
2303     case OP_MINPLUS:
2304 nigel 93 case OP_POSPLUS:
2305 nigel 77 case OP_EXACT:
2306     case OP_NOTPLUS:
2307     case OP_NOTMINPLUS:
2308 nigel 93 case OP_NOTPOSPLUS:
2309 nigel 77 case OP_NOTEXACT:
2310     case OP_TYPEPLUS:
2311     case OP_TYPEMINPLUS:
2312 nigel 93 case OP_TYPEPOSPLUS:
2313 nigel 77 case OP_TYPEEXACT:
2314     return FALSE;
2315 ph10 227
2316     /* These are going to continue, as they may be empty, but we have to
2317     fudge the length for the \p and \P cases. */
2318    
2319 ph10 224 case OP_TYPESTAR:
2320     case OP_TYPEMINSTAR:
2321     case OP_TYPEPOSSTAR:
2322     case OP_TYPEQUERY:
2323     case OP_TYPEMINQUERY:
2324     case OP_TYPEPOSQUERY:
2325     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2326 ph10 227 break;
2327    
2328 ph10 224 /* Same for these */
2329 ph10 227
2330 ph10 224 case OP_TYPEUPTO:
2331     case OP_TYPEMINUPTO:
2332     case OP_TYPEPOSUPTO:
2333     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2334     break;
2335 nigel 77
2336     /* End of branch */
2337    
2338     case OP_KET:
2339     case OP_KETRMAX:
2340     case OP_KETRMIN:
2341 ph10 604 case OP_KETRPOS:
2342 nigel 77 case OP_ALT:
2343     return TRUE;
2344    
2345 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2346     MINUPTO, and POSUPTO may be followed by a multibyte character */
2347 nigel 77
2348     #ifdef SUPPORT_UTF8
2349     case OP_STAR:
2350 ph10 602 case OP_STARI:
2351 nigel 77 case OP_MINSTAR:
2352 ph10 602 case OP_MINSTARI:
2353 nigel 93 case OP_POSSTAR:
2354 ph10 602 case OP_POSSTARI:
2355 nigel 77 case OP_QUERY:
2356 ph10 602 case OP_QUERYI:
2357 nigel 77 case OP_MINQUERY:
2358 ph10 602 case OP_MINQUERYI:
2359 nigel 93 case OP_POSQUERY:
2360 ph10 602 case OP_POSQUERYI:
2361 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2362     break;
2363 ph10 461
2364 nigel 77 case OP_UPTO:
2365 ph10 602 case OP_UPTOI:
2366 nigel 77 case OP_MINUPTO:
2367 ph10 602 case OP_MINUPTOI:
2368 nigel 93 case OP_POSUPTO:
2369 ph10 602 case OP_POSUPTOI:
2370 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2371 nigel 77 break;
2372     #endif
2373 ph10 503
2374 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2375     string. */
2376    
2377     case OP_MARK:
2378     case OP_PRUNE_ARG:
2379     case OP_SKIP_ARG:
2380     code += code[1];
2381 ph10 512 break;
2382 ph10 510
2383 ph10 550 case OP_THEN_ARG:
2384 ph10 716 code += code[1];
2385 ph10 550 break;
2386    
2387 ph10 503 /* None of the remaining opcodes are required to match a character. */
2388 ph10 507
2389 ph10 503 default:
2390 ph10 507 break;
2391 nigel 77 }
2392     }
2393    
2394     return TRUE;
2395     }
2396    
2397    
2398    
2399     /*************************************************
2400     * Scan compiled regex for non-emptiness *
2401     *************************************************/
2402    
2403     /* This function is called to check for left recursive calls. We want to check
2404     the current branch of the current pattern to see if it could match the empty
2405     string. If it could, we must look outwards for branches at other levels,
2406     stopping when we pass beyond the bracket which is the subject of the recursion.
2407 ph10 654 This function is called only during the real compile, not during the
2408 ph10 624 pre-compile.
2409 nigel 77
2410     Arguments:
2411     code points to start of the recursion
2412     endcode points to where to stop (current RECURSE item)
2413     bcptr points to the chain of current (unclosed) branch starts
2414     utf8 TRUE if in UTF-8 mode
2415 ph10 507 cd pointers to tables etc
2416 nigel 77
2417     Returns: TRUE if what is matched could be empty
2418     */
2419    
2420     static BOOL
2421     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2422 ph10 503 BOOL utf8, compile_data *cd)
2423 nigel 77 {
2424 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2425 nigel 77 {
2426 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2427 ph10 475 return FALSE;
2428 nigel 77 bcptr = bcptr->outer;
2429     }
2430     return TRUE;
2431     }
2432    
2433    
2434    
2435     /*************************************************
2436     * Check for POSIX class syntax *
2437     *************************************************/
2438    
2439     /* This function is called when the sequence "[:" or "[." or "[=" is
2440 ph10 295 encountered in a character class. It checks whether this is followed by a
2441 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2442 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2443 nigel 77
2444 ph10 298 Originally, this function only recognized a sequence of letters between the
2445     terminators, but it seems that Perl recognizes any sequence of characters,
2446     though of course unknown POSIX names are subsequently rejected. Perl gives an
2447     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2448     didn't consider this to be a POSIX class. Likewise for [:1234:].
2449 ph10 295
2450 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2451     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2452     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2453     below handles the special case of \], but does not try to do any other escape
2454     processing. This makes it different from Perl for cases such as [:l\ower:]
2455 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2456 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2457 ph10 295 I think.
2458    
2459 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2460     It seems that the appearance of a nested POSIX class supersedes an apparent
2461     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2462 ph10 691 a digit.
2463 ph10 640
2464 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2465     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2466     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2467 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2468 ph10 661 names.
2469    
2470 ph10 295 Arguments:
2471 nigel 77 ptr pointer to the initial [
2472     endptr where to return the end pointer
2473    
2474     Returns: TRUE or FALSE
2475     */
2476    
2477     static BOOL
2478 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2479 nigel 77 {
2480     int terminator; /* Don't combine these lines; the Solaris cc */
2481     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2482 ph10 295 for (++ptr; *ptr != 0; ptr++)
2483 nigel 77 {
2484 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2485     ptr++;
2486 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2487 ph10 640 else
2488 ph10 298 {
2489 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2490 ph10 295 {
2491     *endptr = ptr;
2492     return TRUE;
2493 ph10 298 }
2494 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2495     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2496     ptr[1] == CHAR_EQUALS_SIGN) &&
2497     check_posix_syntax(ptr, endptr))
2498 ph10 654 return FALSE;
2499 ph10 298 }
2500     }
2501 nigel 77 return FALSE;
2502     }
2503    
2504    
2505    
2506    
2507     /*************************************************
2508     * Check POSIX class name *
2509     *************************************************/
2510    
2511     /* This function is called to check the name given in a POSIX-style class entry
2512     such as [:alnum:].
2513    
2514     Arguments:
2515     ptr points to the first letter
2516     len the length of the name
2517    
2518     Returns: a value representing the name, or -1 if unknown
2519     */
2520    
2521     static int
2522     check_posix_name(const uschar *ptr, int len)
2523     {
2524 ph10 240 const char *pn = posix_names;
2525 nigel 77 register int yield = 0;
2526     while (posix_name_lengths[yield] != 0)
2527     {
2528     if (len == posix_name_lengths[yield] &&
2529 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2530 ph10 243 pn += posix_name_lengths[yield] + 1;
2531 nigel 77 yield++;
2532     }
2533     return -1;
2534     }
2535    
2536    
2537     /*************************************************
2538     * Adjust OP_RECURSE items in repeated group *
2539     *************************************************/
2540    
2541     /* OP_RECURSE items contain an offset from the start of the regex to the group
2542     that is referenced. This means that groups can be replicated for fixed
2543     repetition simply by copying (because the recursion is allowed to refer to
2544     earlier groups that are outside the current group). However, when a group is
2545 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2546     inserted before it, after it has been compiled. This means that any OP_RECURSE
2547     items within it that refer to the group itself or any contained groups have to
2548     have their offsets adjusted. That one of the jobs of this function. Before it
2549     is called, the partially compiled regex must be temporarily terminated with
2550     OP_END.
2551 nigel 77
2552 nigel 93 This function has been extended with the possibility of forward references for
2553     recursions and subroutine calls. It must also check the list of such references
2554     for the group we are dealing with. If it finds that one of the recursions in
2555     the current group is on this list, it adjusts the offset in the list, not the
2556     value in the reference (which is a group number).
2557    
2558 nigel 77 Arguments:
2559     group points to the start of the group
2560     adjust the amount by which the group is to be moved
2561     utf8 TRUE in UTF-8 mode
2562     cd contains pointers to tables etc.
2563 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2564 nigel 77
2565     Returns: nothing
2566     */
2567    
2568     static void
2569 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2570     uschar *save_hwm)
2571 nigel 77 {
2572     uschar *ptr = group;
2573 ph10 224
2574 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2575     {
2576 nigel 93 int offset;
2577     uschar *hc;
2578    
2579     /* See if this recursion is on the forward reference list. If so, adjust the
2580     reference. */
2581 ph10 345
2582 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2583     {
2584     offset = GET(hc, 0);
2585     if (cd->start_code + offset == ptr + 1)
2586     {
2587     PUT(hc, 0, offset + adjust);
2588     break;
2589     }
2590     }
2591    
2592     /* Otherwise, adjust the recursion offset if it's after the start of this
2593     group. */
2594    
2595     if (hc >= cd->hwm)
2596     {
2597     offset = GET(ptr, 1);
2598     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2599     }
2600    
2601 nigel 77 ptr += 1 + LINK_SIZE;
2602     }
2603     }
2604    
2605    
2606    
2607     /*************************************************
2608     * Insert an automatic callout point *
2609     *************************************************/
2610    
2611     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2612     callout points before each pattern item.
2613    
2614     Arguments:
2615     code current code pointer
2616     ptr current pattern pointer
2617     cd pointers to tables etc
2618    
2619     Returns: new code pointer
2620     */
2621    
2622     static uschar *
2623     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2624     {
2625     *code++ = OP_CALLOUT;
2626     *code++ = 255;
2627 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2628     PUT(code, LINK_SIZE, 0); /* Default length */
2629 nigel 77 return code + 2*LINK_SIZE;
2630     }
2631    
2632    
2633    
2634     /*************************************************
2635     * Complete a callout item *
2636     *************************************************/
2637    
2638     /* A callout item contains the length of the next item in the pattern, which
2639     we can't fill in till after we have reached the relevant point. This is used
2640     for both automatic and manual callouts.
2641    
2642     Arguments:
2643     previous_callout points to previous callout item
2644     ptr current pattern pointer
2645     cd pointers to tables etc
2646    
2647     Returns: nothing
2648     */
2649    
2650     static void
2651     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2652     {
2653 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2654 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2655     }
2656    
2657    
2658    
2659     #ifdef SUPPORT_UCP
2660     /*************************************************
2661     * Get othercase range *
2662     *************************************************/
2663    
2664     /* This function is passed the start and end of a class range, in UTF-8 mode
2665     with UCP support. It searches up the characters, looking for internal ranges of
2666     characters in the "other" case. Each call returns the next one, updating the
2667     start address.
2668    
2669     Arguments:
2670     cptr points to starting character value; updated
2671     d end value
2672     ocptr where to put start of othercase range
2673     odptr where to put end of othercase range
2674    
2675     Yield: TRUE when range returned; FALSE when no more
2676     */
2677    
2678     static BOOL
2679 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2680     unsigned int *odptr)
2681 nigel 77 {
2682 nigel 93 unsigned int c, othercase, next;
2683 nigel 77
2684     for (c = *cptr; c <= d; c++)
2685 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2686 nigel 77
2687     if (c > d) return FALSE;
2688    
2689     *ocptr = othercase;
2690     next = othercase + 1;
2691    
2692     for (++c; c <= d; c++)
2693     {
2694 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2695 nigel 77 next++;
2696     }
2697    
2698     *odptr = next - 1;
2699     *cptr = c;
2700    
2701     return TRUE;
2702     }
2703 ph10 532
2704    
2705    
2706     /*************************************************
2707     * Check a character and a property *
2708     *************************************************/
2709    
2710     /* This function is called by check_auto_possessive() when a property item
2711     is adjacent to a fixed character.
2712    
2713     Arguments:
2714     c the character
2715     ptype the property type
2716     pdata the data for the type
2717     negated TRUE if it's a negated property (\P or \p{^)
2718 ph10 535
2719 ph10 532 Returns: TRUE if auto-possessifying is OK
2720 ph10 535 */
2721 ph10 532
2722     static BOOL
2723     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2724     {
2725     const ucd_record *prop = GET_UCD(c);
2726     switch(ptype)
2727     {
2728     case PT_LAMP:
2729     return (prop->chartype == ucp_Lu ||
2730     prop->chartype == ucp_Ll ||
2731     prop->chartype == ucp_Lt) == negated;
2732    
2733     case PT_GC:
2734     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2735    
2736     case PT_PC:
2737     return (pdata == prop->chartype) == negated;
2738    
2739     case PT_SC:
2740     return (pdata == prop->script) == negated;
2741    
2742     /* These are specials */
2743    
2744     case PT_ALNUM:
2745     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2746     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2747    
2748     case PT_SPACE: /* Perl space */
2749     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2750     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2751     == negated;
2752    
2753     case PT_PXSPACE: /* POSIX space */
2754     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2755     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2756     c == CHAR_FF || c == CHAR_CR)
2757     == negated;
2758    
2759     case PT_WORD:
2760     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2761     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2762     c == CHAR_UNDERSCORE) == negated;
2763     }
2764 ph10 535 return FALSE;
2765 ph10 532 }
2766 nigel 77 #endif /* SUPPORT_UCP */
2767    
2768    
2769 nigel 93
2770 nigel 77 /*************************************************
2771 nigel 93 * Check if auto-possessifying is possible *
2772     *************************************************/
2773    
2774     /* This function is called for unlimited repeats of certain items, to see
2775     whether the next thing could possibly match the repeated item. If not, it makes
2776     sense to automatically possessify the repeated item.
2777    
2778     Arguments:
2779 ph10 532 previous pointer to the repeated opcode
2780 nigel 93 utf8 TRUE in UTF-8 mode
2781     ptr next character in pattern
2782     options options bits
2783     cd contains pointers to tables etc.
2784    
2785     Returns: TRUE if possessifying is wanted
2786     */
2787    
2788     static BOOL
2789 ph10 535 check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2790 ph10 532 int options, compile_data *cd)
2791 nigel 93 {
2792 ph10 532 int c, next;
2793     int op_code = *previous++;
2794 nigel 93
2795     /* Skip whitespace and comments in extended mode */
2796    
2797     if ((options & PCRE_EXTENDED) != 0)
2798     {
2799     for (;;)
2800     {
2801     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2802 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2803 nigel 93 {
2804 ph10 579 ptr++;
2805 ph10 556 while (*ptr != 0)
2806     {
2807 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2808 ph10 556 ptr++;
2809 ph10 579 #ifdef SUPPORT_UTF8
2810 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2811     #endif
2812     }
2813 nigel 93 }
2814     else break;
2815     }
2816     }
2817    
2818     /* If the next item is one that we can handle, get its value. A non-negative
2819     value is a character, a negative value is an escape value. */
2820    
2821 ph10 391 if (*ptr == CHAR_BACKSLASH)
2822 nigel 93 {
2823     int temperrorcode = 0;
2824     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2825     if (temperrorcode != 0) return FALSE;
2826     ptr++; /* Point after the escape sequence */
2827     }
2828    
2829     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2830     {
2831     #ifdef SUPPORT_UTF8
2832     if (utf8) { GETCHARINC(next, ptr); } else
2833     #endif
2834     next = *ptr++;
2835     }
2836    
2837     else return FALSE;
2838    
2839     /* Skip whitespace and comments in extended mode */
2840    
2841     if ((options & PCRE_EXTENDED) != 0)
2842     {
2843     for (;;)
2844     {
2845     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2846 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2847 nigel 93 {
2848 ph10 579 ptr++;
2849 ph10 556 while (*ptr != 0)
2850     {
2851 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2852 ph10 556 ptr++;
2853 ph10 579 #ifdef SUPPORT_UTF8
2854 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2855     #endif
2856     }
2857 nigel 93 }
2858     else break;
2859     }
2860     }
2861    
2862     /* If the next thing is itself optional, we have to give up. */
2863    
2864 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2865 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2866     return FALSE;
2867 nigel 93
2868 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2869     the next item is a character. */
2870 nigel 93
2871     if (next >= 0) switch(op_code)
2872     {
2873     case OP_CHAR:
2874 ph10 535 #ifdef SUPPORT_UTF8
2875 ph10 532 GETCHARTEST(c, previous);
2876 ph10 369 #else
2877 ph10 532 c = *previous;
2878 ph10 535 #endif
2879     return c != next;
2880 nigel 93
2881 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2882 nigel 93 Unicode property support, we can use it to test the other case of
2883     high-valued characters. */
2884    
2885 ph10 602 case OP_CHARI:
2886 ph10 535 #ifdef SUPPORT_UTF8
2887 ph10 532 GETCHARTEST(c, previous);
2888     #else
2889     c = *previous;
2890 ph10 535 #endif
2891 ph10 532 if (c == next) return FALSE;
2892 nigel 93 #ifdef SUPPORT_UTF8
2893     if (utf8)
2894     {
2895     unsigned int othercase;
2896     if (next < 128) othercase = cd->fcc[next]; else
2897     #ifdef SUPPORT_UCP
2898 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2899 nigel 93 #else
2900     othercase = NOTACHAR;
2901     #endif
2902 ph10 532 return (unsigned int)c != othercase;
2903 nigel 93 }
2904     else
2905     #endif /* SUPPORT_UTF8 */
2906 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2907 nigel 93
2908 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2909 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2910 ph10 602 an XCLASS instead. */
2911 nigel 93
2912     case OP_NOT:
2913 ph10 602 return (c = *previous) == next;
2914 ph10 604
2915     case OP_NOTI:
2916 ph10 532 if ((c = *previous) == next) return TRUE;
2917 nigel 93 #ifdef SUPPORT_UTF8
2918     if (utf8)
2919     {
2920     unsigned int othercase;
2921     if (next < 128) othercase = cd->fcc[next]; else
2922     #ifdef SUPPORT_UCP
2923 ph10 349 othercase = UCD_OTHERCASE(next);
2924 nigel 93 #else
2925     othercase = NOTACHAR;
2926     #endif
2927 ph10 532 return (unsigned int)c == othercase;
2928 nigel 93 }
2929     else
2930     #endif /* SUPPORT_UTF8 */
2931 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2932 nigel 93
2933 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2934     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2935    
2936 nigel 93 case OP_DIGIT:
2937     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2938    
2939     case OP_NOT_DIGIT:
2940     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2941    
2942     case OP_WHITESPACE:
2943     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2944    
2945     case OP_NOT_WHITESPACE:
2946     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2947    
2948     case OP_WORDCHAR:
2949     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2950    
2951     case OP_NOT_WORDCHAR:
2952     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2953    
2954 ph10 180 case OP_HSPACE:
2955     case OP_NOT_HSPACE:
2956     switch(next)
2957     {
2958     case 0x09:
2959     case 0x20:
2960     case 0xa0:
2961     case 0x1680:
2962     case 0x180e:
2963     case 0x2000:
2964     case 0x2001:
2965     case 0x2002:
2966     case 0x2003:
2967     case 0x2004:
2968     case 0x2005:
2969     case 0x2006:
2970     case 0x2007:
2971     case 0x2008:
2972     case 0x2009:
2973     case 0x200A:
2974     case 0x202f:
2975     case 0x205f:
2976     case 0x3000:
2977 ph10 528 return op_code == OP_NOT_HSPACE;
2978 ph10 180 default:
2979 ph10 528 return op_code != OP_NOT_HSPACE;
2980 ph10 180 }
2981    
2982 ph10 528 case OP_ANYNL:
2983 ph10 180 case OP_VSPACE:
2984     case OP_NOT_VSPACE:
2985     switch(next)
2986     {
2987     case 0x0a:
2988     case 0x0b:
2989     case 0x0c:
2990     case 0x0d:
2991     case 0x85:
2992     case 0x2028:
2993     case 0x2029:
2994 ph10 528 return op_code == OP_NOT_VSPACE;
2995 ph10 180 default:
2996 ph10 528 return op_code != OP_NOT_VSPACE;
2997 ph10 180 }
2998    
2999 ph10 532 #ifdef SUPPORT_UCP
3000     case OP_PROP:
3001     return check_char_prop(next, previous[0], previous[1], FALSE);
3002 ph10 535
3003 ph10 532 case OP_NOTPROP:
3004     return check_char_prop(next, previous[0], previous[1], TRUE);
3005     #endif
3006    
3007 nigel 93 default:
3008     return FALSE;
3009     }
3010    
3011    
3012 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3013     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3014     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3015     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3016 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3017 nigel 93
3018     switch(op_code)
3019     {
3020     case OP_CHAR:
3021 ph10 602 case OP_CHARI:
3022 ph10 535 #ifdef SUPPORT_UTF8
3023 ph10 532 GETCHARTEST(c, previous);
3024     #else
3025     c = *previous;
3026 ph10 535 #endif
3027 nigel 93 switch(-next)
3028     {
3029     case ESC_d:
3030 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3031 nigel 93
3032     case ESC_D:
3033 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3034 nigel 93
3035     case ESC_s:
3036 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3037 nigel 93
3038     case ESC_S:
3039 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3040 nigel 93
3041     case ESC_w:
3042 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3043 nigel 93
3044     case ESC_W:
3045 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3046 ph10 182
3047 ph10 180 case ESC_h:
3048     case ESC_H:
3049 ph10 532 switch(c)
3050 ph10 180 {
3051     case 0x09:
3052     case 0x20:
3053     case 0xa0:
3054     case 0x1680:
3055     case 0x180e:
3056     case 0x2000:
3057     case 0x2001:
3058     case 0x2002:
3059     case 0x2003:
3060     case 0x2004:
3061     case 0x2005:
3062     case 0x2006:
3063     case 0x2007:
3064     case 0x2008:
3065     case 0x2009:
3066     case 0x200A:
3067     case 0x202f:
3068     case 0x205f:
3069     case 0x3000:
3070     return -next != ESC_h;
3071     default:
3072     return -next == ESC_h;
3073 ph10 182 }
3074    
3075 ph10 180 case ESC_v:
3076     case ESC_V:
3077 ph10 532 switch(c)
3078 ph10 180 {
3079     case 0x0a:
3080     case 0x0b:
3081     case 0x0c:
3082     case 0x0d:
3083     case 0x85:
3084     case 0x2028:
3085     case 0x2029:
3086     return -next != ESC_v;
3087     default:
3088     return -next == ESC_v;
3089 ph10 182 }
3090 ph10 535
3091     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3092     their substitutions and process them. The result will always be either
3093 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3094 ph10 535
3095 ph10 532 #ifdef SUPPORT_UCP
3096     case ESC_du:
3097     case ESC_DU:
3098     case ESC_wu:
3099     case ESC_WU:
3100     case ESC_su:
3101     case ESC_SU:
3102     {
3103     int temperrorcode = 0;
3104     ptr = substitutes[-next - ESC_DU];
3105     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3106     if (temperrorcode != 0) return FALSE;
3107     ptr++; /* For compatibility */
3108     }
3109 ph10 535 /* Fall through */
3110 nigel 93
3111 ph10 532 case ESC_p:
3112     case ESC_P:
3113     {
3114     int ptype, pdata, errorcodeptr;
3115 ph10 535 BOOL negated;
3116    
3117 ph10 532 ptr--; /* Make ptr point at the p or P */
3118     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3119     if (ptype < 0) return FALSE;
3120     ptr++; /* Point past the final curly ket */
3121 ph10 535
3122 ph10 532 /* If the property item is optional, we have to give up. (When generated
3123     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3124     to the original \d etc. At this point, ptr will point to a zero byte. */
3125 ph10 535
3126 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3127     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3128     return FALSE;
3129 ph10 535
3130 ph10 532 /* Do the property check. */
3131 ph10 535
3132 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3133 ph10 535 }
3134 ph10 532 #endif
3135    
3136 nigel 93 default:
3137     return FALSE;
3138     }
3139    
3140 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3141     well. It means re-organizing the above code so as to get hold of the property
3142     values before switching on the op-code. However, I wonder how many patterns
3143     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3144     these op-codes are never generated.) */
3145    
3146 nigel 93 case OP_DIGIT:
3147 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3148 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3149 nigel 93
3150     case OP_NOT_DIGIT:
3151     return next == -ESC_d;
3152    
3153     case OP_WHITESPACE:
3154 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3155 nigel 93
3156     case OP_NOT_WHITESPACE:
3157 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3158 nigel 93
3159 ph10 180 case OP_HSPACE:
3160 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3161 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3162 ph10 180
3163     case OP_NOT_HSPACE:
3164     return next == -ESC_h;
3165 ph10 182
3166 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3167 ph10 535 case OP_ANYNL:
3168 ph10 182 case OP_VSPACE:
3169 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3170    
3171     case OP_NOT_VSPACE:
3172 ph10 528 return next == -ESC_v || next == -ESC_R;
3173 ph10 180
3174 nigel 93 case OP_WORDCHAR:
3175 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3176 ph10 528 next == -ESC_v || next == -ESC_R;
3177 nigel 93
3178     case OP_NOT_WORDCHAR:
3179     return next == -ESC_w || next == -ESC_d;
3180 ph10 182
3181 nigel 93 default:
3182     return FALSE;
3183     }
3184    
3185     /* Control does not reach here */
3186     }
3187    
3188    
3189    
3190     /*************************************************
3191 nigel 77 * Compile one branch *
3192     *************************************************/
3193    
3194 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3195 nigel 77 changed during the branch, the pointer is used to change the external options
3196 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3197     to find out the amount of memory needed, as well as during the real compile
3198     phase. The value of lengthptr distinguishes the two phases.
3199 nigel 77
3200     Arguments:
3201     optionsptr pointer to the option bits
3202     codeptr points to the pointer to the current code point
3203     ptrptr points to the current pattern pointer
3204     errorcodeptr points to error code variable
3205     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3206     reqbyteptr set to the last literal character required, else < 0
3207     bcptr points to current branch chain
3208 ph10 654 cond_depth conditional nesting depth
3209 nigel 77 cd contains pointers to tables etc.
3210 nigel 93 lengthptr NULL during the real compile phase
3211     points to length accumulator during pre-compile phase
3212 nigel 77
3213     Returns: TRUE on success
3214     FALSE, with *errorcodeptr set non-zero on error
3215     */
3216    
3217     static BOOL
3218 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
3219     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
3220 ph10 642 int cond_depth, compile_data *cd, int *lengthptr)
3221 nigel 77 {
3222     int repeat_type, op_type;
3223     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3224     int bravalue = 0;
3225     int greedy_default, greedy_non_default;
3226     int firstbyte, reqbyte;
3227     int zeroreqbyte, zerofirstbyte;
3228     int req_caseopt, reqvary, tempreqvary;
3229 ph10 635 int options = *optionsptr; /* May change dynamically */
3230 nigel 77 int after_manual_callout = 0;
3231 nigel 93 int length_prevgroup = 0;
3232 nigel 77 register int c;
3233     register uschar *code = *codeptr;
3234 nigel 93 uschar *last_code = code;
3235     uschar *orig_code = code;
3236 nigel 77 uschar *tempcode;
3237     BOOL inescq = FALSE;
3238     BOOL groupsetfirstbyte = FALSE;
3239     const uschar *ptr = *ptrptr;
3240     const uschar *tempptr;
3241 ph10 518 const uschar *nestptr = NULL;
3242 nigel 77 uschar *previous = NULL;
3243     uschar *previous_callout = NULL;
3244 nigel 93 uschar *save_hwm = NULL;
3245 nigel 77 uschar classbits[32];
3246    
3247 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3248 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3249 ph10 635 dynamically as we process the pattern. */
3250    
3251 nigel 77 #ifdef SUPPORT_UTF8
3252     BOOL class_utf8;
3253     BOOL utf8 = (options & PCRE_UTF8) != 0;
3254     uschar *class_utf8data;
3255 ph10 300 uschar *class_utf8data_base;
3256 nigel 77 uschar utf8_char[6];
3257     #else
3258     BOOL utf8 = FALSE;
3259     #endif
3260    
3261 ph10 475 #ifdef PCRE_DEBUG
3262 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3263     #endif
3264    
3265 nigel 77 /* Set up the default and non-default settings for greediness */
3266    
3267     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3268     greedy_non_default = greedy_default ^ 1;
3269    
3270     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3271     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3272     matches a non-fixed char first char; reqbyte just remains unset if we never
3273     find one.
3274    
3275     When we hit a repeat whose minimum is zero, we may have to adjust these values
3276     to take the zero repeat into account. This is implemented by setting them to
3277     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3278     item types that can be repeated set these backoff variables appropriately. */
3279    
3280     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3281    
3282     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3283     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3284     value > 255. It is added into the firstbyte or reqbyte variables to record the
3285     case status of the value. This is used only for ASCII characters. */
3286    
3287     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3288    
3289     /* Switch on next character until the end of the branch */
3290    
3291     for (;; ptr++)
3292     {
3293     BOOL negate_class;
3294 ph10 286 BOOL should_flip_negation;
3295 nigel 77 BOOL possessive_quantifier;
3296     BOOL is_quantifier;
3297 nigel 93 BOOL is_recurse;
3298 ph10 180 BOOL reset_bracount;
3299 nigel 77 int class_charcount;
3300     int class_lastchar;
3301     int newoptions;
3302     int recno;
3303 ph10 172 int refsign;
3304 nigel 77 int skipbytes;
3305     int subreqbyte;
3306     int subfirstbyte;
3307 nigel 93 int terminator;
3308 nigel 77 int mclength;
3309 ph10 733 int tempbracount;
3310 nigel 77 uschar mcbuffer[8];
3311    
3312 nigel 93 /* Get next byte in the pattern */
3313 nigel 77
3314     c = *ptr;
3315 ph10 345
3316 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3317 ph10 518 string. Nesting only happens one level deep. */
3318    
3319     if (c == 0 && nestptr != NULL)
3320     {
3321     ptr = nestptr;
3322     nestptr = NULL;
3323     c = *ptr;
3324     }
3325    
3326 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3327     previous cycle of this loop. */
3328    
3329     if (lengthptr != NULL)
3330     {
3331 ph10 475 #ifdef PCRE_DEBUG
3332 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3333     #endif
3334 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3335 nigel 93 {
3336     *errorcodeptr = ERR52;
3337     goto FAILED;
3338     }
3339    
3340     /* There is at least one situation where code goes backwards: this is the
3341     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3342     the class is simply eliminated. However, it is created first, so we have to
3343     allow memory for it. Therefore, don't ever reduce the length at this point.
3344     */
3345    
3346     if (code < last_code) code = last_code;
3347 ph10 202
3348     /* Paranoid check for integer overflow */
3349    
3350     if (OFLOW_MAX - *lengthptr < code - last_code)
3351     {
3352     *errorcodeptr = ERR20;
3353     goto FAILED;
3354     }
3355    
3356 ph10 530 *lengthptr += (int)(code - last_code);
3357 ph10 751 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3358     c));
3359 nigel 93
3360     /* If "previous" is set and it is not at the start of the work space, move
3361     it back to there, in order to avoid filling up the work space. Otherwise,
3362     if "previous" is NULL, reset the current code pointer to the start. */
3363    
3364     if (previous != NULL)
3365     {
3366     if (previous > orig_code)
3367     {
3368     memmove(orig_code, previous, code - previous);
3369     code -= previous - orig_code;
3370     previous = orig_code;
3371     }
3372     }
3373     else code = orig_code;
3374    
3375     /* Remember where this code item starts so we can pick up the length
3376     next time round. */
3377    
3378     last_code = code;
3379     }
3380    
3381     /* In the real compile phase, just check the workspace used by the forward
3382     reference list. */
3383    
3384 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3385 nigel 93 {
3386     *errorcodeptr = ERR52;
3387     goto FAILED;
3388     }
3389    
3390 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3391    
3392     if (inescq && c != 0)
3393     {
3394 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3395 nigel 77 {
3396     inescq = FALSE;
3397     ptr++;
3398     continue;
3399     }
3400     else
3401     {
3402     if (previous_callout != NULL)
3403     {
3404 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3405     complete_callout(previous_callout, ptr, cd);
3406 nigel 77 previous_callout = NULL;
3407     }
3408     if ((options & PCRE_AUTO_CALLOUT) != 0)
3409     {
3410     previous_callout = code;
3411     code = auto_callout(code, ptr, cd);
3412     }
3413     goto NORMAL_CHAR;
3414     }
3415     }
3416    
3417     /* Fill in length of a previous callout, except when the next thing is
3418     a quantifier. */
3419    
3420 ph10 392 is_quantifier =
3421 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3422     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3423 nigel 77
3424     if (!is_quantifier && previous_callout != NULL &&
3425     after_manual_callout-- <= 0)
3426     {
3427 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3428     complete_callout(previous_callout, ptr, cd);
3429 nigel 77 previous_callout = NULL;
3430     }
3431    
3432 ph10 635 /* In extended mode, skip white space and comments. */
3433 nigel 77
3434     if ((options & PCRE_EXTENDED) != 0)
3435     {
3436     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3437 ph10 391 if (c == CHAR_NUMBER_SIGN)
3438 nigel 77 {
3439 ph10 579 ptr++;
3440 ph10 556 while (*ptr != 0)
3441 nigel 91 {
3442 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3443 ph10 556 ptr++;
3444 ph10 579 #ifdef SUPPORT_UTF8
3445 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3446     #endif
3447 nigel 91 }
3448 nigel 93 if (*ptr != 0) continue;
3449    
3450 nigel 91 /* Else fall through to handle end of string */
3451     c = 0;
3452 nigel 77 }
3453     }
3454    
3455     /* No auto callout for quantifiers. */
3456    
3457     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3458     {
3459     previous_callout = code;
3460     code = auto_callout(code, ptr, cd);
3461     }
3462    
3463     switch(c)
3464     {
3465 nigel 93 /* ===================================================================*/
3466     case 0: /* The branch terminates at string end */
3467 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3468     case CHAR_RIGHT_PARENTHESIS:
3469 nigel 77 *firstbyteptr = firstbyte;
3470     *reqbyteptr = reqbyte;
3471     *codeptr = code;
3472     *ptrptr = ptr;
3473 nigel 93 if (lengthptr != NULL)
3474     {
3475 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3476     {
3477     *errorcodeptr = ERR20;
3478     goto FAILED;
3479     }
3480 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3481 nigel 93 DPRINTF((">> end branch\n"));
3482     }
3483 nigel 77 return TRUE;
3484    
3485 nigel 93
3486     /* ===================================================================*/
3487 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3488     the setting of any following char as a first character. */
3489    
3490 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3491 ph10 602 previous = NULL;
3492 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3493     {
3494     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3495 ph10 602 *code++ = OP_CIRCM;
3496 nigel 77 }
3497 ph10 602 else *code++ = OP_CIRC;
3498 nigel 77 break;
3499    
3500 ph10 391 case CHAR_DOLLAR_SIGN:
3501 nigel 77 previous = NULL;
3502 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3503 nigel 77 break;
3504    
3505     /* There can never be a first char if '.' is first, whatever happens about
3506     repeats. The value of reqbyte doesn't change either. */
3507    
3508 ph10 391 case CHAR_DOT:
3509 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3510     zerofirstbyte = firstbyte;
3511     zeroreqbyte = reqbyte;
3512     previous = code;
3513 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3514 nigel 77 break;
3515    
3516 nigel 93
3517     /* ===================================================================*/
3518 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3519     32-byte bitmap of the permitted characters, except in the special case
3520     where there is only one such character. For negated classes, we build the
3521     map as usual, then invert it at the end. However, we use a different opcode
3522     so that data characters > 255 can be handled correctly.
3523 nigel 77
3524     If the class contains characters outside the 0-255 range, a different
3525     opcode is compiled. It may optionally have a bit map for characters < 256,
3526     but those above are explicitly listed afterwards. A flag byte tells
3527     whether the bitmap is present, and whether this is a negated class or not.
3528 ph10 345
3529 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3530     default (Perl) mode, it is treated as a data character. */
3531 ph10 345
3532 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3533 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3534     {
3535     *errorcodeptr = ERR64;
3536 ph10 345 goto FAILED;
3537 ph10 336 }
3538 ph10 345 goto NORMAL_CHAR;
3539 nigel 77
3540 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3541 nigel 77 previous = code;
3542    
3543     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3544     they are encountered at the top level, so we'll do that too. */
3545    
3546 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3547 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3548 ph10 295 check_posix_syntax(ptr, &tempptr))
3549 nigel 77 {
3550 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3551 nigel 77 goto FAILED;
3552     }
3553    
3554 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3555 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3556 ph10 205 skip them too. This makes for compatibility with Perl. */
3557 ph10 208
3558 ph10 205 negate_class = FALSE;
3559     for (;;)
3560 nigel 77 {
3561     c = *(++ptr);
3562 ph10 391 if (c == CHAR_BACKSLASH)
3563 ph10 205 {
3564 ph10 392 if (ptr[1] == CHAR_E)
3565 ph10 391 ptr++;
3566 ph10 392 else if (strncmp((const char *)ptr+1,
3567     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3568 ph10 391 ptr += 3;
3569 ph10 392 else
3570 ph10 391 break;
3571 ph10 205 }
3572 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3573 ph10 205 negate_class = TRUE;
3574     else break;
3575 ph10 208 }
3576 ph10 345
3577     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3578     an initial ']' is taken as a data character -- the code below handles
3579 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3580     [^] must match any character, so generate OP_ALLANY. */
3581 ph10 345
3582 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3583 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3584 ph10 341 {
3585     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3586     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3587     zerofirstbyte = firstbyte;
3588     break;
3589 ph10 345 }
3590 nigel 77
3591 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3592     negation flag at the end, so that support for characters > 255 works
3593 ph10 264 correctly (they are all included in the class). */
3594    
3595     should_flip_negation = FALSE;
3596    
3597 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3598 nigel 93 of just a single character (as long as it's < 256). However, for higher
3599     valued UTF-8 characters, we don't yet do any optimization. */
3600 nigel 77
3601     class_charcount = 0;
3602     class_lastchar = -1;
3603    
3604 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3605     temporary bit of memory, in case the class contains only 1 character (less
3606     than 256), because in that case the compiled code doesn't use the bit map.
3607     */
3608    
3609     memset(classbits, 0, 32 * sizeof(uschar));
3610    
3611 nigel 77 #ifdef SUPPORT_UTF8
3612     class_utf8 = FALSE; /* No chars >= 256 */
3613 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3614 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3615 nigel 77 #endif
3616    
3617     /* Process characters until ] is reached. By writing this as a "do" it
3618 nigel 93 means that an initial ] is taken as a data character. At the start of the
3619     loop, c contains the first byte of the character. */
3620 nigel 77
3621 nigel 93 if (c != 0) do
3622 nigel 77 {
3623 nigel 93 const uschar *oldptr;
3624    
3625 nigel 77 #ifdef SUPPORT_UTF8
3626     if (utf8 && c > 127)
3627     { /* Braces are required because the */
3628     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3629     }
3630 ph10 535
3631 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3632 ph10 309 data and reset the pointer. This is so that very large classes that
3633 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3634 ph10 309 (which is on the stack). */
3635    
3636 ph10 300 if (lengthptr != NULL)
3637     {
3638     *lengthptr += class_utf8data - class_utf8data_base;
3639 ph10 309 class_utf8data = class_utf8data_base;
3640     }
3641    
3642 nigel 77 #endif
3643    
3644     /* Inside \Q...\E everything is literal except \E */
3645    
3646     if (inescq)
3647     {
3648 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3649 nigel 77 {
3650 nigel 93 inescq = FALSE; /* Reset literal state */
3651     ptr++; /* Skip the 'E' */
3652     continue; /* Carry on with next */
3653 nigel 77 }
3654 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3655 nigel 77 }
3656    
3657     /* Handle POSIX class names. Perl allows a negation extension of the
3658     form [:^name:]. A square bracket that doesn't match the syntax is
3659     treated as a literal. We also recognize the POSIX constructions
3660     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3661     5.6 and 5.8 do. */
3662    
3663 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3664 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3665 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3666 nigel 77 {
3667     BOOL local_negate = FALSE;
3668 nigel 87 int posix_class, taboffset, tabopt;
3669 nigel 77 register const uschar *cbits = cd->cbits;
3670 nigel 87 uschar pbits[32];
3671 nigel 77
3672 ph10 391 if (ptr[1] != CHAR_COLON)
3673 nigel 77 {
3674     *errorcodeptr = ERR31;
3675     goto FAILED;
3676     }
3677    
3678     ptr += 2;
3679 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3680 nigel 77 {
3681     local_negate = TRUE;
3682 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3683 nigel 77 ptr++;
3684     }
3685    
3686 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3687 nigel 77 if (posix_class < 0)
3688     {
3689     *errorcodeptr = ERR30;
3690     goto FAILED;
3691     }
3692    
3693     /* If matching is caseless, upper and lower are converted to
3694     alpha. This relies on the fact that the class table starts with
3695     alpha, lower, upper as the first 3 entries. */
3696    
3697     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3698     posix_class = 0;
3699 ph10 535
3700     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3701 ph10 518 different escape sequences that use Unicode properties. */
3702 ph10 535
3703 ph10 518 #ifdef SUPPORT_UCP
3704     if ((options & PCRE_UCP) != 0)
3705     {
3706     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3707     if (posix_substitutes[pc] != NULL)
3708     {
3709 ph10 535 nestptr = tempptr + 1;
3710 ph10 518 ptr = posix_substitutes[pc] - 1;
3711 ph10 535 continue;
3712     }
3713     }
3714     #endif
3715 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3716     chunk of local store because we may be adding and subtracting from it,
3717     and we don't want to subtract bits that may be in the main map already.
3718     At the end we or the result into the bit map that is being built. */
3719 nigel 77
3720     posix_class *= 3;
3721 nigel 87
3722     /* Copy in the first table (always present) */
3723    
3724     memcpy(pbits, cbits + posix_class_maps[posix_class],
3725     32 * sizeof(uschar));
3726    
3727     /* If there is a second table, add or remove it as required. */
3728    
3729     taboffset = posix_class_maps[posix_class + 1];
3730     tabopt = posix_class_maps[posix_class + 2];
3731    
3732     if (taboffset >= 0)
3733 nigel 77 {
3734 nigel 87 if (tabopt >= 0)
3735     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3736 nigel 77 else
3737 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3738 nigel 77 }
3739    
3740 nigel 87 /* Now see if we need to remove any special characters. An option
3741     value of 1 removes vertical space and 2 removes underscore. */
3742    
3743     if (tabopt < 0) tabopt = -tabopt;
3744     if (tabopt == 1) pbits[1] &= ~0x3c;
3745     else if (tabopt == 2) pbits[11] &= 0x7f;
3746    
3747     /* Add the POSIX table or its complement into the main table that is
3748     being built and we are done. */
3749    
3750     if (local_negate)
3751     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3752     else
3753     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3754    
3755 nigel 77 ptr = tempptr + 1;
3756     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3757     continue; /* End of POSIX syntax handling */
3758     }
3759    
3760     /* Backslash may introduce a single character, or it may introduce one
3761 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3762 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3763     assume that other escapes have more than one character in them, so set
3764     class_charcount bigger than one. Unrecognized escapes fall through and
3765     are either treated as literal characters (by default), or are faulted if
3766     PCRE_EXTRA is set. */
3767 nigel 77
3768 ph10 391 if (c == CHAR_BACKSLASH)
3769 nigel 77 {
3770 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3771     if (*errorcodeptr != 0) goto FAILED;
3772 nigel 77
3773 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3774 ph10 758 else if (-c == ESC_N) /* \N is not supported in a class */
3775     {
3776     *errorcodeptr = ERR71;
3777     goto FAILED;
3778     }
3779 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3780     {
3781 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3782 nigel 77 {
3783     ptr += 2; /* avoid empty string */
3784     }
3785     else inescq = TRUE;
3786     continue;
3787     }
3788 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3789 nigel 77
3790     if (c < 0)
3791     {
3792     register const uschar *cbits = cd->cbits;
3793     class_charcount += 2; /* Greater than 1 is what matters */
3794 nigel 93
3795 ph10 518 switch (-c)
3796 nigel 77 {
3797 ph10 518 #ifdef SUPPORT_UCP
3798     case ESC_du: /* These are the values given for \d etc */
3799     case ESC_DU: /* when PCRE_UCP is set. We replace the */
3800     case ESC_wu: /* escape sequence with an appropriate \p */
3801     case ESC_WU: /* or \P to test Unicode properties instead */
3802     case ESC_su: /* of the default ASCII testing. */
3803     case ESC_SU:
3804     nestptr = ptr;
3805     ptr = substitutes[-c - ESC_DU] - 1; /* Just before substitute */
3806 ph10 535 class_charcount -= 2; /* Undo! */
3807 ph10 518 continue;
3808     #endif
3809 nigel 77 case ESC_d:
3810     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3811     continue;
3812    
3813     case ESC_D:
3814 ph10 286 should_flip_negation = TRUE;
3815 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3816     continue;
3817    
3818     case ESC_w:
3819     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3820     continue;
3821    
3822     case ESC_W:
3823 ph10 286 should_flip_negation = TRUE;
3824 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3825     continue;
3826    
3827 ph10 552 /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3828 ph10 579 if it was previously set by something earlier in the character
3829     class. */
3830 ph10 552
3831 nigel 77 case ESC_s:
3832 ph10 552 classbits[0] |= cbits[cbit_space];
3833 ph10 579 classbits[1] |= cbits[cbit_space+1] & ~0x08;
3834 ph10 552 for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3835 nigel 77 continue;
3836    
3837     case ESC_S:
3838 ph10 286 should_flip_negation = TRUE;
3839 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3840     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3841     continue;
3842    
3843 ph10 518 case ESC_h:
3844 ph10 178 SETBIT(classbits, 0x09); /* HT */
3845     SETBIT(classbits, 0x20); /* SPACE */
3846 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
3847 ph10 178 #ifdef SUPPORT_UTF8
3848     if (utf8)
3849 ph10 180 {
3850 ph10 178 class_utf8 = TRUE;
3851     *class_utf8data++ = XCL_SINGLE;
3852 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3853 ph10 178 *class_utf8data++ = XCL_SINGLE;
3854 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3855     *class_utf8data++ = XCL_RANGE;
3856     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3857     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3858 ph10 178 *class_utf8data++ = XCL_SINGLE;
3859 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3860 ph10 178 *class_utf8data++ = XCL_SINGLE;
3861 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3862 ph10 178 *class_utf8data++ = XCL_SINGLE;
3863 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3864     }
3865     #endif
3866     continue;
3867 nigel 93
3868 ph10 518 case ESC_H:
3869 ph10 178 for (c = 0; c < 32; c++)
3870     {
3871     int x = 0xff;
3872     switch (c)
3873 ph10 180 {
3874 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3875     case 0x20/8: x ^= 1 << (0x20%8); break;
3876     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3877     default: break;
3878     }
3879     classbits[c] |= x;
3880 ph10 180 }
3881    
3882 ph10 178 #ifdef SUPPORT_UTF8
3883     if (utf8)
3884 ph10 180 {
3885 ph10 178 class_utf8 = TRUE;
3886 ph10 180 *class_utf8data++ = XCL_RANGE;
3887     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3888     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3889     *class_utf8data++ = XCL_RANGE;
3890     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3891     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3892     *class_utf8data++ = XCL_RANGE;
3893     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3894     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3895     *class_utf8data++ = XCL_RANGE;
3896     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3897     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3898     *class_utf8data++ = XCL_RANGE;
3899     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3900     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3901     *class_utf8data++ = XCL_RANGE;
3902     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3903     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3904     *class_utf8data++ = XCL_RANGE;
3905     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3906     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3907     }
3908     #endif
3909