/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 505 - (hide annotations) (download)
Tue Mar 9 16:50:47 2010 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 223308 byte(s)
Improve compile-time overrun checking.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 475 Copyright (c) 1997-2010 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
65 ph10 178 /* Macro for setting individual bits in class bitmaps. */
66    
/* Set bit number b in the byte-array bitmap a. Both parameters and the whole
expansion are parenthesized so that an argument such as c+1 is divided,
reduced and used as an index as a unit, and so that the macro can be used
inside a larger expression. Note that b is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 505 /* The overrun tests check for a slightly smaller size so that they detect the
96     overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
/* Indexed by (c - CHAR_0) in check_escape(). Each line holds the entries for
two consecutive characters: the first five lines cover the digits '0' to '9',
CHAR_COMMERCIAL_AT shares a line with 'A', and CHAR_GRAVE_ACCENT shares a line
with 'a' (whose entry, 7, is a plain data value). The last line holds the
single entry for 'z'. */
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 392 0, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
/* Indexed by (c - 0x48) in check_escape(). The hex number in the comment at
the start of each row is the character code of that row's first entry; each
row holds eight entries.
NOTE(review): the fourth entry of row 90 is the literal 'l', whereas the
ASCII table has 0 for 'l'. With a non-zero entry the lookup returns at once,
so the CHAR_l case that sets ERR37 would not be reached on this path -
confirm whether that difference is intended. */
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
/* One entry of the verbs[] table, describing a single verb name. */
190     typedef struct verbitem {
191     int len;   /* length of the verb's name as held in verbnames */
192     int op;    /* OP_xxx value paired with that name */
193 ph10 211 } verbitem;
194 ph10 210
/* The verb names, concatenated in the same order as the entries of verbs[].
NOTE(review): the STRING_xxx0 macros are defined outside this file;
presumably each one supplies the zero byte that separates its name from the
next one - confirm in the header that defines them. */
195 ph10 240 static const char verbnames[] =
196 ph10 391 STRING_ACCEPT0
197     STRING_COMMIT0
198     STRING_F0
199     STRING_FAIL0
200     STRING_PRUNE0
201     STRING_SKIP0
202     STRING_THEN;
203 ph10 240
/* One { name length, opcode } pair per verb, in the same order as the names
in verbnames. F and FAIL are two spellings that share OP_FAIL. */
204 ph10 327 static const verbitem verbs[] = {
205 ph10 240 { 6, OP_ACCEPT },
206     { 6, OP_COMMIT },
207     { 1, OP_FAIL },
208     { 4, OP_FAIL },
209     { 5, OP_PRUNE },
210     { 4, OP_SKIP },
211     { 4, OP_THEN }
212 ph10 210 };
213    
/* Number of entries in verbs[], computed from the table itself so that the
count follows any addition to the table. */
214 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
215 ph10 210
216    
217 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
218     now all in a single string, to reduce the number of relocations when a shared
219 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
220     length entry. The first three must be alpha, lower, upper, as this is assumed
221     for handling case independence. */
222 nigel 77
/* The class names in table order; posix_name_lengths and posix_class_maps
list their entries in this same order. */
223 ph10 240 static const char posix_names[] =
224 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
225     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
226 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
227     STRING_word0 STRING_xdigit;
228 nigel 77
/* Twelve five-letter names, then word (4) and xdigit (6); the final 0 marks
the end of the list. */
229     static const uschar posix_name_lengths[] = {
230     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
231    
232 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
233     base map, with an optional addition or removal of another map. Then, for some
234     classes, there is some additional tweaking: for [:blank:] the vertical space
235     characters are removed, and for [:alpha:] and [:alnum:] the underscore
236     character is removed. The triples in the table consist of the base map offset,
237     second map offset or -1 if no second map, and a non-negative value for map
238     addition or a negative value for map subtraction (if there are two maps). The
239     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
240     remove vertical space characters, 2 => remove underscore. */
241 nigel 77
/* Three ints per class (base map, second map or -1, add/subtract and tweak
code), one row per class in the same order as posix_names. */
242     static const int posix_class_maps[] = {
243 nigel 87 cbit_word, cbit_digit, -2, /* alpha - word minus digit, without underscore */
244     cbit_lower, -1, 0, /* lower */
245     cbit_upper, -1, 0, /* upper */
246     cbit_word, -1, 2, /* alnum - word without underscore */
247     cbit_print, cbit_cntrl, 0, /* ascii - print plus cntrl */
248     cbit_space, -1, 1, /* blank - a GNU extension */
249     cbit_cntrl, -1, 0, /* cntrl */
250     cbit_digit, -1, 0, /* digit */
251     cbit_graph, -1, 0, /* graph */
252     cbit_print, -1, 0, /* print */
253     cbit_punct, -1, 0, /* punct */
254     cbit_space, -1, 0, /* space */
255     cbit_word, -1, 0, /* word - a Perl extension */
256     cbit_xdigit,-1, 0 /* xdigit */
257 nigel 77 };
258    
259    
/* STRING turns its argument into a string literal exactly as written.
XSTRING passes its argument through one more macro level, so the argument is
macro-expanded first: XSTRING(MAX_NAME_SIZE) gives the text of the macro's
expansion rather than the text "MAX_NAME_SIZE". */
260 nigel 93 #define STRING(a) # a
261     #define XSTRING(s) STRING(s)
262    
263 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
264 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
265     they are documented. Always add a new error instead. Messages marked DEAD below
266 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
267     the number of relocations needed when a shared library is loaded dynamically,
268     it is now one long string. We cannot use a table of offsets, because the
269     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
270     simply count through to the one we want - this isn't a performance issue
271 ph10 499 because these strings are used only when there is a compilation error.
272 nigel 77
273 ph10 499 Each substring ends with \0 to insert a null character. This includes the final
274     substring, so that the whole string ends with \0\0, which can be detected when
275     counting through. */
276    
/* Message n is found by skipping n null-terminated substrings from the start
(see find_error_text()). The numbered comments mark every fifth message to
make counting easier; never insert or remove a message in the middle, because
that would renumber everything after it. */
277 ph10 240 static const char error_texts[] =
278     "no error\0"
279     "\\ at end of pattern\0"
280     "\\c at end of pattern\0"
281     "unrecognized character follows \\\0"
282     "numbers out of order in {} quantifier\0"
283 nigel 77 /* 5 */
284 ph10 240 "number too big in {} quantifier\0"
285     "missing terminating ] for character class\0"
286     "invalid escape sequence in character class\0"
287     "range out of order in character class\0"
288     "nothing to repeat\0"
289 nigel 77 /* 10 */
290 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
291     "internal error: unexpected repeat\0"
292 ph10 269 "unrecognized character after (? or (?-\0"
293 ph10 240 "POSIX named classes are supported only within a class\0"
294     "missing )\0"
295 nigel 77 /* 15 */
296 ph10 240 "reference to non-existent subpattern\0"
297     "erroffset passed as NULL\0"
298     "unknown option bit(s) set\0"
299     "missing ) after comment\0"
300     "parentheses nested too deeply\0" /** DEAD **/
301 nigel 77 /* 20 */
302 ph10 240 "regular expression is too large\0"
303     "failed to get memory\0"
304     "unmatched parentheses\0"
305     "internal error: code overflow\0"
306     "unrecognized character after (?<\0"
307 nigel 77 /* 25 */
308 ph10 240 "lookbehind assertion is not fixed length\0"
309     "malformed number or name after (?(\0"
310     "conditional group contains more than two branches\0"
311     "assertion expected after (?(\0"
312     "(?R or (?[+-]digits must be followed by )\0"
313 nigel 77 /* 30 */
314 ph10 240 "unknown POSIX class name\0"
315     "POSIX collating elements are not supported\0"
316     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
317     "spare error\0" /** DEAD **/
318     "character value in \\x{...} sequence is too large\0"
319 nigel 77 /* 35 */
320 ph10 240 "invalid condition (?(0)\0"
321     "\\C not allowed in lookbehind assertion\0"
322     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
323     "number after (?C is > 255\0"
324     "closing ) for (?C expected\0"
325 nigel 77 /* 40 */
326 ph10 240 "recursive call could loop indefinitely\0"
327     "unrecognized character after (?P\0"
328     "syntax error in subpattern name (missing terminator)\0"
329     "two named subpatterns have the same name\0"
330     "invalid UTF-8 string\0"
331 nigel 77 /* 45 */
332 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
333     "malformed \\P or \\p sequence\0"
334     "unknown property name after \\P or \\p\0"
335     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
336     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
337 nigel 91 /* 50 */
338 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
339     "octal value is greater than \\377 (not in UTF-8 mode)\0"
340     "internal error: overran compiling workspace\0"
341     "internal error: previously-checked referenced subpattern not found\0"
342     "DEFINE group contains more than one branch\0"
343 nigel 93 /* 55 */
344 ph10 240 "repeating a DEFINE group is not allowed\0"
345     "inconsistent NEWLINE options\0"
346 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
347     "a numbered reference must not be zero\0"
348 ph10 240 "(*VERB) with an argument is not supported\0"
349 ph10 211 /* 60 */
350 ph10 240 "(*VERB) not recognized\0"
351 ph10 268 "number is too big\0"
352 ph10 272 "subpattern name expected\0"
353 ph10 336 "digit expected after (?+\0"
354 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
355     /* 65 */
356 ph10 499 "different names for subpatterns of the same number are not allowed\0";
357 nigel 77
358     /* Table to identify digits and hex digits. This is used when compiling
359     patterns. Note that the tables in chartables are dependent on the locale, and
360     may mark arbitrary characters as digits - but the PCRE compiling code expects
361     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
362     a private table here. It costs 256 bytes, but it is a lot faster than doing
363     character value tests (at least in some simple cases I timed), and in some
364     applications one wants PCRE to compile efficiently as well as match
365     efficiently.
366    
367     For convenience, we use the same bit definitions as in chartables:
368    
369     0x04 decimal digit
370     0x08 hexadecimal digit
371    
372     Then we can use ctype_digit and ctype_xdigit in the code. */
373    
374 ph10 392 #ifndef EBCDIC
375 ph10 391
376 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
377 ph10 391 UTF-8 mode. */
378    
/* Indexed by character value, 0 to 255, eight entries per row. 0x0c marks a
decimal digit that is also a hex digit (0x04 | 0x08); 0x08 marks a letter
(A-F or a-f) that is a hex digit only; everything else is 0x00. */
379 nigel 77 static const unsigned char digitab[] =
380     {
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
387     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
388     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
389     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
393     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
394     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
395     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
396     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
397     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
398     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
399     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
401     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
405     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
406     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
408     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
409     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
410     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
413    
414 ph10 392 #else
415 ph10 391
416     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
417    
/* Indexed by character value, 0 to 255, eight entries per row; the trailing
hex number on every second row comment is the code of that row's first entry.
0x0c marks a decimal digit that is also a hex digit (0x04 | 0x08); 0x08 marks
a letter (a-f or A-F) that is a hex digit only; everything else is 0x00. */
418 nigel 77 static const unsigned char digitab[] =
419     {
420     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
421     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
422     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
423     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
424     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
425     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
426     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
427     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
428     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
429     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
430     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
431 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
432 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
433     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
434     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
435     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
436     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
437     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
438     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
439     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
440     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
441     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
442     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
443     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
444     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
445     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
446     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
447     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
448     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
449     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
450     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
451     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
452    
/* Indexed by character value, 0 to 255, eight entries per row. In this part
of the file it is read only by check_escape(), which treats a character as
alphanumeric when any of the bits in 0x0E is set in its entry.
NOTE(review): the individual bit meanings are presumably those of the ctypes
table in the main character tables ("chartable partial dup") - confirm
against that table before changing any value. */
453     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
454     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
455     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
456     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
457     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
458     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
459     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
462     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
463     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
465 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
466 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
467     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
468     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
470     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
471     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
472     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
473     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
474     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
475     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
476     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
477     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
478     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
479     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
480     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
481     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
482     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
483     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
484     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
485     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
486     #endif
487    
488    
489     /* Definition to allow mutual recursion */
490    
/* Forward declaration only: the parameter list gives types but no names.
NOTE(review): the definition lies outside this part of the file - consult it
for the meaning of each argument. */
491     static BOOL
492 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
493 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
494 nigel 77
495    
496    
497     /*************************************************
498 ph10 240 * Find an error text *
499     *************************************************/
500    
501 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
502     some of the text is of unknown length, we can't use a table of offsets.
503     Instead, just count through the strings. This is not a performance issue
504 ph10 240 because it happens only when there has been a compilation error.
505    
506     Argument: the error number
507     Returns: pointer to the error string
508     */
509    
510     static const char *
511     find_error_text(int n)
512     {
513     const char *s = error_texts;
514 ph10 499 for (; n > 0; n--)
515     {
516     while (*s++ != 0) {};
517     if (*s == 0) return "Error text not found (please report)";
518     }
519 ph10 240 return s;
520     }
521    
522    
523     /*************************************************
524 nigel 77 * Handle escapes *
525     *************************************************/
526    
527     /* This function is called when a \ has been encountered. It either returns a
528     positive value for a simple escape such as \n, or a negative value which
529 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
530     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
531     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
532     ptr is pointing at the \. On exit, it is on the final character of the escape
533     sequence.
534 nigel 77
535     Arguments:
536     ptrptr points to the pattern position pointer
537     errorcodeptr points to the errorcode variable
538     bracount number of previous extracting brackets
539     options the options bits
540     isclass TRUE if inside a character class
541    
542     Returns: zero or positive => a data character
543     negative => a special escape sequence
544 ph10 213 on error, errorcodeptr is set
545 nigel 77 */
546    
547     static int
548     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
549     int options, BOOL isclass)
550     {
551 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
552     const uschar *ptr = *ptrptr + 1;
553 nigel 77 int c, i;
554    
555 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
556     ptr--; /* Set pointer back to the last byte */
557    
558 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
559    
560     if (c == 0) *errorcodeptr = ERR1;
561    
562 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
563     in a table. A non-zero result is something that can be returned immediately.
564 nigel 77 Otherwise further processing may be required. */
565    
566 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
567     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
568     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
569 nigel 77
570 ph10 97 #else /* EBCDIC coding */
571 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
572 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
573     #endif
574    
575     /* Escapes that need further processing, or are illegal. */
576    
577     else
578     {
579     const uschar *oldptr;
580 nigel 93 BOOL braced, negated;
581    
582 nigel 77 switch (c)
583     {
584     /* A number of Perl escapes are not handled by PCRE. We give an explicit
585     error. */
586    
587 ph10 391 case CHAR_l:
588     case CHAR_L:
589     case CHAR_N:
590     case CHAR_u:
591     case CHAR_U:
592 nigel 77 *errorcodeptr = ERR37;
593     break;
594    
595 ph10 333 /* \g must be followed by one of a number of specific things:
596 ph10 345
597 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
598     backreference. If negative, it is a relative backreference. This is a Perl
599     5.10 feature.
600 ph10 345
601 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
602     is part of Perl's movement towards a unified syntax for back references. As
603     this is synonymous with \k{name}, we fudge it up by pretending it really
604     was \k.
605 ph10 345
606     (3) For Oniguruma compatibility we also support \g followed by a name or a
607     number either in angle brackets or in single quotes. However, these are
608     (possibly recursive) subroutine calls, _not_ backreferences. Just return
609 ph10 333 the -ESC_g code (cf \k). */
610 nigel 93
611 ph10 391 case CHAR_g:
612     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
613 ph10 333 {
614     c = -ESC_g;
615 ph10 345 break;
616     }
617 ph10 333
618     /* Handle the Perl-compatible cases */
619 ph10 345
620 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
621 nigel 93 {
622 ph10 171 const uschar *p;
623 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
624     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
625     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
626 ph10 171 {
627     c = -ESC_k;
628     break;
629 ph10 172 }
630 nigel 93 braced = TRUE;
631     ptr++;
632     }
633     else braced = FALSE;
634    
635 ph10 391 if (ptr[1] == CHAR_MINUS)
636 nigel 93 {
637     negated = TRUE;
638     ptr++;
639     }
640     else negated = FALSE;
641    
642     c = 0;
643     while ((digitab[ptr[1]] & ctype_digit) != 0)
644 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
645 ph10 220
646 ph10 333 if (c < 0) /* Integer overflow */
647 ph10 213 {
648     *errorcodeptr = ERR61;
649     break;
650 ph10 220 }
651 ph10 345
652 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
653 nigel 93 {
654     *errorcodeptr = ERR57;
655 ph10 213 break;
656 nigel 93 }
657 ph10 345
658 ph10 333 if (c == 0)
659     {
660     *errorcodeptr = ERR58;
661     break;
662 ph10 345 }
663 nigel 93
664     if (negated)
665     {
666     if (c > bracount)
667     {
668     *errorcodeptr = ERR15;
669 ph10 213 break;
670 nigel 93 }
671     c = bracount - (c - 1);
672     }
673    
674     c = -(ESC_REF + c);
675     break;
676    
677 nigel 77 /* The handling of escape sequences consisting of a string of digits
678     starting with one that is not zero is not straightforward. By experiment,
679     the way Perl works seems to be as follows:
680    
681     Outside a character class, the digits are read as a decimal number. If the
682     number is less than 10, or if there are that many previous extracting
683     left brackets, then it is a back reference. Otherwise, up to three octal
684     digits are read to form an escaped byte. Thus \123 is likely to be octal
685     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
686     value is greater than 377, the least significant 8 bits are taken. Inside a
687     character class, \ followed by a digit is always an octal number. */
688    
689 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
690     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
691 nigel 77
692     if (!isclass)
693     {
694     oldptr = ptr;
695 ph10 391 c -= CHAR_0;
696 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
697 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
698 ph10 333 if (c < 0) /* Integer overflow */
699 ph10 213 {
700     *errorcodeptr = ERR61;
701 ph10 220 break;
702     }
703 nigel 77 if (c < 10 || c <= bracount)
704     {
705     c = -(ESC_REF + c);
706     break;
707     }
708     ptr = oldptr; /* Put the pointer back and fall through */
709     }
710    
711     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
712     generates a binary zero byte and treats the digit as a following literal.
713     Thus we have to pull back the pointer by one. */
714    
715 ph10 391 if ((c = *ptr) >= CHAR_8)
716 nigel 77 {
717     ptr--;
718     c = 0;
719     break;
720     }
721    
722     /* \0 always starts an octal number, but we may drop through to here with a
723 nigel 91 larger first octal digit. The original code used just to take the least
724     significant 8 bits of octal numbers (I think this is what early Perls used
725     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
726     than 3 octal digits. */
727 nigel 77
728 ph10 391 case CHAR_0:
729     c -= CHAR_0;
730     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
731     c = c * 8 + *(++ptr) - CHAR_0;
732 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
733 nigel 77 break;
734    
735 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
736     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
737     treated as a data character. */
738 nigel 77
739 ph10 391 case CHAR_x:
740     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
741 nigel 77 {
742     const uschar *pt = ptr + 2;
743 nigel 87 int count = 0;
744    
745 nigel 77 c = 0;
746     while ((digitab[*pt] & ctype_xdigit) != 0)
747     {
748 nigel 87 register int cc = *pt++;
749 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
750 nigel 77 count++;
751 nigel 87
752 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
753     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
754     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
755 ph10 97 #else /* EBCDIC coding */
756 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
757     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
758 nigel 77 #endif
759     }
760 nigel 87
761 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
762 nigel 77 {
763 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
764 nigel 77 ptr = pt;
765     break;
766     }
767 nigel 87
768 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
769     recognize this construct; fall through to the normal \x handling. */
770     }
771    
772 nigel 87 /* Read just a single-byte hex-defined char */
773 nigel 77
774     c = 0;
775     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
776     {
777 ph10 391 int cc; /* Some compilers don't like */
778     cc = *(++ptr); /* ++ in initializers */
779     #ifndef EBCDIC /* ASCII/UTF-8 coding */
780     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
781     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
782 ph10 97 #else /* EBCDIC coding */
783 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
784     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
785 nigel 77 #endif
786     }
787     break;
788    
789 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
790     This coding is ASCII-specific, but then the whole concept of \cx is
791     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
792 nigel 77
793 ph10 391 case CHAR_c:
794 nigel 77 c = *(++ptr);
795     if (c == 0)
796     {
797     *errorcodeptr = ERR2;
798 ph10 213 break;
799 nigel 77 }
800    
801 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
802     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
803 nigel 77 c ^= 0x40;
804 ph10 97 #else /* EBCDIC coding */
805 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
806 nigel 77 c ^= 0xC0;
807     #endif
808     break;
809    
810     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
811 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
812     otherwise, for Perl compatibility, it is a literal. This code looks a bit
813     odd, but there used to be some cases other than the default, and there may
814     be again in future, so I haven't "optimized" it. */
815 nigel 77
816     default:
817     if ((options & PCRE_EXTRA) != 0) switch(c)
818     {
819     default:
820     *errorcodeptr = ERR3;
821     break;
822     }
823     break;
824     }
825     }
826    
827     *ptrptr = ptr;
828     return c;
829     }
830    
831    
832    
833     #ifdef SUPPORT_UCP
834     /*************************************************
835     * Handle \P and \p *
836     *************************************************/
837    
838     /* This function is called after \P or \p has been encountered, provided that
839     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
840     pointing at the P or p. On exit, it is pointing at the final character of the
841     escape sequence.
842    
843     Argument:
844     ptrptr points to the pattern position pointer
845     negptr points to a boolean that is set TRUE for negation else FALSE
846 nigel 87 dptr points to an int that is set to the detailed property value
847 nigel 77 errorcodeptr points to the error code variable
848    
849 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
850 nigel 77 */
851    
852     static int
853 nigel 87 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
854 nigel 77 {
855     int c, i, bot, top;
856     const uschar *ptr = *ptrptr;
857 nigel 87 char name[32];
858 nigel 77
859     c = *(++ptr);
860     if (c == 0) goto ERROR_RETURN;
861    
862     *negptr = FALSE;
863    
864 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
865     negation. */
866 nigel 77
867 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
868 nigel 77 {
869 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
870 nigel 77 {
871     *negptr = TRUE;
872     ptr++;
873     }
874 ph10 199 for (i = 0; i < (int)sizeof(name) - 1; i++)
875 nigel 77 {
876     c = *(++ptr);
877     if (c == 0) goto ERROR_RETURN;
878 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
879 nigel 77 name[i] = c;
880     }
881 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
882 nigel 77 name[i] = 0;
883     }
884    
885     /* Otherwise there is just one following character */
886    
887     else
888     {
889     name[0] = c;
890     name[1] = 0;
891     }
892    
893     *ptrptr = ptr;
894    
895     /* Search for a recognized property name using binary chop */
896    
897     bot = 0;
898     top = _pcre_utt_size;
899    
900     while (bot < top)
901     {
902 nigel 87 i = (bot + top) >> 1;
903 ph10 240 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
904 nigel 87 if (c == 0)
905     {
906     *dptr = _pcre_utt[i].value;
907     return _pcre_utt[i].type;
908     }
909 nigel 77 if (c > 0) bot = i + 1; else top = i;
910     }
911    
912     *errorcodeptr = ERR47;
913     *ptrptr = ptr;
914     return -1;
915    
916     ERROR_RETURN:
917     *errorcodeptr = ERR46;
918     *ptrptr = ptr;
919     return -1;
920     }
921     #endif
922    
923    
924    
925    
926     /*************************************************
927     * Check for counted repeat *
928     *************************************************/
929    
930     /* This function is called when a '{' is encountered in a place where it might
931     start a quantifier. It looks ahead to see if it really is a quantifier or not.
932     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
933     where the ddds are digits.
934    
935     Arguments:
936     p pointer to the first char after '{'
937    
938     Returns: TRUE or FALSE
939     */
940    
941     static BOOL
942     is_counted_repeat(const uschar *p)
943     {
944     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
945     while ((digitab[*p] & ctype_digit) != 0) p++;
946 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
947 nigel 77
948 ph10 391 if (*p++ != CHAR_COMMA) return FALSE;
949     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
950 nigel 77
951     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
952     while ((digitab[*p] & ctype_digit) != 0) p++;
953    
954 ph10 391 return (*p == CHAR_RIGHT_CURLY_BRACKET);
955 nigel 77 }
956    
957    
958    
959     /*************************************************
960     * Read repeat counts *
961     *************************************************/
962    
963     /* Read an item of the form {n,m} and return the values. This is called only
964     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
965     so the syntax is guaranteed to be correct, but we need to check the values.
966    
967     Arguments:
968     p pointer to first char after '{'
969     minp pointer to int for min
970     maxp pointer to int for max
971     returned as -1 if no max
972     errorcodeptr points to error code variable
973    
974     Returns: pointer to '}' on success;
975     current ptr on error, with errorcodeptr set non-zero
976     */
977    
978     static const uschar *
979     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
980     {
981     int min = 0;
982     int max = -1;
983    
984 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
985     an integer overflow. */
986    
987 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
988 nigel 81 if (min < 0 || min > 65535)
989     {
990     *errorcodeptr = ERR5;
991     return p;
992     }
993 nigel 77
994 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
995     Also, max must not be less than min. */
996    
997 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
998 nigel 77 {
999 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1000 nigel 77 {
1001     max = 0;
1002 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1003 nigel 81 if (max < 0 || max > 65535)
1004     {
1005     *errorcodeptr = ERR5;
1006     return p;
1007     }
1008 nigel 77 if (max < min)
1009     {
1010     *errorcodeptr = ERR4;
1011     return p;
1012     }
1013     }
1014     }
1015    
1016 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1017     '}'. */
1018 nigel 77
1019 nigel 81 *minp = min;
1020     *maxp = max;
1021 nigel 77 return p;
1022     }
1023    
1024    
1025    
1026     /*************************************************
1027 ph10 408 * Subroutine for finding forward reference *
1028 nigel 91 *************************************************/
1029    
1030 ph10 408 /* This recursive function is called only from find_parens() below. The
1031     top-level call starts at the beginning of the pattern. All other calls must
1032     start at a parenthesis. It scans along a pattern's text looking for capturing
1033 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1034     name it is given, it returns its number. Alternatively, if the name is NULL, it
1035 ph10 408 returns when it reaches a given numbered subpattern. We know that if (?P< is
1036     encountered, the name will be terminated by '>' because that is checked in the
1037 ph10 411 first pass. Recursion is used to keep track of subpatterns that reset the
1038 ph10 408 capturing group numbers - the (?| feature.
1039 nigel 91
1040     Arguments:
1041 ph10 408 ptrptr address of the current character pointer (updated)
1042 ph10 345 cd compile background data
1043 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1044     lorn name length, or subpattern number if name is NULL
1045     xmode TRUE if we are in /x mode
1046 ph10 411 count pointer to the current capturing subpattern number (updated)
1047 nigel 91
1048     Returns: the number of the named subpattern, or -1 if not found
1049     */
1050    
1051     static int
1052 ph10 408 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1053     BOOL xmode, int *count)
1054 nigel 91 {
1055 ph10 408 uschar *ptr = *ptrptr;
1056     int start_count = *count;
1057     int hwm_count = start_count;
1058     BOOL dup_parens = FALSE;
1059 nigel 93
1060 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1061 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1062    
1063     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1064     {
1065     if (ptr[1] == CHAR_QUESTION_MARK &&
1066 ph10 411 ptr[2] == CHAR_VERTICAL_LINE)
1067 ph10 408 {
1068     ptr += 3;
1069 ph10 411 dup_parens = TRUE;
1070     }
1071 ph10 408
1072     /* Handle a normal, unnamed capturing parenthesis */
1073 ph10 411
1074 ph10 408 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1075     {
1076     *count += 1;
1077     if (name == NULL && *count == lorn) return *count;
1078 ph10 411 ptr++;
1079 ph10 408 }
1080    
1081     /* Handle a condition. If it is an assertion, just carry on so that it
1082     is processed as normal. If not, skip to the closing parenthesis of the
1083 ph10 411 condition (there can't be any nested parens. */
1084    
1085 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1086     {
1087 ph10 411 ptr += 2;
1088 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1089     {
1090     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1091 ph10 411 if (*ptr != 0) ptr++;
1092 ph10 408 }
1093 ph10 411 }
1094    
1095 ph10 408 /* We have either (? or (* and not a condition */
1096    
1097     else
1098 ph10 411 {
1099 ph10 408 ptr += 2;
1100     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1101    
1102     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1103 ph10 411
1104 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1105     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1106     {
1107     int term;
1108     const uschar *thisname;
1109     *count += 1;
1110     if (name == NULL && *count == lorn) return *count;
1111     term = *ptr++;
1112     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1113     thisname = ptr;
1114     while (*ptr != term) ptr++;
1115     if (name != NULL && lorn == ptr - thisname &&
1116     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1117     return *count;
1118 ph10 461 term++;
1119 ph10 411 }
1120 ph10 408 }
1121 ph10 411 }
1122 ph10 408
1123 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1124 ph10 408 bars. */
1125    
1126 nigel 91 for (; *ptr != 0; ptr++)
1127     {
1128 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1129    
1130 ph10 391 if (*ptr == CHAR_BACKSLASH)
1131 nigel 93 {
1132 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1133 ph10 391 if (*ptr == CHAR_Q) for (;;)
1134 nigel 93 {
1135 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1136 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1137 ph10 391 if (*(++ptr) == CHAR_E) break;
1138 nigel 93 }
1139     continue;
1140     }
1141    
1142 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1143     are handled for real. If the first character is '^', skip it. Also, if the
1144     first few characters (either before or after ^) are \Q\E or \E we skip them
1145 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1146 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1147 nigel 93
1148 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1149 nigel 93 {
1150 ph10 340 BOOL negate_class = FALSE;
1151     for (;;)
1152     {
1153 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1154 ph10 340 {
1155 ph10 438 if (ptr[2] == CHAR_E)
1156     ptr+= 2;
1157     else if (strncmp((const char *)ptr+2,
1158 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1159 ph10 438 ptr += 4;
1160 ph10 392 else
1161 ph10 391 break;
1162 ph10 340 }
1163 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1164 ph10 461 {
1165 ph10 340 negate_class = TRUE;
1166 ph10 438 ptr++;
1167 ph10 461 }
1168 ph10 340 else break;
1169     }
1170    
1171     /* If the next character is ']', it is a data character that must be
1172 ph10 341 skipped, except in JavaScript compatibility mode. */
1173 ph10 345
1174 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1175 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1176 ph10 345 ptr++;
1177    
1178 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1179 nigel 93 {
1180 ph10 220 if (*ptr == 0) return -1;
1181 ph10 391 if (*ptr == CHAR_BACKSLASH)
1182 nigel 93 {
1183 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1184 ph10 391 if (*ptr == CHAR_Q) for (;;)
1185 nigel 93 {
1186 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1187 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1188 ph10 391 if (*(++ptr) == CHAR_E) break;
1189 nigel 93 }
1190     continue;
1191     }
1192     }
1193     continue;
1194     }
1195    
1196     /* Skip comments in /x mode */
1197    
1198 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1199 nigel 93 {
1200 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1201 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1202 nigel 93 continue;
1203     }
1204    
1205 ph10 408 /* Check for the special metacharacters */
1206 ph10 411
1207 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1208 nigel 93 {
1209 ph10 408 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1210     if (rc > 0) return rc;
1211     if (*ptr == 0) goto FAIL_EXIT;
1212 nigel 93 }
1213 ph10 411
1214 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1215     {
1216     if (dup_parens && *count < hwm_count) *count = hwm_count;
1217     *ptrptr = ptr;
1218     return -1;
1219     }
1220 ph10 411
1221     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1222 ph10 408 {
1223     if (*count > hwm_count) hwm_count = *count;
1224     *count = start_count;
1225 ph10 411 }
1226 ph10 408 }
1227 nigel 93
1228 ph10 408 FAIL_EXIT:
1229     *ptrptr = ptr;
1230     return -1;
1231     }
1232 nigel 93
1233    
1234    
1235    
1236 ph10 408 /*************************************************
1237     * Find forward referenced subpattern *
1238     *************************************************/
1239 nigel 93
1240 ph10 408 /* This function scans along a pattern's text looking for capturing
1241     subpatterns, and counting them. If it finds a named pattern that matches the
1242     name it is given, it returns its number. Alternatively, if the name is NULL, it
1243     returns when it reaches a given numbered subpattern. This is used for forward
1244     references to subpatterns. We used to be able to start this scan from the
1245     current compiling point, using the current count value from cd->bracount, and
1246     do it all in a single loop, but the addition of the possibility of duplicate
1247     subpattern numbers means that we have to scan from the very start, in order to
1248     take account of such duplicates, and to use a recursive function to keep track
1249     of the different types of group.
1250    
1251     Arguments:
1252     cd compile background data
1253     name name to seek, or NULL if seeking a numbered subpattern
1254     lorn name length, or subpattern number if name is NULL
1255     xmode TRUE if we are in /x mode
1256    
1257     Returns: the number of the found subpattern, or -1 if not found
1258     */
1259    
1260     static int
1261     find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1262     {
1263     uschar *ptr = (uschar *)cd->start_pattern;
1264     int count = 0;
1265     int rc;
1266    
1267     /* If the pattern does not start with an opening parenthesis, the first call
1268     to find_parens_sub() will scan right to the end (if necessary). However, if it
1269     does start with a parenthesis, find_parens_sub() will return when it hits the
1270     matching closing parens. That is why we have to have a loop. */
1271    
1272 ph10 411 for (;;)
1273     {
1274 ph10 408 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1275 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1276     }
1277    
1278 ph10 408 return rc;
1279 nigel 91 }
1280    
1281    
1282    
1283 ph10 408
1284 nigel 91 /*************************************************
1285 nigel 77 * Find first significant op code *
1286     *************************************************/
1287    
1288     /* This is called by several functions that scan a compiled expression looking
1289     for a fixed first character, or an anchoring op code etc. It skips over things
1290     that do not influence this. For some calls, a change of option is important.
1291     For some calls, it makes sense to skip negative forward and all backward
1292     assertions, and also the \b assertion; for others it does not.
1293    
1294     Arguments:
1295     code pointer to the start of the group
1296     options pointer to external options
1297     optbit the option bit whose changing is significant, or
1298     zero if none are
1299     skipassert TRUE if certain assertions are to be skipped
1300    
1301     Returns: pointer to the first significant opcode
1302     */
1303    
1304     static const uschar*
1305     first_significant_code(const uschar *code, int *options, int optbit,
1306     BOOL skipassert)
1307     {
1308     for (;;)
1309     {
1310     switch ((int)*code)
1311     {
1312     case OP_OPT:
1313     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1314     *options = (int)code[1];
1315     code += 2;
1316     break;
1317    
1318     case OP_ASSERT_NOT:
1319     case OP_ASSERTBACK:
1320     case OP_ASSERTBACK_NOT:
1321     if (!skipassert) return code;
1322     do code += GET(code, 1); while (*code == OP_ALT);
1323     code += _pcre_OP_lengths[*code];
1324     break;
1325    
1326     case OP_WORD_BOUNDARY:
1327     case OP_NOT_WORD_BOUNDARY:
1328     if (!skipassert) return code;
1329     /* Fall through */
1330    
1331     case OP_CALLOUT:
1332     case OP_CREF:
1333 ph10 459 case OP_NCREF:
1334 nigel 93 case OP_RREF:
1335 ph10 459 case OP_NRREF:
1336 nigel 93 case OP_DEF:
1337 nigel 77 code += _pcre_OP_lengths[*code];
1338     break;
1339    
1340     default:
1341     return code;
1342     }
1343     }
1344     /* Control never reaches here */
1345     }
1346    
1347    
1348    
1349    
1350     /*************************************************
1351 ph10 454 * Find the fixed length of a branch *
1352 nigel 77 *************************************************/
1353    
1354 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1355 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1356 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1357 ph10 454 temporarily terminated with OP_END when this function is called.
1358 nigel 77
1359 ph10 461 This function is called when a backward assertion is encountered, so that if it
1360     fails, the error message can point to the correct place in the pattern.
1361 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1362 ph10 461 because they can be forward references. We solve this by remembering this case
1363 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1364    
1365 nigel 77 Arguments:
1366     code points to the start of the pattern (the bracket)
1367     options the compiling options
1368 ph10 461 atend TRUE if called when the pattern is complete
1369     cd the "compile data" structure
1370 nigel 77
1371 ph10 461 Returns: the fixed length,
1372 ph10 454 or -1 if there is no fixed length,
1373 nigel 77 or -2 if \C was encountered
1374 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1375 nigel 77 */
1376    
1377     static int
1378 ph10 454 find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1379 nigel 77 {
1380     int length = -1;
1381    
1382     register int branchlength = 0;
1383     register uschar *cc = code + 1 + LINK_SIZE;
1384    
1385     /* Scan along the opcodes for this branch. If we get to the end of the
1386     branch, check the length against that of the other branches. */
1387    
1388     for (;;)
1389     {
1390     int d;
1391 ph10 454 uschar *ce, *cs;
1392 nigel 77 register int op = *cc;
1393     switch (op)
1394     {
1395 nigel 93 case OP_CBRA:
1396 nigel 77 case OP_BRA:
1397     case OP_ONCE:
1398     case OP_COND:
1399 ph10 454 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1400 nigel 77 if (d < 0) return d;
1401     branchlength += d;
1402     do cc += GET(cc, 1); while (*cc == OP_ALT);
1403     cc += 1 + LINK_SIZE;
1404     break;
1405    
1406     /* Reached end of a branch; if it's a ket it is the end of a nested
1407     call. If it's ALT it is an alternation in a nested call. If it is
1408     END it's the end of the outer call. All can be handled by the same code. */
1409    
1410     case OP_ALT:
1411     case OP_KET:
1412     case OP_KETRMAX:
1413     case OP_KETRMIN:
1414     case OP_END:
1415     if (length < 0) length = branchlength;
1416     else if (length != branchlength) return -1;
1417     if (*cc != OP_ALT) return length;
1418     cc += 1 + LINK_SIZE;
1419     branchlength = 0;
1420     break;
1421 ph10 461
1422 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1423     be OK. If the subroutine is a forward reference, we can't deal with
1424     it until the end of the pattern, so return -3. */
1425 ph10 461
1426 ph10 454 case OP_RECURSE:
1427     if (!atend) return -3;
1428     cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1429     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1430     if (cc > cs && cc < ce) return -1; /* Recursion */
1431     d = find_fixedlength(cs + 2, options, atend, cd);
1432 ph10 461 if (d < 0) return d;
1433 ph10 454 branchlength += d;
1434     cc += 1 + LINK_SIZE;
1435 ph10 461 break;
1436 nigel 77
1437     /* Skip over assertive subpatterns */
1438    
1439     case OP_ASSERT:
1440     case OP_ASSERT_NOT:
1441     case OP_ASSERTBACK:
1442     case OP_ASSERTBACK_NOT:
1443     do cc += GET(cc, 1); while (*cc == OP_ALT);
1444     /* Fall through */
1445    
1446     /* Skip over things that don't match chars */
1447    
1448     case OP_REVERSE:
1449     case OP_CREF:
1450 ph10 459 case OP_NCREF:
1451 nigel 93 case OP_RREF:
1452 ph10 459 case OP_NRREF:
1453 nigel 93 case OP_DEF:
1454 nigel 77 case OP_OPT:
1455     case OP_CALLOUT:
1456     case OP_SOD:
1457     case OP_SOM:
1458 ph10 500 case OP_SET_SOM:
1459 nigel 77 case OP_EOD:
1460     case OP_EODN:
1461     case OP_CIRC:
1462     case OP_DOLL:
1463     case OP_NOT_WORD_BOUNDARY:
1464     case OP_WORD_BOUNDARY:
1465     cc += _pcre_OP_lengths[*cc];
1466     break;
1467    
1468     /* Handle literal characters */
1469    
1470     case OP_CHAR:
1471     case OP_CHARNC:
1472 nigel 91 case OP_NOT:
1473 nigel 77 branchlength++;
1474     cc += 2;
1475     #ifdef SUPPORT_UTF8
1476 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1477 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1478 nigel 77 #endif
1479     break;
1480    
1481     /* Handle exact repetitions. The count is already in characters, but we
1482     need to skip over a multibyte character in UTF8 mode. */
1483    
1484     case OP_EXACT:
1485     branchlength += GET2(cc,1);
1486     cc += 4;
1487     #ifdef SUPPORT_UTF8
1488 ph10 461 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1489 ph10 426 cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1490 nigel 77 #endif
1491     break;
1492    
1493     case OP_TYPEEXACT:
1494     branchlength += GET2(cc,1);
1495 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1496 nigel 77 cc += 4;
1497     break;
1498    
1499     /* Handle single-char matchers */
1500    
1501     case OP_PROP:
1502     case OP_NOTPROP:
1503 nigel 87 cc += 2;
1504 nigel 77 /* Fall through */
1505    
1506     case OP_NOT_DIGIT:
1507     case OP_DIGIT:
1508     case OP_NOT_WHITESPACE:
1509     case OP_WHITESPACE:
1510     case OP_NOT_WORDCHAR:
1511     case OP_WORDCHAR:
1512     case OP_ANY:
1513 ph10 342 case OP_ALLANY:
1514 nigel 77 branchlength++;
1515     cc++;
1516     break;
1517    
1518     /* The single-byte matcher isn't allowed */
1519    
1520     case OP_ANYBYTE:
1521     return -2;
1522    
1523     /* Check a class for variable quantification */
1524    
1525     #ifdef SUPPORT_UTF8
1526     case OP_XCLASS:
1527     cc += GET(cc, 1) - 33;
1528     /* Fall through */
1529     #endif
1530    
1531     case OP_CLASS:
1532     case OP_NCLASS:
1533     cc += 33;
1534    
1535     switch (*cc)
1536     {
1537     case OP_CRSTAR:
1538     case OP_CRMINSTAR:
1539     case OP_CRQUERY:
1540     case OP_CRMINQUERY:
1541     return -1;
1542    
1543     case OP_CRRANGE:
1544     case OP_CRMINRANGE:
1545     if (GET2(cc,1) != GET2(cc,3)) return -1;
1546     branchlength += GET2(cc,1);
1547     cc += 5;
1548     break;
1549    
1550     default:
1551     branchlength++;
1552     }
1553     break;
1554    
1555     /* Anything else is variable length */
1556    
1557     default:
1558     return -1;
1559     }
1560     }
1561     /* Control never gets here */
1562     }
1563    
1564    
1565    
1566    
1567     /*************************************************
1568 ph10 454 * Scan compiled regex for specific bracket *
1569 nigel 77 *************************************************/
1570    
1571     /* This little function scans through a compiled pattern until it finds a
1572 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1573 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1574     so that it can be called from pcre_study() when finding the minimum matching
1575 ph10 455 length.
1576 nigel 77
1577     Arguments:
1578     code points to start of expression
1579     utf8 TRUE in UTF-8 mode
1580 ph10 454 number the required bracket number or negative to find a lookbehind
1581 nigel 77
1582     Returns: pointer to the opcode for the bracket, or NULL if not found
1583     */
1584    
1585 ph10 455 const uschar *
1586     _pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1587 nigel 77 {
1588     for (;;)
1589     {
1590     register int c = *code;
1591     if (c == OP_END) return NULL;
1592 nigel 91
1593     /* XCLASS is used for classes that cannot be represented just by a bit
1594     map. This includes negated single high-valued characters. The length in
1595     the table is zero; the actual length is stored in the compiled code. */
1596    
1597     if (c == OP_XCLASS) code += GET(code, 1);
1598 ph10 461
1599 ph10 454 /* Handle recursion */
1600 ph10 461
1601 ph10 454 else if (c == OP_REVERSE)
1602     {
1603 ph10 461 if (number < 0) return (uschar *)code;
1604 ph10 454 code += _pcre_OP_lengths[c];
1605     }
1606 nigel 91
1607 nigel 93 /* Handle capturing bracket */
1608 nigel 91
1609 nigel 93 else if (c == OP_CBRA)
1610 nigel 77 {
1611 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1612 nigel 77 if (n == number) return (uschar *)code;
1613 nigel 93 code += _pcre_OP_lengths[c];
1614 nigel 77 }
1615 nigel 91
1616 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1617     repeated character types, we have to test for \p and \P, which have an extra
1618 ph10 218 two bytes of parameters. */
1619 nigel 91
1620 nigel 77 else
1621     {
1622 ph10 218 switch(c)
1623     {
1624     case OP_TYPESTAR:
1625     case OP_TYPEMINSTAR:
1626     case OP_TYPEPLUS:
1627     case OP_TYPEMINPLUS:
1628     case OP_TYPEQUERY:
1629     case OP_TYPEMINQUERY:
1630     case OP_TYPEPOSSTAR:
1631     case OP_TYPEPOSPLUS:
1632     case OP_TYPEPOSQUERY:
1633     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1634 ph10 220 break;
1635 ph10 221
1636     case OP_TYPEUPTO:
1637     case OP_TYPEMINUPTO:
1638     case OP_TYPEEXACT:
1639     case OP_TYPEPOSUPTO:
1640     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1641     break;
1642 ph10 220 }
1643    
1644 ph10 218 /* Add in the fixed length from the table */
1645 ph10 220
1646 nigel 77 code += _pcre_OP_lengths[c];
1647 ph10 220
1648 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1649     a multi-byte character. The length in the table is a minimum, so we have to
1650     arrange to skip the extra bytes. */
1651 ph10 220
1652 ph10 107 #ifdef SUPPORT_UTF8
1653 nigel 77 if (utf8) switch(c)
1654     {
1655     case OP_CHAR:
1656     case OP_CHARNC:
1657     case OP_EXACT:
1658     case OP_UPTO:
1659     case OP_MINUPTO:
1660 nigel 93 case OP_POSUPTO:
1661 nigel 77 case OP_STAR:
1662     case OP_MINSTAR:
1663 nigel 93 case OP_POSSTAR:
1664 nigel 77 case OP_PLUS:
1665     case OP_MINPLUS:
1666 nigel 93 case OP_POSPLUS:
1667 nigel 77 case OP_QUERY:
1668     case OP_MINQUERY:
1669 nigel 93 case OP_POSQUERY:
1670     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1671 nigel 77 break;
1672     }
1673 ph10 369 #else
1674     (void)(utf8); /* Keep compiler happy by referencing function argument */
1675 ph10 111 #endif
1676 nigel 77 }
1677     }
1678     }
1679    
1680    
1681    
1682     /*************************************************
1683     * Scan compiled regex for recursion reference *
1684     *************************************************/
1685    
1686     /* This little function scans through a compiled pattern until it finds an
1687     instance of OP_RECURSE.
1688    
1689     Arguments:
1690     code points to start of expression
1691     utf8 TRUE in UTF-8 mode
1692    
1693     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1694     */
1695    
1696     static const uschar *
1697     find_recurse(const uschar *code, BOOL utf8)
1698     {
1699     for (;;)
1700     {
1701     register int c = *code;
1702     if (c == OP_END) return NULL;
1703 nigel 91 if (c == OP_RECURSE) return code;
1704 ph10 220
1705 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1706     map. This includes negated single high-valued characters. The length in
1707     the table is zero; the actual length is stored in the compiled code. */
1708    
1709     if (c == OP_XCLASS) code += GET(code, 1);
1710    
1711 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1712     repeated character types, we have to test for \p and \P, which have an extra
1713 ph10 218 two bytes of parameters. */
1714 nigel 91
1715 nigel 77 else
1716     {
1717 ph10 218 switch(c)
1718     {
1719     case OP_TYPESTAR:
1720     case OP_TYPEMINSTAR:
1721     case OP_TYPEPLUS:
1722     case OP_TYPEMINPLUS:
1723     case OP_TYPEQUERY:
1724     case OP_TYPEMINQUERY:
1725     case OP_TYPEPOSSTAR:
1726     case OP_TYPEPOSPLUS:
1727     case OP_TYPEPOSQUERY:
1728     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1729 ph10 220 break;
1730 ph10 221
1731     case OP_TYPEPOSUPTO:
1732     case OP_TYPEUPTO:
1733     case OP_TYPEMINUPTO:
1734     case OP_TYPEEXACT:
1735     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1736     break;
1737 ph10 220 }
1738    
1739 ph10 218 /* Add in the fixed length from the table */
1740    
1741 nigel 77 code += _pcre_OP_lengths[c];
1742 ph10 220
1743 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1744     by a multi-byte character. The length in the table is a minimum, so we have
1745     to arrange to skip the extra bytes. */
1746 ph10 220
1747 ph10 107 #ifdef SUPPORT_UTF8
1748 nigel 77 if (utf8) switch(c)
1749     {
1750     case OP_CHAR:
1751     case OP_CHARNC:
1752     case OP_EXACT:
1753     case OP_UPTO:
1754     case OP_MINUPTO:
1755 nigel 93 case OP_POSUPTO:
1756 nigel 77 case OP_STAR:
1757     case OP_MINSTAR:
1758 nigel 93 case OP_POSSTAR:
1759 nigel 77 case OP_PLUS:
1760     case OP_MINPLUS:
1761 nigel 93 case OP_POSPLUS:
1762 nigel 77 case OP_QUERY:
1763     case OP_MINQUERY:
1764 nigel 93 case OP_POSQUERY:
1765     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1766 nigel 77 break;
1767     }
1768 ph10 369 #else
1769     (void)(utf8); /* Keep compiler happy by referencing function argument */
1770 ph10 111 #endif
1771 nigel 77 }
1772     }
1773     }
1774    
1775    
1776    
1777     /*************************************************
1778     * Scan compiled branch for non-emptiness *
1779     *************************************************/
1780    
1781     /* This function scans through a branch of a compiled pattern to see whether it
1782 nigel 93 can match the empty string or not. It is called from could_be_empty()
1783     below and from compile_branch() when checking for an unlimited repeat of a
1784     group that can match nothing. Note that first_significant_code() skips over
1785 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1786     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1787     bracket whose current branch will already have been scanned.
1788 nigel 77
1789     Arguments:
1790     code points to start of search
1791     endcode points to where to stop
1792     utf8 TRUE if in UTF8 mode
1793 ph10 503 cd contains pointers to tables etc.
1794 nigel 77
1795     Returns: TRUE if what is matched could be empty
1796     */
1797    
1798     static BOOL
1799 ph10 503 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1800     compile_data *cd)
1801 nigel 77 {
1802     register int c;
1803 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1804 nigel 77 code < endcode;
1805     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1806     {
1807     const uschar *ccode;
1808    
1809     c = *code;
1810 ph10 503
1811 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
1812 ph10 282 first_significant_code() with a TRUE final argument. */
1813 ph10 286
1814 ph10 282 if (c == OP_ASSERT)
1815 ph10 286 {
1816 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1817     c = *code;
1818     continue;
1819 ph10 286 }
1820 ph10 172
1821 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1822 nigel 77
1823 ph10 335 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1824 ph10 170 {
1825 ph10 172 code += _pcre_OP_lengths[c];
1826 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1827     c = *code;
1828     continue;
1829     }
1830 ph10 503
1831     /* For a recursion/subroutine call, if its end has been reached, which
1832     implies a subroutine call, we can scan it. */
1833    
1834     if (c == OP_RECURSE)
1835     {
1836 ph10 504 BOOL empty_branch = FALSE;
1837 ph10 503 const uschar *scode = cd->start_code + GET(code, 1);
1838     if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
1839     do
1840     {
1841 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
1842     {
1843     empty_branch = TRUE;
1844     break;
1845     }
1846 ph10 503 scode += GET(scode, 1);
1847     }
1848     while (*scode == OP_ALT);
1849 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
1850 ph10 503 continue;
1851     }
1852 ph10 170
1853     /* For other groups, scan the branches. */
1854 ph10 172
1855 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1856 nigel 77 {
1857     BOOL empty_branch;
1858     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1859 ph10 406
1860     /* If a conditional group has only one branch, there is a second, implied,
1861 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
1862     Otherwise, scan the individual branches of the group. */
1863 ph10 406
1864 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1865 nigel 77 code += GET(code, 1);
1866 ph10 395 else
1867 ph10 406 {
1868 ph10 395 empty_branch = FALSE;
1869     do
1870     {
1871 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1872 ph10 395 empty_branch = TRUE;
1873     code += GET(code, 1);
1874     }
1875     while (*code == OP_ALT);
1876     if (!empty_branch) return FALSE; /* All branches are non-empty */
1877 nigel 77 }
1878 ph10 406
1879 ph10 172 c = *code;
1880 nigel 93 continue;
1881 nigel 77 }
1882    
1883 nigel 93 /* Handle the other opcodes */
1884    
1885     switch (c)
1886 nigel 77 {
1887 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1888     cannot be represented just by a bit map. This includes negated single
1889     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1890 ph10 220 actual length is stored in the compiled code, so we must update "code"
1891 ph10 216 here. */
1892 nigel 77
1893     #ifdef SUPPORT_UTF8
1894     case OP_XCLASS:
1895 ph10 216 ccode = code += GET(code, 1);
1896 nigel 77 goto CHECK_CLASS_REPEAT;
1897     #endif
1898    
1899     case OP_CLASS:
1900     case OP_NCLASS:
1901     ccode = code + 33;
1902    
1903     #ifdef SUPPORT_UTF8
1904     CHECK_CLASS_REPEAT:
1905     #endif
1906    
1907     switch (*ccode)
1908     {
1909     case OP_CRSTAR: /* These could be empty; continue */
1910     case OP_CRMINSTAR:
1911     case OP_CRQUERY:
1912     case OP_CRMINQUERY:
1913     break;
1914    
1915     default: /* Non-repeat => class must match */
1916     case OP_CRPLUS: /* These repeats aren't empty */
1917     case OP_CRMINPLUS:
1918     return FALSE;
1919    
1920     case OP_CRRANGE:
1921     case OP_CRMINRANGE:
1922     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1923     break;
1924     }
1925     break;
1926    
1927     /* Opcodes that must match a character */
1928    
1929     case OP_PROP:
1930     case OP_NOTPROP:
1931     case OP_EXTUNI:
1932     case OP_NOT_DIGIT:
1933     case OP_DIGIT:
1934     case OP_NOT_WHITESPACE:
1935     case OP_WHITESPACE:
1936     case OP_NOT_WORDCHAR:
1937     case OP_WORDCHAR:
1938     case OP_ANY:
1939 ph10 345 case OP_ALLANY:
1940 nigel 77 case OP_ANYBYTE:
1941     case OP_CHAR:
1942     case OP_CHARNC:
1943     case OP_NOT:
1944     case OP_PLUS:
1945     case OP_MINPLUS:
1946 nigel 93 case OP_POSPLUS:
1947 nigel 77 case OP_EXACT:
1948     case OP_NOTPLUS:
1949     case OP_NOTMINPLUS:
1950 nigel 93 case OP_NOTPOSPLUS:
1951 nigel 77 case OP_NOTEXACT:
1952     case OP_TYPEPLUS:
1953     case OP_TYPEMINPLUS:
1954 nigel 93 case OP_TYPEPOSPLUS:
1955 nigel 77 case OP_TYPEEXACT:
1956     return FALSE;
1957 ph10 227
1958     /* These are going to continue, as they may be empty, but we have to
1959     fudge the length for the \p and \P cases. */
1960    
1961 ph10 224 case OP_TYPESTAR:
1962     case OP_TYPEMINSTAR:
1963     case OP_TYPEPOSSTAR:
1964     case OP_TYPEQUERY:
1965     case OP_TYPEMINQUERY:
1966     case OP_TYPEPOSQUERY:
1967     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1968 ph10 227 break;
1969    
1970 ph10 224 /* Same for these */
1971 ph10 227
1972 ph10 224 case OP_TYPEUPTO:
1973     case OP_TYPEMINUPTO:
1974     case OP_TYPEPOSUPTO:
1975     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1976     break;
1977 nigel 77
1978     /* End of branch */
1979    
1980     case OP_KET:
1981     case OP_KETRMAX:
1982     case OP_KETRMIN:
1983     case OP_ALT:
1984     return TRUE;
1985    
1986 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1987     MINUPTO, and POSUPTO may be followed by a multibyte character */
1988 nigel 77
1989     #ifdef SUPPORT_UTF8
1990     case OP_STAR:
1991     case OP_MINSTAR:
1992 nigel 93 case OP_POSSTAR:
1993 nigel 77 case OP_QUERY:
1994     case OP_MINQUERY:
1995 nigel 93 case OP_POSQUERY:
1996 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1997     break;
1998 ph10 461
1999 nigel 77 case OP_UPTO:
2000     case OP_MINUPTO:
2001 nigel 93 case OP_POSUPTO:
2002 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2003 nigel 77 break;
2004     #endif
2005 ph10 503
2006     /* None of the remaining opcodes are required to match a character. */
2007    
2008     default:
2009     break;
2010 nigel 77 }
2011     }
2012    
2013     return TRUE;
2014     }
2015    
2016    
2017    
2018     /*************************************************
2019     * Scan compiled regex for non-emptiness *
2020     *************************************************/
2021    
2022     /* This function is called to check for left recursive calls. We want to check
2023     the current branch of the current pattern to see if it could match the empty
2024     string. If it could, we must look outwards for branches at other levels,
2025     stopping when we pass beyond the bracket which is the subject of the recursion.
2026    
2027     Arguments:
2028     code points to start of the recursion
2029     endcode points to where to stop (current RECURSE item)
2030     bcptr points to the chain of current (unclosed) branch starts
2031     utf8 TRUE if in UTF-8 mode
2032 ph10 503 cd pointers to tables etc
2033 nigel 77
2034     Returns: TRUE if what is matched could be empty
2035     */
2036    
2037     static BOOL
2038     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2039 ph10 503 BOOL utf8, compile_data *cd)
2040 nigel 77 {
2041 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2042 nigel 77 {
2043 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2044 ph10 475 return FALSE;
2045 nigel 77 bcptr = bcptr->outer;
2046     }
2047     return TRUE;
2048     }
2049    
2050    
2051    
2052     /*************************************************
2053     * Check for POSIX class syntax *
2054     *************************************************/
2055    
2056     /* This function is called when the sequence "[:" or "[." or "[=" is
2057 ph10 295 encountered in a character class. It checks whether this is followed by a
2058 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2059 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2060 nigel 77
2061 ph10 298 Originally, this function only recognized a sequence of letters between the
2062     terminators, but it seems that Perl recognizes any sequence of characters,
2063     though of course unknown POSIX names are subsequently rejected. Perl gives an
2064     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2065     didn't consider this to be a POSIX class. Likewise for [:1234:].
2066 ph10 295
2067 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2068     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2069     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2070     below handles the special case of \], but does not try to do any other escape
2071     processing. This makes it different from Perl for cases such as [:l\ower:]
2072 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2073 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2074 ph10 295 I think.
2075    
2076     Arguments:
2077 nigel 77 ptr pointer to the initial [
2078     endptr where to return the end pointer
2079    
2080     Returns: TRUE or FALSE
2081     */
2082    
2083     static BOOL
2084 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
2085 nigel 77 {
2086     int terminator; /* Don't combine these lines; the Solaris cc */
2087     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2088 ph10 295 for (++ptr; *ptr != 0; ptr++)
2089 nigel 77 {
2090 ph10 391 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2091 ph10 298 {
2092 ph10 391 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2093     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2094 ph10 295 {
2095     *endptr = ptr;
2096     return TRUE;
2097 ph10 298 }
2098     }
2099     }
2100 nigel 77 return FALSE;
2101     }
2102    
2103    
2104    
2105    
2106     /*************************************************
2107     * Check POSIX class name *
2108     *************************************************/
2109    
2110     /* This function is called to check the name given in a POSIX-style class entry
2111     such as [:alnum:].
2112    
2113     Arguments:
2114     ptr points to the first letter
2115     len the length of the name
2116    
2117     Returns: a value representing the name, or -1 if unknown
2118     */
2119    
2120     static int
2121     check_posix_name(const uschar *ptr, int len)
2122     {
2123 ph10 240 const char *pn = posix_names;
2124 nigel 77 register int yield = 0;
2125     while (posix_name_lengths[yield] != 0)
2126     {
2127     if (len == posix_name_lengths[yield] &&
2128 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2129 ph10 243 pn += posix_name_lengths[yield] + 1;
2130 nigel 77 yield++;
2131     }
2132     return -1;
2133     }
2134    
2135    
2136     /*************************************************
2137     * Adjust OP_RECURSE items in repeated group *
2138     *************************************************/
2139    
2140     /* OP_RECURSE items contain an offset from the start of the regex to the group
2141     that is referenced. This means that groups can be replicated for fixed
2142     repetition simply by copying (because the recursion is allowed to refer to
2143     earlier groups that are outside the current group). However, when a group is
2144 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2145     inserted before it, after it has been compiled. This means that any OP_RECURSE
2146     items within it that refer to the group itself or any contained groups have to
2147     have their offsets adjusted. That one of the jobs of this function. Before it
2148     is called, the partially compiled regex must be temporarily terminated with
2149     OP_END.
2150 nigel 77
2151 nigel 93 This function has been extended with the possibility of forward references for
2152     recursions and subroutine calls. It must also check the list of such references
2153     for the group we are dealing with. If it finds that one of the recursions in
2154     the current group is on this list, it adjusts the offset in the list, not the
2155     value in the reference (which is a group number).
2156    
2157 nigel 77 Arguments:
2158     group points to the start of the group
2159     adjust the amount by which the group is to be moved
2160     utf8 TRUE in UTF-8 mode
2161     cd contains pointers to tables etc.
2162 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2163 nigel 77
2164     Returns: nothing
2165     */
2166    
2167     static void
2168 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2169     uschar *save_hwm)
2170 nigel 77 {
2171     uschar *ptr = group;
2172 ph10 224
2173 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2174     {
2175 nigel 93 int offset;
2176     uschar *hc;
2177    
2178     /* See if this recursion is on the forward reference list. If so, adjust the
2179     reference. */
2180 ph10 345
2181 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2182     {
2183     offset = GET(hc, 0);
2184     if (cd->start_code + offset == ptr + 1)
2185     {
2186     PUT(hc, 0, offset + adjust);
2187     break;
2188     }
2189     }
2190    
2191     /* Otherwise, adjust the recursion offset if it's after the start of this
2192     group. */
2193    
2194     if (hc >= cd->hwm)
2195     {
2196     offset = GET(ptr, 1);
2197     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2198     }
2199    
2200 nigel 77 ptr += 1 + LINK_SIZE;
2201     }
2202     }
2203    
2204    
2205    
2206     /*************************************************
2207     * Insert an automatic callout point *
2208     *************************************************/
2209    
2210     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2211     callout points before each pattern item.
2212    
2213     Arguments:
2214     code current code pointer
2215     ptr current pattern pointer
2216     cd pointers to tables etc
2217    
2218     Returns: new code pointer
2219     */
2220    
2221     static uschar *
2222     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2223     {
2224     *code++ = OP_CALLOUT;
2225     *code++ = 255;
2226     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2227     PUT(code, LINK_SIZE, 0); /* Default length */
2228     return code + 2*LINK_SIZE;
2229     }
2230    
2231    
2232    
2233     /*************************************************
2234     * Complete a callout item *
2235     *************************************************/
2236    
2237     /* A callout item contains the length of the next item in the pattern, which
2238     we can't fill in till after we have reached the relevant point. This is used
2239     for both automatic and manual callouts.
2240    
2241     Arguments:
2242     previous_callout points to previous callout item
2243     ptr current pattern pointer
2244     cd pointers to tables etc
2245    
2246     Returns: nothing
2247     */
2248    
2249     static void
2250     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2251     {
2252     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2253     PUT(previous_callout, 2 + LINK_SIZE, length);
2254     }
2255    
2256    
2257    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address so that a following call carries on from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int ch = *cptr;
unsigned int range_start = 0;
unsigned int expected;

/* Find the first character whose other case differs from itself. */

while (ch <= d)
  {
  range_start = UCD_OTHERCASE(ch);
  if (range_start != ch) break;
  ch++;
  }

if (ch > d) return FALSE;

*ocptr = range_start;
expected = range_start + 1;

/* Extend the range for as long as the other-case values stay consecutive. */

ch++;
while (ch <= d && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;
*cptr = ch;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2303    
2304    
2305 nigel 93
2306 nigel 77 /*************************************************
2307 nigel 93 * Check if auto-possessifying is possible *
2308     *************************************************/
2309    
2310     /* This function is called for unlimited repeats of certain items, to see
2311     whether the next thing could possibly match the repeated item. If not, it makes
2312     sense to automatically possessify the repeated item.
2313    
2314     Arguments:
2315     op_code the repeated op code
2316     this data for this item, depends on the opcode
2317     utf8 TRUE in UTF-8 mode
2318     utf8_char used for utf8 character bytes, NULL if not relevant
2319     ptr next character in pattern
2320     options options bits
2321     cd contains pointers to tables etc.
2322    
2323     Returns: TRUE if possessifying is wanted
2324     */
2325    
2326     static BOOL
2327     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2328     const uschar *ptr, int options, compile_data *cd)
2329     {
2330     int next;
2331    
2332     /* Skip whitespace and comments in extended mode */
2333    
2334     if ((options & PCRE_EXTENDED) != 0)
2335     {
2336     for (;;)
2337     {
2338     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2339 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2340 nigel 93 {
2341     while (*(++ptr) != 0)
2342     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2343     }
2344     else break;
2345     }
2346     }
2347    
2348     /* If the next item is one that we can handle, get its value. A non-negative
2349     value is a character, a negative value is an escape value. */
2350    
2351 ph10 391 if (*ptr == CHAR_BACKSLASH)
2352 nigel 93 {
2353     int temperrorcode = 0;
2354     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2355     if (temperrorcode != 0) return FALSE;
2356     ptr++; /* Point after the escape sequence */
2357     }
2358    
2359     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2360     {
2361     #ifdef SUPPORT_UTF8
2362     if (utf8) { GETCHARINC(next, ptr); } else
2363     #endif
2364     next = *ptr++;
2365     }
2366    
2367     else return FALSE;
2368    
2369     /* Skip whitespace and comments in extended mode */
2370    
2371     if ((options & PCRE_EXTENDED) != 0)
2372     {
2373     for (;;)
2374     {
2375     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2376 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2377 nigel 93 {
2378     while (*(++ptr) != 0)
2379     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2380     }
2381     else break;
2382     }
2383     }
2384    
2385     /* If the next thing is itself optional, we have to give up. */
2386    
2387 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2388 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2389     return FALSE;
2390 nigel 93
2391     /* Now compare the next item with the previous opcode. If the previous is a
2392     positive single character match, "item" either contains the character or, if
2393     "item" is greater than 127 in utf8 mode, the character's bytes are in
2394     utf8_char. */
2395    
2396    
2397     /* Handle cases when the next item is a character. */
2398    
2399     if (next >= 0) switch(op_code)
2400     {
2401     case OP_CHAR:
2402     #ifdef SUPPORT_UTF8
2403     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2404 ph10 369 #else
2405     (void)(utf8_char); /* Keep compiler happy by referencing function argument */
2406 nigel 93 #endif
2407     return item != next;
2408    
2409     /* For CHARNC (caseless character) we must check the other case. If we have
2410     Unicode property support, we can use it to test the other case of
2411     high-valued characters. */
2412    
2413     case OP_CHARNC:
2414     #ifdef SUPPORT_UTF8
2415     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2416     #endif
2417     if (item == next) return FALSE;
2418     #ifdef SUPPORT_UTF8
2419     if (utf8)
2420     {
2421     unsigned int othercase;
2422     if (next < 128) othercase = cd->fcc[next]; else
2423     #ifdef SUPPORT_UCP
2424 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2425 nigel 93 #else
2426     othercase = NOTACHAR;
2427     #endif
2428     return (unsigned int)item != othercase;
2429     }
2430     else
2431     #endif /* SUPPORT_UTF8 */
2432     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2433    
2434     /* For OP_NOT, "item" must be a single-byte character. */
2435    
2436     case OP_NOT:
2437     if (item == next) return TRUE;
2438     if ((options & PCRE_CASELESS) == 0) return FALSE;
2439     #ifdef SUPPORT_UTF8
2440     if (utf8)
2441     {
2442     unsigned int othercase;
2443     if (next < 128) othercase = cd->fcc[next]; else
2444     #ifdef SUPPORT_UCP
2445 ph10 349 othercase = UCD_OTHERCASE(next);
2446 nigel 93 #else
2447     othercase = NOTACHAR;
2448     #endif
2449     return (unsigned int)item == othercase;
2450     }
2451     else
2452     #endif /* SUPPORT_UTF8 */
2453     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2454    
2455     case OP_DIGIT:
2456     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2457    
2458     case OP_NOT_DIGIT:
2459     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2460    
2461     case OP_WHITESPACE:
2462     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2463    
2464     case OP_NOT_WHITESPACE:
2465     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2466    
2467     case OP_WORDCHAR:
2468     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2469    
2470     case OP_NOT_WORDCHAR:
2471     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2472    
2473 ph10 180 case OP_HSPACE:
2474     case OP_NOT_HSPACE:
2475     switch(next)
2476     {
2477     case 0x09:
2478     case 0x20:
2479     case 0xa0:
2480     case 0x1680:
2481     case 0x180e:
2482     case 0x2000:
2483     case 0x2001:
2484     case 0x2002:
2485     case 0x2003:
2486     case 0x2004:
2487     case 0x2005:
2488     case 0x2006:
2489     case 0x2007:
2490     case 0x2008:
2491     case 0x2009:
2492     case 0x200A:
2493     case 0x202f:
2494     case 0x205f:
2495     case 0x3000:
2496     return op_code != OP_HSPACE;
2497     default:
2498     return op_code == OP_HSPACE;
2499     }
2500    
2501     case OP_VSPACE:
2502     case OP_NOT_VSPACE:
2503     switch(next)
2504     {
2505     case 0x0a:
2506     case 0x0b:
2507     case 0x0c:
2508     case 0x0d:
2509     case 0x85:
2510     case 0x2028:
2511     case 0x2029:
2512     return op_code != OP_VSPACE;
2513     default:
2514     return op_code == OP_VSPACE;
2515     }
2516    
2517 nigel 93 default:
2518     return FALSE;
2519     }
2520    
2521    
2522     /* Handle the case when the next item is \d, \s, etc. */
2523    
2524     switch(op_code)
2525     {
2526     case OP_CHAR:
2527     case OP_CHARNC:
2528     #ifdef SUPPORT_UTF8
2529     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2530     #endif
2531     switch(-next)
2532     {
2533     case ESC_d:
2534     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2535    
2536     case ESC_D:
2537     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2538    
2539     case ESC_s:
2540     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2541    
2542     case ESC_S:
2543     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2544    
2545     case ESC_w:
2546     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2547    
2548     case ESC_W:
2549     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2550 ph10 182
2551 ph10 180 case ESC_h:
2552     case ESC_H:
2553     switch(item)
2554     {
2555     case 0x09:
2556     case 0x20:
2557     case 0xa0:
2558     case 0x1680:
2559     case 0x180e:
2560     case 0x2000:
2561     case 0x2001:
2562     case 0x2002:
2563     case 0x2003:
2564     case 0x2004:
2565     case 0x2005:
2566     case 0x2006:
2567     case 0x2007:
2568     case 0x2008:
2569     case 0x2009:
2570     case 0x200A:
2571     case 0x202f:
2572     case 0x205f:
2573     case 0x3000:
2574     return -next != ESC_h;
2575     default:
2576     return -next == ESC_h;
2577 ph10 182 }
2578    
2579 ph10 180 case ESC_v:
2580     case ESC_V:
2581     switch(item)
2582     {
2583     case 0x0a:
2584     case 0x0b:
2585     case 0x0c:
2586     case 0x0d:
2587     case 0x85:
2588     case 0x2028:
2589     case 0x2029:
2590     return -next != ESC_v;
2591     default:
2592     return -next == ESC_v;
2593 ph10 182 }
2594 nigel 93
2595     default:
2596     return FALSE;
2597     }
2598    
2599     case OP_DIGIT:
2600 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2601     next == -ESC_h || next == -ESC_v;
2602 nigel 93
2603     case OP_NOT_DIGIT:
2604     return next == -ESC_d;
2605    
2606     case OP_WHITESPACE:
2607     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2608    
2609     case OP_NOT_WHITESPACE:
2610 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2611 nigel 93
2612 ph10 180 case OP_HSPACE:
2613     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2614    
2615     case OP_NOT_HSPACE:
2616     return next == -ESC_h;
2617 ph10 182
2618 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2619 ph10 182 case OP_VSPACE:
2620 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2621    
2622     case OP_NOT_VSPACE:
2623 ph10 182 return next == -ESC_v;
2624 ph10 180
2625 nigel 93 case OP_WORDCHAR:
2626 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2627 nigel 93
2628     case OP_NOT_WORDCHAR:
2629     return next == -ESC_w || next == -ESC_d;
2630 ph10 182
2631 nigel 93 default:
2632     return FALSE;
2633     }
2634    
2635     /* Control does not reach here */
2636     }
2637    
2638    
2639    
2640     /*************************************************
2641 nigel 77 * Compile one branch *
2642     *************************************************/
2643    
2644 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2645 nigel 77 changed during the branch, the pointer is used to change the external options
2646 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2647     to find out the amount of memory needed, as well as during the real compile
2648     phase. The value of lengthptr distinguishes the two phases.
2649 nigel 77
2650     Arguments:
2651     optionsptr pointer to the option bits
2652     codeptr points to the pointer to the current code point
2653     ptrptr points to the current pattern pointer
2654     errorcodeptr points to error code variable
2655     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2656     reqbyteptr set to the last literal character required, else < 0
2657     bcptr points to current branch chain
2658     cd contains pointers to tables etc.
2659 nigel 93 lengthptr NULL during the real compile phase
2660     points to length accumulator during pre-compile phase
2661 nigel 77
2662     Returns: TRUE on success
2663     FALSE, with *errorcodeptr set non-zero on error
2664     */
2665    
2666     static BOOL
2667 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2668     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2669     compile_data *cd, int *lengthptr)
2670 nigel 77 {
2671     int repeat_type, op_type;
2672     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2673     int bravalue = 0;
2674     int greedy_default, greedy_non_default;
2675     int firstbyte, reqbyte;
2676     int zeroreqbyte, zerofirstbyte;
2677     int req_caseopt, reqvary, tempreqvary;
2678     int options = *optionsptr;
2679     int after_manual_callout = 0;
2680 nigel 93 int length_prevgroup = 0;
2681 nigel 77 register int c;
2682     register uschar *code = *codeptr;
2683 nigel 93 uschar *last_code = code;
2684     uschar *orig_code = code;
2685 nigel 77 uschar *tempcode;
2686     BOOL inescq = FALSE;
2687     BOOL groupsetfirstbyte = FALSE;
2688     const uschar *ptr = *ptrptr;
2689     const uschar *tempptr;
2690     uschar *previous = NULL;
2691     uschar *previous_callout = NULL;
2692 nigel 93 uschar *save_hwm = NULL;
2693 nigel 77 uschar classbits[32];
2694    
2695     #ifdef SUPPORT_UTF8
2696     BOOL class_utf8;
2697     BOOL utf8 = (options & PCRE_UTF8) != 0;
2698     uschar *class_utf8data;
2699 ph10 300 uschar *class_utf8data_base;
2700 nigel 77 uschar utf8_char[6];
2701     #else
2702     BOOL utf8 = FALSE;
2703 nigel 93 uschar *utf8_char = NULL;
2704 nigel 77 #endif
2705    
2706 ph10 475 #ifdef PCRE_DEBUG
2707 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2708     #endif
2709    
2710 nigel 77 /* Set up the default and non-default settings for greediness */
2711    
2712     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2713     greedy_non_default = greedy_default ^ 1;
2714    
2715     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2716     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2717     matches a non-fixed first char; reqbyte just remains unset if we never
2718     find one.
2719    
2720     When we hit a repeat whose minimum is zero, we may have to adjust these values
2721     to take the zero repeat into account. This is implemented by setting them to
2722     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2723     item types that can be repeated set these backoff variables appropriately. */
2724    
2725     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2726    
2727     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2728     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2729     value > 255. It is added into the firstbyte or reqbyte variables to record the
2730     case status of the value. This is used only for ASCII characters. */
2731    
2732     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2733    
2734     /* Switch on next character until the end of the branch */
2735    
2736     for (;; ptr++)
2737     {
2738     BOOL negate_class;
2739 ph10 286 BOOL should_flip_negation;
2740 nigel 77 BOOL possessive_quantifier;
2741     BOOL is_quantifier;
2742 nigel 93 BOOL is_recurse;
2743 ph10 180 BOOL reset_bracount;
2744 nigel 77 int class_charcount;
2745     int class_lastchar;
2746     int newoptions;
2747     int recno;
2748 ph10 172 int refsign;
2749 nigel 77 int skipbytes;
2750     int subreqbyte;
2751     int subfirstbyte;
2752 nigel 93 int terminator;
2753 nigel 77 int mclength;
2754     uschar mcbuffer[8];
2755    
2756 nigel 93 /* Get next byte in the pattern */
2757 nigel 77
2758     c = *ptr;
2759 ph10 345
2760 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2761     previous cycle of this loop. */
2762    
2763     if (lengthptr != NULL)
2764     {
2765 ph10 475 #ifdef PCRE_DEBUG
2766 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
2767     #endif
2768 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
2769 nigel 93 {
2770     *errorcodeptr = ERR52;
2771     goto FAILED;
2772     }
2773    
2774     /* There is at least one situation where code goes backwards: this is the
2775     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2776     the class is simply eliminated. However, it is created first, so we have to
2777     allow memory for it. Therefore, don't ever reduce the length at this point.
2778     */
2779    
2780     if (code < last_code) code = last_code;
2781 ph10 202
2782     /* Paranoid check for integer overflow */
2783    
2784     if (OFLOW_MAX - *lengthptr < code - last_code)
2785     {
2786     *errorcodeptr = ERR20;
2787     goto FAILED;
2788     }
2789    
2790 nigel 93 *lengthptr += code - last_code;
2791     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2792    
2793     /* If "previous" is set and it is not at the start of the work space, move
2794     it back to there, in order to avoid filling up the work space. Otherwise,
2795     if "previous" is NULL, reset the current code pointer to the start. */
2796    
2797     if (previous != NULL)
2798     {
2799     if (previous > orig_code)
2800     {
2801     memmove(orig_code, previous, code - previous);
2802     code -= previous - orig_code;
2803     previous = orig_code;
2804     }
2805     }
2806     else code = orig_code;
2807    
2808     /* Remember where this code item starts so we can pick up the length
2809     next time round. */
2810    
2811     last_code = code;
2812     }
2813    
2814     /* In the real compile phase, just check the workspace used by the forward
2815     reference list. */
2816    
2817 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2818 nigel 93 {
2819     *errorcodeptr = ERR52;
2820     goto FAILED;
2821     }
2822    
2823 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2824    
2825     if (inescq && c != 0)
2826     {
2827 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2828 nigel 77 {
2829     inescq = FALSE;
2830     ptr++;
2831     continue;
2832     }
2833     else
2834     {
2835     if (previous_callout != NULL)
2836     {
2837 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2838     complete_callout(previous_callout, ptr, cd);
2839 nigel 77 previous_callout = NULL;
2840     }
2841     if ((options & PCRE_AUTO_CALLOUT) != 0)
2842     {
2843     previous_callout = code;
2844     code = auto_callout(code, ptr, cd);
2845     }
2846     goto NORMAL_CHAR;
2847     }
2848     }
2849    
2850     /* Fill in length of a previous callout, except when the next thing is
2851     a quantifier. */
2852    
2853 ph10 392 is_quantifier =
2854 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2855     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2856 nigel 77
2857     if (!is_quantifier && previous_callout != NULL &&
2858     after_manual_callout-- <= 0)
2859     {
2860 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2861     complete_callout(previous_callout, ptr, cd);
2862 nigel 77 previous_callout = NULL;
2863     }
2864    
2865     /* In extended mode, skip white space and comments */
2866    
2867     if ((options & PCRE_EXTENDED) != 0)
2868     {
2869     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2870 ph10 391 if (c == CHAR_NUMBER_SIGN)
2871 nigel 77 {
2872 nigel 93 while (*(++ptr) != 0)
2873 nigel 91 {
2874 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2875 nigel 91 }
2876 nigel 93 if (*ptr != 0) continue;
2877    
2878 nigel 91 /* Else fall through to handle end of string */
2879     c = 0;
2880 nigel 77 }
2881     }
2882    
2883     /* No auto callout for quantifiers. */
2884    
2885     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2886     {
2887     previous_callout = code;
2888     code = auto_callout(code, ptr, cd);
2889     }
2890    
2891     switch(c)
2892     {
2893 nigel 93 /* ===================================================================*/
2894     case 0: /* The branch terminates at string end */
2895 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
2896     case CHAR_RIGHT_PARENTHESIS:
2897 nigel 77 *firstbyteptr = firstbyte;
2898     *reqbyteptr = reqbyte;
2899     *codeptr = code;
2900     *ptrptr = ptr;
2901 nigel 93 if (lengthptr != NULL)
2902     {
2903 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2904     {
2905     *errorcodeptr = ERR20;
2906     goto FAILED;
2907     }
2908 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2909     DPRINTF((">> end branch\n"));
2910     }
2911 nigel 77 return TRUE;
2912    
2913 nigel 93
2914     /* ===================================================================*/
2915 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2916     the setting of any following char as a first character. */
2917    
2918 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
2919 nigel 77 if ((options & PCRE_MULTILINE) != 0)
2920     {
2921     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2922     }
2923     previous = NULL;
2924     *code++ = OP_CIRC;
2925     break;
2926    
2927 ph10 391 case CHAR_DOLLAR_SIGN:
2928 nigel 77 previous = NULL;
2929     *code++ = OP_DOLL;
2930     break;
2931    
2932     /* There can never be a first char if '.' is first, whatever happens about
2933     repeats. The value of reqbyte doesn't change either. */
2934    
2935 ph10 391 case CHAR_DOT:
2936 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2937     zerofirstbyte = firstbyte;
2938     zeroreqbyte = reqbyte;
2939     previous = code;
2940 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2941 nigel 77 break;
2942    
2943 nigel 93
2944     /* ===================================================================*/
2945 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2946     32-byte bitmap of the permitted characters, except in the special case
2947     where there is only one such character. For negated classes, we build the
2948     map as usual, then invert it at the end. However, we use a different opcode
2949     so that data characters > 255 can be handled correctly.
2950 nigel 77
2951     If the class contains characters outside the 0-255 range, a different
2952     opcode is compiled. It may optionally have a bit map for characters < 256,
2953     but those above are explicitly listed afterwards. A flag byte tells
2954     whether the bitmap is present, and whether this is a negated class or not.
2955 ph10 345
2956 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
2957     default (Perl) mode, it is treated as a data character. */
2958 ph10 345
2959 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
2960 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2961     {
2962     *errorcodeptr = ERR64;
2963 ph10 345 goto FAILED;
2964 ph10 336 }
2965 ph10 345 goto NORMAL_CHAR;
2966 nigel 77
2967 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
2968 nigel 77 previous = code;
2969    
2970     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2971     they are encountered at the top level, so we'll do that too. */
2972    
2973 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2974 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
2975 ph10 295 check_posix_syntax(ptr, &tempptr))
2976 nigel 77 {
2977 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2978 nigel 77 goto FAILED;
2979     }
2980    
2981 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2982 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2983 ph10 205 skip them too. This makes for compatibility with Perl. */
2984 ph10 208
2985 ph10 205 negate_class = FALSE;
2986     for (;;)
2987 nigel 77 {
2988     c = *(++ptr);
2989 ph10 391 if (c == CHAR_BACKSLASH)
2990 ph10 205 {
2991 ph10 392 if (ptr[1] == CHAR_E)
2992 ph10 391 ptr++;
2993 ph10 392 else if (strncmp((const char *)ptr+1,
2994     STR_Q STR_BACKSLASH STR_E, 3) == 0)
2995 ph10 391 ptr += 3;
2996 ph10 392 else
2997 ph10 391 break;
2998 ph10 205 }
2999 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3000 ph10 205 negate_class = TRUE;
3001     else break;
3002 ph10 208 }
3003 ph10 345
3004     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3005     an initial ']' is taken as a data character -- the code below handles
3006 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3007     [^] must match any character, so generate OP_ALLANY. */
3008 ph10 345
3009 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3010 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3011 ph10 341 {
3012     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3013     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3014     zerofirstbyte = firstbyte;
3015     break;
3016 ph10 345 }
3017 nigel 77
3018 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3019     negation flag at the end, so that support for characters > 255 works
3020 ph10 264 correctly (they are all included in the class). */
3021    
3022     should_flip_negation = FALSE;
3023    
3024 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3025 nigel 93 of just a single character (as long as it's < 256). However, for higher
3026     valued UTF-8 characters, we don't yet do any optimization. */
3027 nigel 77
3028     class_charcount = 0;
3029     class_lastchar = -1;
3030    
3031 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3032     temporary bit of memory, in case the class contains only 1 character (less
3033     than 256), because in that case the compiled code doesn't use the bit map.
3034     */
3035    
3036     memset(classbits, 0, 32 * sizeof(uschar));
3037    
3038 nigel 77 #ifdef SUPPORT_UTF8
3039     class_utf8 = FALSE; /* No chars >= 256 */
3040 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3041 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3042 nigel 77 #endif
3043    
3044     /* Process characters until ] is reached. By writing this as a "do" it
3045 nigel 93 means that an initial ] is taken as a data character. At the start of the
3046     loop, c contains the first byte of the character. */
3047 nigel 77
3048 nigel 93 if (c != 0) do
3049 nigel 77 {
3050 nigel 93 const uschar *oldptr;
3051    
3052 nigel 77 #ifdef SUPPORT_UTF8
3053     if (utf8 && c > 127)
3054     { /* Braces are required because the */
3055     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3056     }
3057 ph10 309
3058 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3059 ph10 309 data and reset the pointer. This is so that very large classes that
3060 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3061 ph10 309 (which is on the stack). */
3062    
3063 ph10 300 if (lengthptr != NULL)
3064     {
3065     *lengthptr += class_utf8data - class_utf8data_base;
3066 ph10 309 class_utf8data = class_utf8data_base;
3067     }
3068    
3069 nigel 77 #endif
3070    
3071     /* Inside \Q...\E everything is literal except \E */
3072    
3073     if (inescq)
3074     {
3075 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3076 nigel 77 {
3077 nigel 93 inescq = FALSE; /* Reset literal state */
3078     ptr++; /* Skip the 'E' */
3079     continue; /* Carry on with next */
3080 nigel 77 }
3081 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3082 nigel 77 }
3083    
3084     /* Handle POSIX class names. Perl allows a negation extension of the
3085     form [:^name:]. A square bracket that doesn't match the syntax is
3086     treated as a literal. We also recognize the POSIX constructions
3087     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3088     5.6 and 5.8 do. */
3089    
3090 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3091 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3092 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3093 nigel 77 {
3094     BOOL local_negate = FALSE;
3095 nigel 87 int posix_class, taboffset, tabopt;
3096 nigel 77 register const uschar *cbits = cd->cbits;
3097 nigel 87 uschar pbits[32];
3098 nigel 77
3099 ph10 391 if (ptr[1] != CHAR_COLON)
3100 nigel 77 {
3101     *errorcodeptr = ERR31;
3102     goto FAILED;
3103     }
3104    
3105     ptr += 2;
3106 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3107 nigel 77 {
3108     local_negate = TRUE;
3109 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3110 nigel 77 ptr++;
3111     }
3112    
3113     posix_class = check_posix_name(ptr, tempptr - ptr);
3114     if (posix_class < 0)
3115     {
3116     *errorcodeptr = ERR30;
3117     goto FAILED;
3118     }
3119    
3120     /* If matching is caseless, upper and lower are converted to
3121     alpha. This relies on the fact that the class table starts with
3122     alpha, lower, upper as the first 3 entries. */
3123    
3124     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3125     posix_class = 0;
3126    
3127 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
3128     because we may be adding and subtracting from it, and we don't want to
3129     subtract bits that may be in the main map already. At the end we or the
3130     result into the bit map that is being built. */
3131 nigel 77
3132     posix_class *= 3;
3133 nigel 87
3134     /* Copy in the first table (always present) */
3135    
3136     memcpy(pbits, cbits + posix_class_maps[posix_class],
3137     32 * sizeof(uschar));
3138    
3139     /* If there is a second table, add or remove it as required. */
3140    
3141     taboffset = posix_class_maps[posix_class + 1];
3142     tabopt = posix_class_maps[posix_class + 2];
3143    
3144     if (taboffset >= 0)
3145 nigel 77 {
3146 nigel 87 if (tabopt >= 0)
3147     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3148 nigel 77 else
3149 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3150 nigel 77 }
3151    
3152 nigel 87 /* Now see if we need to remove any special characters. An option
3153     value of 1 removes vertical space and 2 removes underscore. */
3154    
3155     if (tabopt < 0) tabopt = -tabopt;
3156     if (tabopt == 1) pbits[1] &= ~0x3c;
3157     else if (tabopt == 2) pbits[11] &= 0x7f;
3158    
3159     /* Add the POSIX table or its complement into the main table that is
3160     being built and we are done. */
3161    
3162     if (local_negate)
3163     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3164     else
3165     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3166    
3167 nigel 77 ptr = tempptr + 1;
3168     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3169     continue; /* End of POSIX syntax handling */
3170     }
3171    
3172     /* Backslash may introduce a single character, or it may introduce one
3173 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3174     case. Inside a class (and only there) it is treated as backspace.
3175     Elsewhere it marks a word boundary. Other escapes have preset maps ready
3176 ph10 205 to 'or' into the one we are building. We assume they have more than one
3177 nigel 77 character in them, so set class_charcount bigger than one. */
3178    
3179 ph10 391 if (c == CHAR_BACKSLASH)
3180 nigel 77 {
3181 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3182     if (*errorcodeptr != 0) goto FAILED;
3183 nigel 77
3184 ph10 391 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3185     else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */
3186     else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */
3187 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3188     {
3189 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3190 nigel 77 {
3191     ptr += 2; /* avoid empty string */
3192     }
3193     else inescq = TRUE;
3194     continue;
3195     }
3196 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3197 nigel 77
3198     if (c < 0)
3199     {
3200     register const uschar *cbits = cd->cbits;
3201     class_charcount += 2; /* Greater than 1 is what matters */
3202 nigel 93
3203     /* Save time by not doing this in the pre-compile phase. */
3204    
3205     if (lengthptr == NULL) switch (-c)
3206 nigel 77 {
3207     case ESC_d:
3208     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3209     continue;
3210    
3211     case ESC_D:
3212 ph10 286 should_flip_negation = TRUE;
3213 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3214     continue;
3215    
3216     case ESC_w:
3217     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3218     continue;
3219    
3220     case ESC_W:
3221 ph10 286 should_flip_negation = TRUE;
3222 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3223     continue;
3224    
3225     case ESC_s:
3226     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3227     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3228     continue;
3229    
3230     case ESC_S:
3231 ph10 286 should_flip_negation = TRUE;
3232 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3233     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3234     continue;
3235    
3236 nigel 93 default: /* Not recognized; fall through */
3237     break; /* Need "default" setting to stop compiler warning. */
3238     }
3239    
3240     /* In the pre-compile phase, just do the recognition. */
3241    
3242     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3243     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3244 ph10 180
3245 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
3246     they use extra memory. */
3247 ph10 180
3248 ph10 178 if (-c == ESC_h)
3249     {
3250     SETBIT(classbits, 0x09); /* VT */
3251     SETBIT(classbits, 0x20); /* SPACE */
3252 ph10 180 SETBIT(classbits, 0xa0); /* NSBP */
3253 ph10 178 #ifdef SUPPORT_UTF8
3254     if (utf8)
3255 ph10 180 {
3256 ph10 178 class_utf8 = TRUE;
3257     *class_utf8data++ = XCL_SINGLE;
3258 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3259 ph10 178 *class_utf8data++ = XCL_SINGLE;
3260 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3261     *class_utf8data++ = XCL_RANGE;
3262     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3263     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3264 ph10 178 *class_utf8data++ = XCL_SINGLE;
3265 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3266 ph10 178 *class_utf8data++ = XCL_SINGLE;
3267 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3268 ph10 178 *class_utf8data++ = XCL_SINGLE;
3269 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3270     }
3271     #endif
3272     continue;
3273     }
3274 nigel 93
3275 ph10 178 if (-c == ESC_H)
3276     {
3277     for (c = 0; c < 32; c++)
3278     {
3279     int x = 0xff;
3280     switch (c)
3281 ph10 180 {
3282 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
3283     case 0x20/8: x ^= 1 << (0x20%8); break;
3284     case 0xa0/8: x ^= 1 << (0xa0%8); break;
3285     default: break;
3286     }
3287     classbits[c] |= x;
3288 ph10 180 }
3289    
3290 ph10 178 #ifdef SUPPORT_UTF8
3291     if (utf8)
3292 ph10 180 {
3293 ph10 178 class_utf8 = TRUE;
3294 ph10 180 *class_utf8data++ = XCL_RANGE;
3295     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3296     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3297     *class_utf8data++ = XCL_RANGE;
3298     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3299     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3300     *class_utf8data++ = XCL_RANGE;
3301     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3302     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3303     *class_utf8data++ = XCL_RANGE;
3304     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3305     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3306     *class_utf8data++ = XCL_RANGE;
3307     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3308     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3309     *class_utf8data++ = XCL_RANGE;
3310     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3311     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3312     *class_utf8data++ = XCL_RANGE;
3313     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3314     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3315     }
3316     #endif
3317     continue;
3318     }
3319 ph10 178
3320     if (-c == ESC_v)
3321     {
3322     SETBIT(classbits, 0x0a); /* LF */
3323     SETBIT(classbits, 0x0b); /* VT */
3324 ph10 180 SETBIT(classbits, 0x0c); /* FF */
3325     SETBIT(classbits, 0x0d); /* CR */
3326     SETBIT(classbits, 0x85); /* NEL */
3327 ph10 178 #ifdef SUPPORT_UTF8
3328     if (utf8)
3329 ph10 180 {
3330 ph10 178 class_utf8 = TRUE;
3331 ph10 180 *class_utf8data++ = XCL_RANGE;
3332     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3333     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3334     }
3335     #endif
3336     continue;
3337     }
3338 ph10 178
3339     if (-c == ESC_V)
3340     {
3341     for (c = 0; c < 32; c++)
3342     {
3343     int x = 0xff;
3344     switch (c)
3345 ph10 180 {
3346 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3347     x ^= 1 << (0x0b%8);
3348     x ^= 1 << (0x0c%8);
3349 ph10 180 x ^= 1 << (0x0d%8);
3350 ph10 178 break;
3351     case 0x85/8: x ^= 1 << (0x85%8); break;
3352     default: break;
3353     }
3354     classbits[c] |= x;
3355 ph10 180 }
3356    
3357 ph10 178 #ifdef SUPPORT_UTF8
3358     if (utf8)
3359 ph10 180 {
3360 ph10 178 class_utf8 = TRUE;
3361 ph10 180 *class_utf8data++ = XCL_RANGE;
3362     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3363     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3364     *class_utf8data++ = XCL_RANGE;
3365     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3366     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3367     }
3368     #endif
3369     continue;
3370     }
3371 ph10 178
3372 nigel 93 /* We need to deal with \P and \p in both phases. */
3373    
3374 nigel 77 #ifdef SUPPORT_UCP
3375 nigel 93 if (-c == ESC_p || -c == ESC_P)
3376     {
3377     BOOL negated;
3378     int pdata;
3379     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3380     if (ptype < 0) goto FAILED;
3381     class_utf8 = TRUE;
3382     *class_utf8data++ = ((-c == ESC_p) != negated)?
3383     XCL_PROP : XCL_NOTPROP;
3384     *class_utf8data++ = ptype;
3385     *class_utf8data++ = pdata;
3386     class_charcount -= 2; /* Not a < 256 character */
3387 nigel 77 continue;
3388 nigel 93 }
3389 nigel 77 #endif
3390 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3391     strict mode. By default, for compatibility with Perl, they are
3392     treated as literals. */
3393 nigel 77
3394 nigel 93 if ((options & PCRE_EXTRA) != 0)
3395     {
3396     *errorcodeptr = ERR7;
3397     goto FAILED;
3398     }
3399 nigel 77
3400 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3401     c = *ptr; /* Get the final character and fall through */
3402 nigel 77 }
3403    
3404     /* Fall through if we have a single character (c >= 0). This may be
3405 nigel 93 greater than 256 in UTF-8 mode. */
3406 nigel 77
3407     } /* End of backslash handling */
3408    
3409     /* A single character may be followed by '-' to form a range. However,
3410     Perl does not permit ']' to be the end of the range. A '-' character
3411 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3412     entirely. The code for handling \Q and \E is messy. */
3413 nigel 77
3414 nigel 93 CHECK_RANGE:
3415 ph10 391 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3416 nigel 77 {
3417 nigel 93 inescq = FALSE;
3418     ptr += 2;
3419     }
3420    
3421     oldptr = ptr;
3422 ph10 231
3423 ph10 230 /* Remember \r or \n */
3424 ph10 231
3425 ph10 391 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3426 ph10 231
3427 ph10 230 /* Check for range */
3428 nigel 93
3429 ph10 391 if (!inescq && ptr[1] == CHAR_MINUS)
3430 nigel 93 {
3431 nigel 77 int d;
3432     ptr += 2;
3433 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3434 nigel 77
3435 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3436     mode. */
3437    
3438 ph10 391 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3439 nigel 93 {
3440     ptr += 2;
3441 ph10 392 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3442 ph10 391 { ptr += 2; continue; }
3443 nigel 93 inescq = TRUE;
3444     break;
3445     }
3446    
3447 ph10 391 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3448 nigel 93 {
3449     ptr = oldptr;
3450     goto LONE_SINGLE_CHARACTER;
3451     }
3452    
3453 nigel 77 #ifdef SUPPORT_UTF8
3454     if (utf8)
3455     { /* Braces are required because the */
3456     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3457     }
3458     else
3459     #endif
3460     d = *ptr; /* Not UTF-8 mode */
3461    
3462     /* The second part of a range can be a single-character escape, but
3463     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3464     in such circumstances. */
3465    
3466 ph10 391 if (!inescq && d == CHAR_BACKSLASH)
3467 nigel 77 {
3468 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3469     if (*errorcodeptr != 0) goto FAILED;
3470 nigel 77
3471 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3472 nigel 93 special means the '-' was literal */
3473 nigel 77
3474     if (d < 0)
3475     {
3476 ph10 391 if (d == -ESC_b) d = CHAR_BS;
3477     else if (d == -ESC_X) d = CHAR_X;
3478     else if (d == -ESC_R) d = CHAR_R; else
3479 nigel 77 {
3480 nigel 93 ptr = oldptr;
3481 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3482     }
3483     }
3484     }
3485    
3486 nigel 93 /* Check that the two values are in the correct order. Optimize
3487     one-character ranges */
3488 nigel 77
3489 nigel 93 if (d < c)
3490     {
3491     *errorcodeptr = ERR8;
3492     goto FAILED;
3493     }
3494    
3495 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3496    
3497 ph10 230 /* Remember \r or \n */
3498 ph10 231
3499 ph10 391 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3500 ph10 231
3501 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3502     matching, we have to use an XCLASS with extra data items. Caseless
3503     matching for characters > 127 is available only if UCP support is
3504     available. */
3505    
3506     #ifdef SUPPORT_UTF8
3507     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3508     {
3509     class_utf8 = TRUE;
3510    
3511     /* With UCP support, we can find the other case equivalents of
3512     the relevant characters. There may be several ranges. Optimize how
3513     they fit with the basic range. */
3514    
3515     #ifdef SUPPORT_UCP
3516     if ((options & PCRE_CASELESS) != 0)
3517     {
3518 nigel 93 unsigned int occ, ocd;
3519     unsigned int cc = c;
3520     unsigned int origd = d;
3521 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3522     {
3523 ph10 180 if (occ >= (unsigned int)c &&
3524     ocd <= (unsigned int)d)
3525 ph10 176 continue; /* Skip embedded ranges */
3526 nigel 77
3527 ph10 180 if (occ < (unsigned int)c &&
3528 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3529 nigel 77 { /* if there is overlap, */
3530     c = occ; /* noting that if occ < c */
3531     continue; /* we can't have ocd > d */
3532     } /* because a subrange is */
3533 ph10 180 if (ocd > (unsigned int)d &&
3534 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3535 nigel 77 { /* the basic range. */
3536     d = ocd;
3537     continue;
3538     }
3539    
3540     if (occ == ocd)
3541     {
3542     *class_utf8data++ = XCL_SINGLE;
3543     }
3544     else
3545     {
3546     *class_utf8data++ = XCL_RANGE;
3547     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3548     }
3549     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3550     }
3551     }
3552     #endif /* SUPPORT_UCP */
3553    
3554     /* Now record the original range, possibly modified for UCP caseless
3555     overlapping ranges. */
3556    
3557     *class_utf8data++ = XCL_RANGE;
3558     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3559     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3560    
3561     /* With UCP support, we are done. Without UCP support, there is no
3562     caseless matching for UTF-8 characters > 127; we can use the bit map
3563     for the smaller ones. */
3564    
3565     #ifdef SUPPORT_UCP
3566     continue; /* With next character in the class */
3567     #else
3568     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3569    
3570     /* Adjust upper limit and fall through to set up the map */
3571    
3572     d = 127;
3573    
3574     #endif /* SUPPORT_UCP */
3575     }
3576     #endif /* SUPPORT_UTF8 */
3577    
3578     /* We use the bit map for all cases when not in UTF-8 mode; else
3579     ranges that lie entirely within 0-127 when there is UCP support; else
3580     for partial ranges without UCP support. */
3581    
3582 nigel 93 class_charcount += d - c + 1;
3583     class_lastchar = d;
3584    
3585     /* We can save a bit of time by skipping this in the pre-compile. */
3586    
3587     if (lengthptr == NULL) for (; c <= d; c++)
3588 nigel 77 {
3589     classbits[c/8] |= (1 << (c&7));
3590     if ((options & PCRE_CASELESS) != 0)
3591     {
3592     int uc = cd->fcc[c]; /* flip case */
3593     classbits[uc/8] |= (1 << (uc&7));
3594     }
3595     }
3596    
3597     continue; /* Go get the next char in the class */
3598     }
3599    
3600     /* Handle a lone single character - we can get here for a normal
3601     non-escape char, or after \ that introduces a single character or for an
3602     apparent range that isn't. */
3603    
3604     LONE_SINGLE_CHARACTER:
3605 ph10 231
3606 nigel 77 /* Handle a character that cannot go in the bit map */
3607    
3608     #ifdef SUPPORT_UTF8
3609     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3610     {
3611     class_utf8 = TRUE;
3612     *class_utf8data++ = XCL_SINGLE;
3613     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3614    
3615     #ifdef SUPPORT_UCP
3616     if ((options & PCRE_CASELESS) != 0)
3617     {
3618 nigel 93 unsigned int othercase;
3619 ph10 349 if ((othercase = UCD_OTHERCASE(c)) != c)
3620 nigel 77 {
3621     *class_utf8data++ = XCL_SINGLE;
3622     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3623     }
3624     }
3625     #endif /* SUPPORT_UCP */
3626    
3627     }
3628     else
3629     #endif /* SUPPORT_UTF8 */
3630    
3631     /* Handle a single-byte character */
3632     {
3633     classbits[c/8] |= (1 << (c&7));
3634     if ((options & PCRE_CASELESS) != 0)
3635     {
3636     c = cd->fcc[c]; /* flip case */
3637     classbits[c/8] |= (1 << (c&7));
3638     }
3639     class_charcount++;
3640     class_lastchar = c;
3641     }
3642     }
3643    
3644 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3645 nigel 77
3646 ph10 391 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3647 nigel 77
3648 nigel 93 if (c == 0) /* Missing terminating ']' */
3649     {
3650     *errorcodeptr = ERR6;
3651     goto FAILED;
3652     }
3653 ph10 231
3654    
3655 ph10 230 /* This code has been disabled because it would mean that \s counts as
3656     an explicit \r or \n reference, and that's not really what is wanted. Now
3657     we set the flag only if there is a literal "\r" or "\n" in the class. */
3658 ph10 227
3659 ph10 230 #if 0
3660 ph10 226 /* Remember whether \r or \n are in this class */
3661 ph10 227
3662 ph10 226 if (negate_class)
3663     {
3664 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3665 ph10 226 }
3666     else
3667     {
3668 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3669 ph10 227 }
3670 ph10 230 #endif
3671 ph10 227
3672 ph10 231
3673 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3674 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3675     use of \p or \P, in other words, no use of any XCLASS features, we can
3676     optimize.
3677    
3678 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3679     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3680     operate on single-bytes only. This is an historical hangover. Maybe one day
3681     we can tidy these opcodes to handle multi-byte characters.
3682 nigel 77
3683     The optimization throws away the bit map. We turn the item into a
3684     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3685     that OP_NOT does not support multibyte characters. In the positive case, it
3686     can cause firstbyte to be set. Otherwise, there can be no first char if
3687     this item is first, whatever repeat count may follow. In the case of
3688     reqbyte, save the previous value for reinstating. */
3689    
3690     #ifdef SUPPORT_UTF8
3691 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3692 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3693 nigel 77 #else
3694     if (class_charcount == 1)
3695     #endif
3696     {
3697     zeroreqbyte = reqbyte;
3698    
3699     /* The OP_NOT opcode works on one-byte characters only. */
3700    
3701     if (negate_class)
3702     {
3703     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3704     zerofirstbyte = firstbyte;
3705     *code++ = OP_NOT;
3706     *code++ = class_lastchar;
3707     break;
3708     }
3709    
3710     /* For a single, positive character, get the value into mcbuffer, and
3711     then we can handle this with the normal one-character code. */
3712    
3713     #ifdef SUPPORT_UTF8
3714     if (utf8 && class_lastchar > 127)
3715     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3716     else
3717     #endif
3718     {
3719     mcbuffer[0] = class_lastchar;
3720     mclength = 1;
3721     }
3722     goto ONE_CHAR;
3723     } /* End of 1-char optimization */
3724    
3725     /* The general case - not the one-char optimization. If this is the first
3726     thing in the branch, there can be no first char setting, whatever the
3727     repeat count. Any reqbyte setting must remain unchanged after any kind of
3728     repeat. */
3729    
3730     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3731     zerofirstbyte = firstbyte;
3732     zeroreqbyte = reqbyte;
3733    
3734     /* If there are characters with values > 255, we have to compile an
3735 ph10 286 extended class, with its own opcode, unless there was a negated special
3736     such as \S in the class, because in that case all characters > 255 are in
3737     the class, so any that were explicitly given as well can be ignored. If
3738 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3739     characters < 256, we can omit the bitmap in the actual compiled code. */
3740 nigel 77
3741     #ifdef SUPPORT_UTF8
3742 ph10 264 if (class_utf8 && !should_flip_negation)
3743 nigel 77 {
3744     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3745     *code++ = OP_XCLASS;
3746     code += LINK_SIZE;
3747     *code = negate_class? XCL_NOT : 0;
3748    
3749 nigel 93 /* If the map is required, move up the extra data to make room for it;
3750     otherwise just move the code pointer to the end of the extra data. */
3751 nigel 77
3752     if (class_charcount > 0)
3753     {
3754     *code++ |= XCL_MAP;
3755 nigel 93 memmove(code + 32, code, class_utf8data - code);
3756 nigel 77 memcpy(code, classbits, 32);
3757 nigel 93 code = class_utf8data + 32;
3758 nigel 77 }
3759 nigel 93 else code = class_utf8data;
3760 nigel 77
3761     /* Now fill in the complete length of the item */
3762    
3763     PUT(previous, 1, code - previous);
3764     break; /* End of class handling */
3765     }
3766     #endif
3767    
3768 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3769     OP_NCLASS, depending on whether the whole class was negated and whether
3770     there were negative specials such as \S in the class. Then copy the 32-byte
3771 ph10 264 map into the code vector, negating it if necessary. */
3772 ph10 286
3773 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3774 nigel 77 if (negate_class)
3775     {
3776 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3777     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3778 nigel 77 }
3779     else
3780     {
3781     memcpy(code, classbits, 32);
3782     }
3783     code += 32;
3784     break;
3785    
3786 nigel 93
3787     /* ===================================================================*/
3788 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3789     has been tested above. */
3790    
3791 ph10 391 case CHAR_LEFT_CURLY_BRACKET:
3792 nigel 77 if (!is_quantifier) goto NORMAL_CHAR;
3793     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3794     if (*errorcodeptr != 0) goto FAILED;
3795     goto REPEAT;
3796    
3797 ph10 391 case CHAR_ASTERISK:
3798 nigel 77 repeat_min = 0;
3799     repeat_max = -1;
3800     goto REPEAT;
3801    
3802 ph10 391 case CHAR_PLUS:
3803 nigel 77 repeat_min = 1;
3804     repeat_max = -1;
3805     goto REPEAT;
3806    
3807 ph10 391 case CHAR_QUESTION_MARK:
3808 nigel 77 repeat_min = 0;
3809     repeat_max = 1;
3810    
3811     REPEAT:
3812     if (previous == NULL)
3813     {
3814     *errorcodeptr = ERR9;
3815     goto FAILED;
3816     }
3817    
3818     if (repeat_min == 0)
3819     {
3820     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3821     reqbyte = zeroreqbyte; /* Ditto */
3822     }
3823    
3824     /* Remember whether this is a variable length repeat */
3825    
3826     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3827    
3828     op_type = 0; /* Default single-char op codes */
3829     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3830    
3831     /* Save start of previous item, in case we have to move it up to make space
3832     for an inserted OP_ONCE for the additional '+' extension. */
3833    
3834     tempcode = previous;
3835    
3836     /* If the next character is '+', we have a possessive quantifier. This
3837     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3838     If the next character is '?' this is a minimizing repeat, by default,
3839     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3840     repeat type to the non-default. */
3841    
3842 ph10 391 if (ptr[1] == CHAR_PLUS)
3843 nigel 77 {
3844     repeat_type = 0; /* Force greedy */
3845     possessive_quantifier = TRUE;
3846     ptr++;
3847     }
3848 ph10 391 else if (ptr[1] == CHAR_QUESTION_MARK)
3849 nigel 77 {
3850     repeat_type = greedy_non_default;
3851     ptr++;
3852     }
3853     else repeat_type = greedy_default;
3854    
3855     /* If previous was a character match, abolish the item and generate a
3856     repeat item instead. If a char item has a minimum of more than one, ensure
3857     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3858     the first thing in a branch because the x will have gone into firstbyte
3859     instead. */
3860    
3861     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3862     {
3863     /* Deal with UTF-8 characters that take up more than one byte. It's
3864     easier to write this out separately than try to macrify it. Use c to
3865     hold the length of the character in bytes, plus 0x80 to flag that it's a
3866     length rather than a small character. */
3867    
3868     #ifdef SUPPORT_UTF8
3869     if (utf8 && (code[-1] & 0x80) != 0)
3870     {
3871     uschar *lastchar = code - 1;
3872     while((*lastchar & 0xc0) == 0x80) lastchar--;
3873     c = code - lastchar; /* Length of UTF-8 character */
3874     memcpy(utf8_char, lastchar, c); /* Save the char */
3875     c |= 0x80; /* Flag c as a length */
3876     }
3877     else
3878     #endif
3879    
3880     /* Handle the case of a single byte - either with no UTF8 support, or
3881     with UTF-8 disabled, or for a UTF-8 character < 128. */
3882    
3883     {
3884     c = code[-1];
3885     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3886     }
3887    
3888 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3889     the line is something that cannot possibly match this character. If so,
3890     automatically possessifying this item gains some performance in the case
3891     where the match fails. */
3892    
3893     if (!possessive_quantifier &&
3894     repeat_max < 0 &&
3895     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3896     options, cd))
3897     {
3898     repeat_type = 0; /* Force greedy */
3899     possessive_quantifier = TRUE;
3900     }
3901    
3902 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3903     }
3904    
3905     /* If previous was a single negated character ([^a] or similar), we use
3906     one of the special opcodes, replacing it. The code is shared with single-
3907     character repeats by setting op_type to add a suitable offset into
3908 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3909     currently used only for single-byte chars. */
3910 nigel 77
3911     else if (*previous == OP_NOT)
3912     {
3913     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3914     c = previous[1];
3915 nigel 93 if (!possessive_quantifier &&
3916     repeat_max < 0 &&
3917     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3918     {
3919     repeat_type = 0; /* Force greedy */
3920     possessive_quantifier = TRUE;
3921     }
3922 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3923     }
3924    
3925     /* If previous was a character type match (\d or similar), abolish it and
3926     create a suitable repeat item. The code is shared with single-character
3927     repeats by setting op_type to add a suitable offset into repeat_type. Note
3928     that the Unicode property types will be present only when SUPPORT_UCP is
3929     defined, but we don't wrap the little bits of code here because it just