/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 333 - (hide annotations) (download)
Thu Apr 10 19:55:57 2008 UTC (6 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 200103 byte(s)
Add Oniguruma syntax \g<...> and \g'...' for subroutine calls.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 nigel 85 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57     used by pcretest. DEBUG is not defined when building a production library. */
58    
59     #ifdef DEBUG
60     #include "pcre_printint.src"
61     #endif
62    
63    
/* Macro for setting individual bits in class bitmaps: bit (b%8) of byte (b/8)
of the map a. Both arguments and the whole expansion are parenthesized so that
expression arguments such as SETBIT(map, c + 1) select the intended byte and
bit, and so that the macro can be used safely as a sub-expression. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)
74    
75    
76 nigel 77 /*************************************************
77     * Code parameters and static tables *
78     *************************************************/
79    
80 nigel 93 /* This value specifies the size of stack workspace that is used during the
81     first pre-compile phase that determines how much memory is required. The regex
82     is partly compiled into this space, but the compiled parts are discarded as
83     soon as they can be, so that hopefully there will never be an overrun. The code
84     does, however, check for an overrun. The largest amount I've seen used is 218,
85     so this number is very generous.
86 nigel 77
87 nigel 93 The same workspace is used during the second, actual compile phase for
88     remembering forward references to groups so that they can be filled in at the
89     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90     is 4 there is plenty of room. */
91 nigel 77
92 nigel 93 #define COMPILE_WORK_SIZE (4096)
93 nigel 77
94 nigel 93
95 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
96     are simple data values; negative values are for special things like \d and so
97     on. Zero means further processing is needed (for things like \x), or the escape
98     is invalid. */
99    
/* Indexing, as used by check_escape(): the ASCII table is indexed by c - '0'
and so covers '0' to 'z'; the EBCDIC table is indexed by c - 0x48, and the hex
number in the comment at the start of each EBCDIC row is the character code of
that row's first entry. */
100 ph10 97 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
101 nigel 77 static const short int escapes[] = {
102     0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
103     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
104     '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
105 ph10 178 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
106     -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
107 nigel 77 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
108     '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
109 ph10 178 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
110     -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
111 nigel 77 0, 0, -ESC_z /* x - z */
112     };
113    
114 ph10 97 #else /* This is the "abnormal" table for EBCDIC systems */
115 nigel 77 static const short int escapes[] = {
116     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
117     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
118     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
119     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
120     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
121     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
122     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
123     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
124 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
125 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
126 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
127 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
128 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
129     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
130     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
131     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
132 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
133 ph10 195 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
134 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
135 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
136 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
137     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
138     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
139     };
140     #endif
141    
142    
143 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
144     searched linearly. Put all the names into a single string, in order to reduce
145 ph10 240 the number of relocations when a shared library is dynamically linked. */
146 ph10 210
/* One entry per verb name: the length of the name and the opcode it maps to.
The entries are in the same order as the names in verbnames below. */
147     typedef struct verbitem {
148     int len;
149     int op;
150 ph10 211 } verbitem;
151 ph10 210
/* The verb names, each terminated by a zero byte (the last one by the string
literal's own terminator). */
152 ph10 240 static const char verbnames[] =
153 ph10 243 "ACCEPT\0"
154     "COMMIT\0"
155     "F\0"
156     "FAIL\0"
157     "PRUNE\0"
158     "SKIP\0"
159     "THEN";
160 ph10 240
/* Parallel to verbnames: each len must equal the length of the corresponding
name above. Both F and FAIL map to OP_FAIL. */
161 ph10 327 static const verbitem verbs[] = {
162 ph10 240 { 6, OP_ACCEPT },
163     { 6, OP_COMMIT },
164     { 1, OP_FAIL },
165     { 4, OP_FAIL },
166     { 5, OP_PRUNE },
167     { 4, OP_SKIP },
168     { 4, OP_THEN }
169 ph10 210 };
170    
/* Number of entries in verbs[]. */
171 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
172 ph10 210
173    
174 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
175     now all in a single string, to reduce the number of relocations when a shared
176 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
177     length entry. The first three must be alpha, lower, upper, as this is assumed
178     for handling case independence. */
179 nigel 77
/* Fourteen class names, each terminated by a zero byte (the last one by the
string literal's own terminator). */
180 ph10 240 static const char posix_names[] =
181 ph10 243 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
182     "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
183 ph10 240 "word\0" "xdigit";
184 nigel 77
/* Length of each name above, in the same order, followed by the terminating
zero entry. */
185     static const uschar posix_name_lengths[] = {
186     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
187    
188 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
189     base map, with an optional addition or removal of another map. Then, for some
190     classes, there is some additional tweaking: for [:blank:] the vertical space
191     characters are removed, and for [:alpha:] and [:alnum:] the underscore
192     character is removed. The triples in the table consist of the base map offset,
193     second map offset or -1 if no second map, and a non-negative value for map
194     addition or a negative value for map subtraction (if there are two maps). The
195     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
196     remove vertical space characters, 2 => remove underscore. */
197 nigel 77
/* One triple per POSIX class, in the same order as the names in posix_names:
base map offset, second map offset (or -1), and the add/subtract/tweak code
described in the comment above. */
198     static const int posix_class_maps[] = {
199 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
200     cbit_lower, -1, 0, /* lower */
201     cbit_upper, -1, 0, /* upper */
202     cbit_word, -1, 2, /* alnum - word without underscore */
203     cbit_print, cbit_cntrl, 0, /* ascii */
204     cbit_space, -1, 1, /* blank - a GNU extension */
205     cbit_cntrl, -1, 0, /* cntrl */
206     cbit_digit, -1, 0, /* digit */
207     cbit_graph, -1, 0, /* graph */
208     cbit_print, -1, 0, /* print */
209     cbit_punct, -1, 0, /* punct */
210     cbit_space, -1, 0, /* space */
211     cbit_word, -1, 0, /* word - a Perl extension */
212     cbit_xdigit,-1, 0 /* xdigit */
213 nigel 77 };
214    
215    
/* Two-level stringification: XSTRING(s) expands the macro s first and then
turns the result into a string literal. It is used to insert the values of
MAX_NAME_SIZE and MAX_NAME_COUNT into the error messages below. */
216 nigel 93 #define STRING(a) # a
217     #define XSTRING(s) STRING(s)
218    
219 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
220 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
221     they are documented. Always add a new error instead. Messages marked DEAD below
222 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
223     the number of relocations needed when a shared library is loaded dynamically,
224     it is now one long string. We cannot use a table of offsets, because the
225     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
226     simply count through to the one we want - this isn't a performance issue
227 ph10 240 because these strings are used only when there is a compilation error. */
228 nigel 77
/* Message n is the string that follows n zero bytes; find_error_text() locates
it by counting. Every message except the last ends with an explicit \0, and the
last one is terminated by the string literal's own terminator. The numbered
comments give the index of the message that follows them. */
229 ph10 240 static const char error_texts[] =
230     "no error\0"
231     "\\ at end of pattern\0"
232     "\\c at end of pattern\0"
233     "unrecognized character follows \\\0"
234     "numbers out of order in {} quantifier\0"
235 nigel 77 /* 5 */
236 ph10 240 "number too big in {} quantifier\0"
237     "missing terminating ] for character class\0"
238     "invalid escape sequence in character class\0"
239     "range out of order in character class\0"
240     "nothing to repeat\0"
241 nigel 77 /* 10 */
242 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
243     "internal error: unexpected repeat\0"
244 ph10 269 "unrecognized character after (? or (?-\0"
245 ph10 240 "POSIX named classes are supported only within a class\0"
246     "missing )\0"
247 nigel 77 /* 15 */
248 ph10 240 "reference to non-existent subpattern\0"
249     "erroffset passed as NULL\0"
250     "unknown option bit(s) set\0"
251     "missing ) after comment\0"
252     "parentheses nested too deeply\0" /** DEAD **/
253 nigel 77 /* 20 */
254 ph10 240 "regular expression is too large\0"
255     "failed to get memory\0"
256     "unmatched parentheses\0"
257     "internal error: code overflow\0"
258     "unrecognized character after (?<\0"
259 nigel 77 /* 25 */
260 ph10 240 "lookbehind assertion is not fixed length\0"
261     "malformed number or name after (?(\0"
262     "conditional group contains more than two branches\0"
263     "assertion expected after (?(\0"
264     "(?R or (?[+-]digits must be followed by )\0"
265 nigel 77 /* 30 */
266 ph10 240 "unknown POSIX class name\0"
267     "POSIX collating elements are not supported\0"
268     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
269     "spare error\0" /** DEAD **/
270     "character value in \\x{...} sequence is too large\0"
271 nigel 77 /* 35 */
272 ph10 240 "invalid condition (?(0)\0"
273     "\\C not allowed in lookbehind assertion\0"
274     "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
275     "number after (?C is > 255\0"
276     "closing ) for (?C expected\0"
277 nigel 77 /* 40 */
278 ph10 240 "recursive call could loop indefinitely\0"
279     "unrecognized character after (?P\0"
280     "syntax error in subpattern name (missing terminator)\0"
281     "two named subpatterns have the same name\0"
282     "invalid UTF-8 string\0"
283 nigel 77 /* 45 */
284 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
285     "malformed \\P or \\p sequence\0"
286     "unknown property name after \\P or \\p\0"
287     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
288     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
289 nigel 91 /* 50 */
290 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
291     "octal value is greater than \\377 (not in UTF-8 mode)\0"
292     "internal error: overran compiling workspace\0"
293     "internal error: previously-checked referenced subpattern not found\0"
294     "DEFINE group contains more than one branch\0"
295 nigel 93 /* 55 */
296 ph10 240 "repeating a DEFINE group is not allowed\0"
297     "inconsistent NEWLINE options\0"
298 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
299     "a numbered reference must not be zero\0"
300 ph10 240 "(*VERB) with an argument is not supported\0"
301 ph10 211 /* 60 */
302 ph10 240 "(*VERB) not recognized\0"
303 ph10 268 "number is too big\0"
304 ph10 272 "subpattern name expected\0"
305 ph10 269 "digit expected after (?+";
306 nigel 77
307    
308     /* Table to identify digits and hex digits. This is used when compiling
309     patterns. Note that the tables in chartables are dependent on the locale, and
310     may mark arbitrary characters as digits - but the PCRE compiling code expects
311     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312     a private table here. It costs 256 bytes, but it is a lot faster than doing
313     character value tests (at least in some simple cases I timed), and in some
314     applications one wants PCRE to compile efficiently as well as match
315     efficiently.
316    
317     For convenience, we use the same bit definitions as in chartables:
318    
319     0x04 decimal digit
320     0x08 hexadecimal digit
321    
322     Then we can use ctype_digit and ctype_xdigit in the code. */
323    
/* digitab is indexed by byte value (256 entries). An entry of 0x0c marks a
decimal digit (which is also a hex digit); 0x08 marks the letters a-f and A-F.
In the EBCDIC build a second table, ebcdic_chartab, is also defined; the only
use of it visible here is in check_escape(), which tests its 0x0E bits to
decide whether a character is alphanumeric. */
324 ph10 97 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 nigel 77 static const unsigned char digitab[] =
326     {
327     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
359    
360 ph10 97 #else /* This is the "abnormal" case, for EBCDIC systems */
361 nigel 77 static const unsigned char digitab[] =
362     {
363     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
395    
396     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
429     #endif
430    
431    
432     /* Definition to allow mutual recursion */
433    
/* Forward declaration only; the definition is not in this part of the file
(presumably it follows the functions that need to call it). The parameters are
unnamed here, so see the definition for their meanings. */
434     static BOOL
435 ph10 180 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 ph10 175 int *, int *, branch_chain *, compile_data *, int *);
437 nigel 77
438    
439    
440     /*************************************************
441 ph10 240 * Find an error text *
442     *************************************************/
443    
444 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
445     some of the text is of unknown length, we can't use a table of offsets.
446     Instead, just count through the strings. This is not a performance issue
447 ph10 240 because it happens only when there has been a compilation error.
448    
449     Argument: the error number
450     Returns: pointer to the error string
451     */
452    
453     static const char *
454     find_error_text(int n)
455     {
456     const char *s = error_texts;
457 ph10 243 for (; n > 0; n--) while (*s++ != 0);
458 ph10 240 return s;
459     }
460    
461    
462     /*************************************************
463 nigel 77 * Handle escapes *
464     *************************************************/
465    
466     /* This function is called when a \ has been encountered. It either returns a
467     positive value for a simple escape such as \n, or a negative value which
468 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
469     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471     ptr is pointing at the \. On exit, it is on the final character of the escape
472     sequence.
473 nigel 77
474     Arguments:
475     ptrptr points to the pattern position pointer
476     errorcodeptr points to the errorcode variable
477     bracount number of previous extracting brackets
478     options the options bits
479     isclass TRUE if inside a character class
480    
481     Returns: zero or positive => a data character
482     negative => a special escape sequence
483 ph10 213 on error, errorcodeptr is set
484 nigel 77 */
485    
486     static int
487     check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488     int options, BOOL isclass)
489     {
490 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
491     const uschar *ptr = *ptrptr + 1;
492 nigel 77 int c, i;
493    
494 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495     ptr--; /* Set pointer back to the last byte */
496    
497 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
498    
499     if (c == 0) *errorcodeptr = ERR1;
500    
501 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
502     in a table. A non-zero result is something that can be returned immediately.
503 nigel 77 Otherwise further processing may be required. */
504    
505 ph10 97 #ifndef EBCDIC /* ASCII coding */
506 ph10 274 else if (c < '0' || c > 'z') {} /* Not alphanumeric */
507 nigel 77 else if ((i = escapes[c - '0']) != 0) c = i;
508    
509 ph10 97 #else /* EBCDIC coding */
510 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
511 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
512     #endif
513    
514     /* Escapes that need further processing, or are illegal. */
515    
516     else
517     {
518     const uschar *oldptr;
519 nigel 93 BOOL braced, negated;
520    
521 nigel 77 switch (c)
522     {
523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
524     error. */
525    
526     case 'l':
527     case 'L':
528     case 'N':
529     case 'u':
530     case 'U':
531     *errorcodeptr = ERR37;
532     break;
533    
534 ph10 333 /* \g must be followed by one of a number of specific things:
535    
536     (1) A number, either plain or braced. If positive, it is an absolute
537     backreference. If negative, it is a relative backreference. This is a Perl
538     5.10 feature.
539    
540     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
541     is part of Perl's movement towards a unified syntax for back references. As
542     this is synonymous with \k{name}, we fudge it up by pretending it really
543     was \k.
544    
545     (3) For Oniguruma compatibility we also support \g followed by a name or a
546     number either in angle brackets or in single quotes. However, these are
547     (possibly recursive) subroutine calls, _not_ backreferences. Just return
548     the -ESC_g code (cf \k). */
549 nigel 93
550     case 'g':
551 ph10 333 if (ptr[1] == '<' || ptr[1] == '\'')
552     {
553     c = -ESC_g;
554     break;
555     }
556    
557     /* Handle the Perl-compatible cases */
558    
559 nigel 93 if (ptr[1] == '{')
560     {
561 ph10 171 const uschar *p;
562     for (p = ptr+2; *p != 0 && *p != '}'; p++)
563     if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
564 ph10 172 if (*p != 0 && *p != '}')
565 ph10 171 {
566     c = -ESC_k;
567     break;
568 ph10 172 }
569 nigel 93 braced = TRUE;
570     ptr++;
571     }
572     else braced = FALSE;
573    
574     if (ptr[1] == '-')
575     {
576     negated = TRUE;
577     ptr++;
578     }
579     else negated = FALSE;
580    
581     c = 0;
582     while ((digitab[ptr[1]] & ctype_digit) != 0)
583     c = c * 10 + *(++ptr) - '0';
584 ph10 220
585 ph10 333 if (c < 0) /* Integer overflow */
586 ph10 213 {
587     *errorcodeptr = ERR61;
588     break;
589 ph10 220 }
590 ph10 333
591     if (braced && *(++ptr) != '}')
592 nigel 93 {
593     *errorcodeptr = ERR57;
594 ph10 213 break;
595 nigel 93 }
596 ph10 333
597     if (c == 0)
598     {
599     *errorcodeptr = ERR58;
600     break;
601     }
602 nigel 93
603     if (negated)
604     {
605     if (c > bracount)
606     {
607     *errorcodeptr = ERR15;
608 ph10 213 break;
609 nigel 93 }
610     c = bracount - (c - 1);
611     }
612    
613     c = -(ESC_REF + c);
614     break;
615    
616 nigel 77 /* The handling of escape sequences consisting of a string of digits
617     starting with one that is not zero is not straightforward. By experiment,
618     the way Perl works seems to be as follows:
619    
620     Outside a character class, the digits are read as a decimal number. If the
621     number is less than 10, or if there are that many previous extracting
622     left brackets, then it is a back reference. Otherwise, up to three octal
623     digits are read to form an escaped byte. Thus \123 is likely to be octal
624     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
625     value is greater than 377, the least significant 8 bits are taken. Inside a
626     character class, \ followed by a digit is always an octal number. */
627    
628     case '1': case '2': case '3': case '4': case '5':
629     case '6': case '7': case '8': case '9':
630    
631     if (!isclass)
632     {
633     oldptr = ptr;
634     c -= '0';
635     while ((digitab[ptr[1]] & ctype_digit) != 0)
636     c = c * 10 + *(++ptr) - '0';
637 ph10 333 if (c < 0) /* Integer overflow */
638 ph10 213 {
639     *errorcodeptr = ERR61;
640 ph10 220 break;
641     }
642 nigel 77 if (c < 10 || c <= bracount)
643     {
644     c = -(ESC_REF + c);
645     break;
646     }
647     ptr = oldptr; /* Put the pointer back and fall through */
648     }
649    
650     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
651     generates a binary zero byte and treats the digit as a following literal.
652     Thus we have to pull back the pointer by one. */
653    
654     if ((c = *ptr) >= '8')
655     {
656     ptr--;
657     c = 0;
658     break;
659     }
660    
661     /* \0 always starts an octal number, but we may drop through to here with a
662 nigel 91 larger first octal digit. The original code used just to take the least
663     significant 8 bits of octal numbers (I think this is what early Perls used
664     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
665     than 3 octal digits. */
666 nigel 77
667     case '0':
668     c -= '0';
669     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
670     c = c * 8 + *(++ptr) - '0';
671 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
672 nigel 77 break;
673    
674 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
675     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
676     treated as a data character. */
677 nigel 77
678     case 'x':
679 nigel 87 if (ptr[1] == '{')
680 nigel 77 {
681     const uschar *pt = ptr + 2;
682 nigel 87 int count = 0;
683    
684 nigel 77 c = 0;
685     while ((digitab[*pt] & ctype_xdigit) != 0)
686     {
687 nigel 87 register int cc = *pt++;
688     if (c == 0 && cc == '0') continue; /* Leading zeroes */
689 nigel 77 count++;
690 nigel 87
691 ph10 97 #ifndef EBCDIC /* ASCII coding */
692 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
693 nigel 87 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
694 ph10 97 #else /* EBCDIC coding */
695 nigel 77 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
696 nigel 87 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
697 nigel 77 #endif
698     }
699 nigel 87
700 nigel 77 if (*pt == '}')
701     {
702 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
703 nigel 77 ptr = pt;
704     break;
705     }
706 nigel 87
707 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
708     recognize this construct; fall through to the normal \x handling. */
709     }
710    
711 nigel 87 /* Read just a single-byte hex-defined char */
712 nigel 77
713     c = 0;
714     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
715     {
716     int cc; /* Some compilers don't like ++ */
717     cc = *(++ptr); /* in initializers */
718 ph10 97 #ifndef EBCDIC /* ASCII coding */
719 nigel 77 if (cc >= 'a') cc -= 32; /* Convert to upper case */
720     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
721 ph10 97 #else /* EBCDIC coding */
722 nigel 77 if (cc <= 'z') cc += 64; /* Convert to upper case */
723     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
724     #endif
725     }
726     break;
727    
728 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
729     This coding is ASCII-specific, but then the whole concept of \cx is
730     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
731 nigel 77
732     case 'c':
733     c = *(++ptr);
734     if (c == 0)
735     {
736     *errorcodeptr = ERR2;
737 ph10 213 break;
738 nigel 77 }
739    
740 ph10 97 #ifndef EBCDIC /* ASCII coding */
741 nigel 77 if (c >= 'a' && c <= 'z') c -= 32;
742     c ^= 0x40;
743 ph10 97 #else /* EBCDIC coding */
744 nigel 77 if (c >= 'a' && c <= 'z') c += 64;
745     c ^= 0xC0;
746     #endif
747     break;
748    
749     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
750 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
751     otherwise, for Perl compatibility, it is a literal. This code looks a bit
752     odd, but there used to be some cases other than the default, and there may
753     be again in future, so I haven't "optimized" it. */
754 nigel 77
755     default:
756     if ((options & PCRE_EXTRA) != 0) switch(c)
757     {
758     default:
759     *errorcodeptr = ERR3;
760     break;
761     }
762     break;
763     }
764     }
765    
766     *ptrptr = ptr;
767     return c;
768     }
769    
770    
771    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known property names. On entry *ptrptr points at the P or p; on every exit
path *ptrptr has been updated to the last character that was examined.

The name is either a single character, or a sequence enclosed in {} which may
start with ^ to request negation. At most sizeof(propname)-1 name characters
are accepted inside the braces.

Arguments:
  ptrptr        points to the pattern position pointer
  negptr        points to a boolean that is set TRUE for negation else FALSE
                  (not written if the pattern ends right after the P or p)
  dptr          points to an int that receives the "value" field of the
                  matching table entry
  errorcodeptr  points to the error code variable; set to ERR46 when the
                  sequence is malformed (end of pattern, missing '}', name
                  too long) and to ERR47 when the name is not in the table

Returns:        the "type" field of the matching table entry, or -1 on error
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *p = *ptrptr;
char propname[32];
int ch, len, lo, hi;

ch = *(++p);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The short form: exactly one character names the property. */

if (ch != '{')
  {
  propname[0] = ch;
  propname[1] = 0;
  }

/* The braced form: optional ^, then the name, then '}'. The loop stops on the
closing brace, on the end of the pattern, or when the buffer is full; only the
first of these is a success. */

else
  {
  if (p[1] == '^')
    {
    *negptr = TRUE;
    p++;
    }

  len = 0;
  while (len < (int)sizeof(propname) - 1)
    {
    ch = *(++p);
    if (ch == 0 || ch == '}') break;
    propname[len++] = ch;
    }

  if (ch != '}') goto ERROR_RETURN;
  propname[len] = 0;
  }

*ptrptr = p;

/* Binary chop over the name table; the half-open interval [lo, hi) always
holds the candidates that remain. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(propname, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The syntax was fine but the name is unknown */

*errorcodeptr = ERR47;
*ptrptr = p;
return -1;

/* The sequence itself was malformed */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = p;
return -1;
}
#endif
861    
862    
863    
864    
865     /*************************************************
866     * Check for counted repeat *
867     *************************************************/
868    
869     /* This function is called when a '{' is encountered in a place where it might
870     start a quantifier. It looks ahead to see if it really is a quantifier or not.
871     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
872     where the ddds are digits.
873    
874     Arguments:
875     p pointer to the first char after '{'
876    
877     Returns: TRUE or FALSE
878     */
879    
880     static BOOL
881     is_counted_repeat(const uschar *p)
882     {
883     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
884     while ((digitab[*p] & ctype_digit) != 0) p++;
885     if (*p == '}') return TRUE;
886    
887     if (*p++ != ',') return FALSE;
888     if (*p == '}') return TRUE;
889    
890     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
891     while ((digitab[*p] & ctype_digit) != 0) p++;
892    
893     return (*p == '}');
894     }
895    
896    
897    
898     /*************************************************
899     * Read repeat counts *
900     *************************************************/
901    
902     /* Read an item of the form {n,m} and return the values. This is called only
903     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
904     so the syntax is guaranteed to be correct, but we need to check the values.
905    
906     Arguments:
907     p pointer to first char after '{'
908     minp pointer to int for min
909     maxp pointer to int for max
910     returned as -1 if no max
911     errorcodeptr points to error code variable
912    
913     Returns: pointer to '}' on success;
914     current ptr on error, with errorcodeptr set non-zero
915     */
916    
917     static const uschar *
918     read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
919     {
920     int min = 0;
921     int max = -1;
922    
923 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
924     an integer overflow. */
925    
926 nigel 77 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
927 nigel 81 if (min < 0 || min > 65535)
928     {
929     *errorcodeptr = ERR5;
930     return p;
931     }
932 nigel 77
933 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
934     Also, max must not be less than min. */
935    
936 nigel 77 if (*p == '}') max = min; else
937     {
938     if (*(++p) != '}')
939     {
940     max = 0;
941     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
942 nigel 81 if (max < 0 || max > 65535)
943     {
944     *errorcodeptr = ERR5;
945     return p;
946     }
947 nigel 77 if (max < min)
948     {
949     *errorcodeptr = ERR4;
950     return p;
951     }
952     }
953     }
954    
955 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
956     '}'. */
957 nigel 77
958 nigel 81 *minp = min;
959     *maxp = max;
960 nigel 77 return p;
961     }
962    
963    
964    
965     /*************************************************
966 nigel 93 * Find forward referenced subpattern *
967 nigel 91 *************************************************/
968    
969 nigel 93 /* This function scans along a pattern's text looking for capturing
970     subpatterns, and counting them. If it finds a named pattern that matches the
971     name it is given, it returns its number. Alternatively, if the name is NULL, it
972     returns when it reaches a given numbered subpattern. This is used for forward
973     references to subpatterns. We know that if (?P< is encountered, the name will
974     be terminated by '>' because that is checked in the first pass.
975 nigel 91
976     Arguments:
977 nigel 93 ptr current position in the pattern
978     count current count of capturing parens so far encountered
979     name name to seek, or NULL if seeking a numbered subpattern
980     lorn name length, or subpattern number if name is NULL
981     xmode TRUE if we are in /x mode
982 nigel 91
983     Returns: the number of the named subpattern, or -1 if not found
984     */
985    
986     static int
987 nigel 93 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
988     BOOL xmode)
989 nigel 91 {
990     const uschar *thisname;
991 nigel 93
992 nigel 91 for (; *ptr != 0; ptr++)
993     {
994 nigel 93 int term;
995    
996     /* Skip over backslashed characters and also entire \Q...\E */
997    
998     if (*ptr == '\\')
999     {
1000     if (*(++ptr) == 0) return -1;
1001     if (*ptr == 'Q') for (;;)
1002     {
1003     while (*(++ptr) != 0 && *ptr != '\\');
1004     if (*ptr == 0) return -1;
1005     if (*(++ptr) == 'E') break;
1006     }
1007     continue;
1008     }
1009    
1010     /* Skip over character classes */
1011    
1012     if (*ptr == '[')
1013     {
1014     while (*(++ptr) != ']')
1015     {
1016 ph10 220 if (*ptr == 0) return -1;
1017 nigel 93 if (*ptr == '\\')
1018     {
1019     if (*(++ptr) == 0) return -1;
1020     if (*ptr == 'Q') for (;;)
1021     {
1022     while (*(++ptr) != 0 && *ptr != '\\');
1023     if (*ptr == 0) return -1;
1024     if (*(++ptr) == 'E') break;
1025     }
1026     continue;
1027     }
1028     }
1029     continue;
1030     }
1031    
1032     /* Skip comments in /x mode */
1033    
1034     if (xmode && *ptr == '#')
1035     {
1036     while (*(++ptr) != 0 && *ptr != '\n');
1037     if (*ptr == 0) return -1;
1038     continue;
1039     }
1040    
1041     /* An opening parens must now be a real metacharacter */
1042    
1043 nigel 91 if (*ptr != '(') continue;
1044 ph10 210 if (ptr[1] != '?' && ptr[1] != '*')
1045 nigel 93 {
1046     count++;
1047     if (name == NULL && count == lorn) return count;
1048     continue;
1049     }
1050    
1051     ptr += 2;
1052     if (*ptr == 'P') ptr++; /* Allow optional P */
1053    
1054     /* We have to disambiguate (?<! and (?<= from (?<name> */
1055    
1056     if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1057     *ptr != '\'')
1058     continue;
1059    
1060 nigel 91 count++;
1061 nigel 93
1062     if (name == NULL && count == lorn) return count;
1063     term = *ptr++;
1064     if (term == '<') term = '>';
1065 nigel 91 thisname = ptr;
1066 nigel 93 while (*ptr != term) ptr++;
1067     if (name != NULL && lorn == ptr - thisname &&
1068     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1069 nigel 91 return count;
1070     }
1071 nigel 93
1072 nigel 91 return -1;
1073     }
1074    
1075    
1076    
1077     /*************************************************
1078 nigel 77 * Find first significant op code *
1079     *************************************************/
1080    
1081     /* This is called by several functions that scan a compiled expression looking
1082     for a fixed first character, or an anchoring op code etc. It skips over things
1083     that do not influence this. For some calls, a change of option is important.
1084     For some calls, it makes sense to skip negative forward and all backward
1085     assertions, and also the \b assertion; for others it does not.
1086    
1087     Arguments:
1088     code pointer to the start of the group
1089     options pointer to external options
1090     optbit the option bit whose changing is significant, or
1091     zero if none are
1092     skipassert TRUE if certain assertions are to be skipped
1093    
1094     Returns: pointer to the first significant opcode
1095     */
1096    
1097     static const uschar*
1098     first_significant_code(const uschar *code, int *options, int optbit,
1099     BOOL skipassert)
1100     {
1101     for (;;)
1102     {
1103     switch ((int)*code)
1104     {
1105     case OP_OPT:
1106     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1107     *options = (int)code[1];
1108     code += 2;
1109     break;
1110    
1111     case OP_ASSERT_NOT:
1112     case OP_ASSERTBACK:
1113     case OP_ASSERTBACK_NOT:
1114     if (!skipassert) return code;
1115     do code += GET(code, 1); while (*code == OP_ALT);
1116     code += _pcre_OP_lengths[*code];
1117     break;
1118    
1119     case OP_WORD_BOUNDARY:
1120     case OP_NOT_WORD_BOUNDARY:
1121     if (!skipassert) return code;
1122     /* Fall through */
1123    
1124     case OP_CALLOUT:
1125     case OP_CREF:
1126 nigel 93 case OP_RREF:
1127     case OP_DEF:
1128 nigel 77 code += _pcre_OP_lengths[*code];
1129     break;
1130    
1131     default:
1132     return code;
1133     }
1134     }
1135     /* Control never reaches here */
1136     }
1137    
1138    
1139    
1140    
1141     /*************************************************
1142     * Find the fixed length of a pattern *
1143     *************************************************/
1144    
1145     /* Scan a pattern and compute the fixed length of subject that will match it,
1146     if the length is fixed. This is needed for dealing with backward assertions.
1147     In UTF8 mode, the result is in characters rather than bytes.
1148    
1149     Arguments:
1150     code points to the start of the pattern (the bracket)
1151     options the compiling options
1152    
1153     Returns: the fixed length, or -1 if there is no fixed length,
1154     or -2 if \C was encountered
1155     */
1156    
1157     static int
1158     find_fixedlength(uschar *code, int options)
1159     {
1160     int length = -1;
1161    
1162     register int branchlength = 0;
1163     register uschar *cc = code + 1 + LINK_SIZE;
1164    
1165     /* Scan along the opcodes for this branch. If we get to the end of the
1166     branch, check the length against that of the other branches. */
1167    
1168     for (;;)
1169     {
1170     int d;
1171     register int op = *cc;
1172     switch (op)
1173     {
1174 nigel 93 case OP_CBRA:
1175 nigel 77 case OP_BRA:
1176     case OP_ONCE:
1177     case OP_COND:
1178 nigel 93 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1179 nigel 77 if (d < 0) return d;
1180     branchlength += d;
1181     do cc += GET(cc, 1); while (*cc == OP_ALT);
1182     cc += 1 + LINK_SIZE;
1183     break;
1184    
1185     /* Reached end of a branch; if it's a ket it is the end of a nested
1186     call. If it's ALT it is an alternation in a nested call. If it is
1187     END it's the end of the outer call. All can be handled by the same code. */
1188    
1189     case OP_ALT:
1190     case OP_KET:
1191     case OP_KETRMAX:
1192     case OP_KETRMIN:
1193     case OP_END:
1194     if (length < 0) length = branchlength;
1195     else if (length != branchlength) return -1;
1196     if (*cc != OP_ALT) return length;
1197     cc += 1 + LINK_SIZE;
1198     branchlength = 0;
1199     break;
1200    
1201     /* Skip over assertive subpatterns */
1202    
1203     case OP_ASSERT:
1204     case OP_ASSERT_NOT:
1205     case OP_ASSERTBACK:
1206     case OP_ASSERTBACK_NOT:
1207     do cc += GET(cc, 1); while (*cc == OP_ALT);
1208     /* Fall through */
1209    
1210     /* Skip over things that don't match chars */
1211    
1212     case OP_REVERSE:
1213     case OP_CREF:
1214 nigel 93 case OP_RREF:
1215     case OP_DEF:
1216 nigel 77 case OP_OPT:
1217     case OP_CALLOUT:
1218     case OP_SOD:
1219     case OP_SOM:
1220     case OP_EOD:
1221     case OP_EODN:
1222     case OP_CIRC:
1223     case OP_DOLL:
1224     case OP_NOT_WORD_BOUNDARY:
1225     case OP_WORD_BOUNDARY:
1226     cc += _pcre_OP_lengths[*cc];
1227     break;
1228    
1229     /* Handle literal characters */
1230    
1231     case OP_CHAR:
1232     case OP_CHARNC:
1233 nigel 91 case OP_NOT:
1234 nigel 77 branchlength++;
1235     cc += 2;
1236     #ifdef SUPPORT_UTF8
1237     if ((options & PCRE_UTF8) != 0)
1238     {
1239     while ((*cc & 0xc0) == 0x80) cc++;
1240     }
1241     #endif
1242     break;
1243    
1244     /* Handle exact repetitions. The count is already in characters, but we
1245     need to skip over a multibyte character in UTF8 mode. */
1246    
1247     case OP_EXACT:
1248     branchlength += GET2(cc,1);
1249     cc += 4;
1250     #ifdef SUPPORT_UTF8
1251     if ((options & PCRE_UTF8) != 0)
1252     {
1253     while((*cc & 0x80) == 0x80) cc++;
1254     }
1255     #endif
1256     break;
1257    
1258     case OP_TYPEEXACT:
1259     branchlength += GET2(cc,1);
1260 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1261 nigel 77 cc += 4;
1262     break;
1263    
1264     /* Handle single-char matchers */
1265    
1266     case OP_PROP:
1267     case OP_NOTPROP:
1268 nigel 87 cc += 2;
1269 nigel 77 /* Fall through */
1270    
1271     case OP_NOT_DIGIT:
1272     case OP_DIGIT:
1273     case OP_NOT_WHITESPACE:
1274     case OP_WHITESPACE:
1275     case OP_NOT_WORDCHAR:
1276     case OP_WORDCHAR:
1277     case OP_ANY:
1278     branchlength++;
1279     cc++;
1280     break;
1281    
1282     /* The single-byte matcher isn't allowed */
1283    
1284     case OP_ANYBYTE:
1285     return -2;
1286    
1287     /* Check a class for variable quantification */
1288    
1289     #ifdef SUPPORT_UTF8
1290     case OP_XCLASS:
1291     cc += GET(cc, 1) - 33;
1292     /* Fall through */
1293     #endif
1294    
1295     case OP_CLASS:
1296     case OP_NCLASS:
1297     cc += 33;
1298    
1299     switch (*cc)
1300     {
1301     case OP_CRSTAR:
1302     case OP_CRMINSTAR:
1303     case OP_CRQUERY:
1304     case OP_CRMINQUERY:
1305     return -1;
1306    
1307     case OP_CRRANGE:
1308     case OP_CRMINRANGE:
1309     if (GET2(cc,1) != GET2(cc,3)) return -1;
1310     branchlength += GET2(cc,1);
1311     cc += 5;
1312     break;
1313    
1314     default:
1315     branchlength++;
1316     }
1317     break;
1318    
1319     /* Anything else is variable length */
1320    
1321     default:
1322     return -1;
1323     }
1324     }
1325     /* Control never gets here */
1326     }
1327    
1328    
1329    
1330    
1331     /*************************************************
1332     * Scan compiled regex for numbered bracket *
1333     *************************************************/
1334    
1335     /* This little function scans through a compiled pattern until it finds a
1336     capturing bracket with the given number.
1337    
1338     Arguments:
1339     code points to start of expression
1340     utf8 TRUE in UTF-8 mode
1341     number the required bracket number
1342    
1343     Returns: pointer to the opcode for the bracket, or NULL if not found
1344     */
1345    
1346     static const uschar *
1347     find_bracket(const uschar *code, BOOL utf8, int number)
1348     {
1349     for (;;)
1350     {
1351     register int c = *code;
1352     if (c == OP_END) return NULL;
1353 nigel 91
1354     /* XCLASS is used for classes that cannot be represented just by a bit
1355     map. This includes negated single high-valued characters. The length in
1356     the table is zero; the actual length is stored in the compiled code. */
1357    
1358     if (c == OP_XCLASS) code += GET(code, 1);
1359    
1360 nigel 93 /* Handle capturing bracket */
1361 nigel 91
1362 nigel 93 else if (c == OP_CBRA)
1363 nigel 77 {
1364 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1365 nigel 77 if (n == number) return (uschar *)code;
1366 nigel 93 code += _pcre_OP_lengths[c];
1367 nigel 77 }
1368 nigel 91
1369 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1370     repeated character types, we have to test for \p and \P, which have an extra
1371 ph10 218 two bytes of parameters. */
1372 nigel 91
1373 nigel 77 else
1374     {
1375 ph10 218 switch(c)
1376     {
1377     case OP_TYPESTAR:
1378     case OP_TYPEMINSTAR:
1379     case OP_TYPEPLUS:
1380     case OP_TYPEMINPLUS:
1381     case OP_TYPEQUERY:
1382     case OP_TYPEMINQUERY:
1383     case OP_TYPEPOSSTAR:
1384     case OP_TYPEPOSPLUS:
1385     case OP_TYPEPOSQUERY:
1386     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1387 ph10 220 break;
1388 ph10 221
1389     case OP_TYPEUPTO:
1390     case OP_TYPEMINUPTO:
1391     case OP_TYPEEXACT:
1392     case OP_TYPEPOSUPTO:
1393     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1394     break;
1395 ph10 220 }
1396    
1397 ph10 218 /* Add in the fixed length from the table */
1398 ph10 220
1399 nigel 77 code += _pcre_OP_lengths[c];
1400 ph10 220
1401 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1402     a multi-byte character. The length in the table is a minimum, so we have to
1403     arrange to skip the extra bytes. */
1404 ph10 220
1405 ph10 107 #ifdef SUPPORT_UTF8
1406 nigel 77 if (utf8) switch(c)
1407     {
1408     case OP_CHAR:
1409     case OP_CHARNC:
1410     case OP_EXACT:
1411     case OP_UPTO:
1412     case OP_MINUPTO:
1413 nigel 93 case OP_POSUPTO:
1414 nigel 77 case OP_STAR:
1415     case OP_MINSTAR:
1416 nigel 93 case OP_POSSTAR:
1417 nigel 77 case OP_PLUS:
1418     case OP_MINPLUS:
1419 nigel 93 case OP_POSPLUS:
1420 nigel 77 case OP_QUERY:
1421     case OP_MINQUERY:
1422 nigel 93 case OP_POSQUERY:
1423     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1424 nigel 77 break;
1425     }
1426 ph10 111 #endif
1427 nigel 77 }
1428     }
1429     }
1430    
1431    
1432    
1433     /*************************************************
1434     * Scan compiled regex for recursion reference *
1435     *************************************************/
1436    
1437     /* This little function scans through a compiled pattern until it finds an
1438     instance of OP_RECURSE.
1439    
1440     Arguments:
1441     code points to start of expression
1442     utf8 TRUE in UTF-8 mode
1443    
1444     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1445     */
1446    
1447     static const uschar *
1448     find_recurse(const uschar *code, BOOL utf8)
1449     {
1450     for (;;)
1451     {
1452     register int c = *code;
1453     if (c == OP_END) return NULL;
1454 nigel 91 if (c == OP_RECURSE) return code;
1455 ph10 220
1456 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
1457     map. This includes negated single high-valued characters. The length in
1458     the table is zero; the actual length is stored in the compiled code. */
1459    
1460     if (c == OP_XCLASS) code += GET(code, 1);
1461    
1462 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1463     repeated character types, we have to test for \p and \P, which have an extra
1464 ph10 218 two bytes of parameters. */
1465 nigel 91
1466 nigel 77 else
1467     {
1468 ph10 218 switch(c)
1469     {
1470     case OP_TYPESTAR:
1471     case OP_TYPEMINSTAR:
1472     case OP_TYPEPLUS:
1473     case OP_TYPEMINPLUS:
1474     case OP_TYPEQUERY:
1475     case OP_TYPEMINQUERY:
1476     case OP_TYPEPOSSTAR:
1477     case OP_TYPEPOSPLUS:
1478     case OP_TYPEPOSQUERY:
1479     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1480 ph10 220 break;
1481 ph10 221
1482     case OP_TYPEPOSUPTO:
1483     case OP_TYPEUPTO:
1484     case OP_TYPEMINUPTO:
1485     case OP_TYPEEXACT:
1486     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1487     break;
1488 ph10 220 }
1489    
1490 ph10 218 /* Add in the fixed length from the table */
1491    
1492 nigel 77 code += _pcre_OP_lengths[c];
1493 ph10 220
1494 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1495     by a multi-byte character. The length in the table is a minimum, so we have
1496     to arrange to skip the extra bytes. */
1497 ph10 220
1498 ph10 107 #ifdef SUPPORT_UTF8
1499 nigel 77 if (utf8) switch(c)
1500     {
1501     case OP_CHAR:
1502     case OP_CHARNC:
1503     case OP_EXACT:
1504     case OP_UPTO:
1505     case OP_MINUPTO:
1506 nigel 93 case OP_POSUPTO:
1507 nigel 77 case OP_STAR:
1508     case OP_MINSTAR:
1509 nigel 93 case OP_POSSTAR:
1510 nigel 77 case OP_PLUS:
1511     case OP_MINPLUS:
1512 nigel 93 case OP_POSPLUS:
1513 nigel 77 case OP_QUERY:
1514     case OP_MINQUERY:
1515 nigel 93 case OP_POSQUERY:
1516     if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1517 nigel 77 break;
1518     }
1519 ph10 111 #endif
1520 nigel 77 }
1521     }
1522     }
1523    
1524    
1525    
1526     /*************************************************
1527     * Scan compiled branch for non-emptiness *
1528     *************************************************/
1529    
1530     /* This function scans through a branch of a compiled pattern to see whether it
1531 nigel 93 can match the empty string or not. It is called from could_be_empty()
1532     below and from compile_branch() when checking for an unlimited repeat of a
1533     group that can match nothing. Note that first_significant_code() skips over
1534 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
1535     hit an unclosed bracket, we return "empty" - this means we've struck an inner
1536     bracket whose current branch will already have been scanned.
1537 nigel 77
1538     Arguments:
1539     code points to start of search
1540     endcode points to where to stop
1541     utf8 TRUE if in UTF8 mode
1542    
1543     Returns: TRUE if what is matched could be empty
1544     */
1545    
1546     static BOOL
1547     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1548     {
1549     register int c;
1550 nigel 93 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1551 nigel 77 code < endcode;
1552     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1553     {
1554     const uschar *ccode;
1555    
1556     c = *code;
1557 ph10 286
1558     /* Skip over forward assertions; the other assertions are skipped by
1559 ph10 282 first_significant_code() with a TRUE final argument. */
1560 ph10 286
1561 ph10 282 if (c == OP_ASSERT)
1562 ph10 286 {
1563 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
1564     c = *code;
1565     continue;
1566 ph10 286 }
1567 ph10 172
1568 ph10 170 /* Groups with zero repeats can of course be empty; skip them. */
1569 nigel 77
1570 ph10 170 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1571     {
1572 ph10 172 code += _pcre_OP_lengths[c];
1573 ph10 170 do code += GET(code, 1); while (*code == OP_ALT);
1574     c = *code;
1575     continue;
1576     }
1577    
1578     /* For other groups, scan the branches. */
1579 ph10 172
1580 ph10 206 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1581 nigel 77 {
1582     BOOL empty_branch;
1583     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1584    
1585     /* Scan a closed bracket */
1586    
1587     empty_branch = FALSE;
1588     do
1589     {
1590     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1591     empty_branch = TRUE;
1592     code += GET(code, 1);
1593     }
1594     while (*code == OP_ALT);
1595     if (!empty_branch) return FALSE; /* All branches are non-empty */
1596 ph10 172 c = *code;
1597 nigel 93 continue;
1598 nigel 77 }
1599    
1600 nigel 93 /* Handle the other opcodes */
1601    
1602     switch (c)
1603 nigel 77 {
1604 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
1605     cannot be represented just by a bit map. This includes negated single
1606     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1607 ph10 220 actual length is stored in the compiled code, so we must update "code"
1608 ph10 216 here. */
1609 nigel 77
1610     #ifdef SUPPORT_UTF8
1611     case OP_XCLASS:
1612 ph10 216 ccode = code += GET(code, 1);
1613 nigel 77 goto CHECK_CLASS_REPEAT;
1614     #endif
1615    
1616     case OP_CLASS:
1617     case OP_NCLASS:
1618     ccode = code + 33;
1619    
1620     #ifdef SUPPORT_UTF8
1621     CHECK_CLASS_REPEAT:
1622     #endif
1623    
1624     switch (*ccode)
1625     {
1626     case OP_CRSTAR: /* These could be empty; continue */
1627     case OP_CRMINSTAR:
1628     case OP_CRQUERY:
1629     case OP_CRMINQUERY:
1630     break;
1631    
1632     default: /* Non-repeat => class must match */
1633     case OP_CRPLUS: /* These repeats aren't empty */
1634     case OP_CRMINPLUS:
1635     return FALSE;
1636    
1637     case OP_CRRANGE:
1638     case OP_CRMINRANGE:
1639     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1640     break;
1641     }
1642     break;
1643    
1644     /* Opcodes that must match a character */
1645    
1646     case OP_PROP:
1647     case OP_NOTPROP:
1648     case OP_EXTUNI:
1649     case OP_NOT_DIGIT:
1650     case OP_DIGIT:
1651     case OP_NOT_WHITESPACE:
1652     case OP_WHITESPACE:
1653     case OP_NOT_WORDCHAR:
1654     case OP_WORDCHAR:
1655     case OP_ANY:
1656     case OP_ANYBYTE:
1657     case OP_CHAR:
1658     case OP_CHARNC:
1659     case OP_NOT:
1660     case OP_PLUS:
1661     case OP_MINPLUS:
1662 nigel 93 case OP_POSPLUS:
1663 nigel 77 case OP_EXACT:
1664     case OP_NOTPLUS:
1665     case OP_NOTMINPLUS:
1666 nigel 93 case OP_NOTPOSPLUS:
1667 nigel 77 case OP_NOTEXACT:
1668     case OP_TYPEPLUS:
1669     case OP_TYPEMINPLUS:
1670 nigel 93 case OP_TYPEPOSPLUS:
1671 nigel 77 case OP_TYPEEXACT:
1672     return FALSE;
1673 ph10 227
1674     /* These are going to continue, as they may be empty, but we have to
1675     fudge the length for the \p and \P cases. */
1676    
1677 ph10 224 case OP_TYPESTAR:
1678     case OP_TYPEMINSTAR:
1679     case OP_TYPEPOSSTAR:
1680     case OP_TYPEQUERY:
1681     case OP_TYPEMINQUERY:
1682     case OP_TYPEPOSQUERY:
1683     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1684 ph10 227 break;
1685    
1686 ph10 224 /* Same for these */
1687 ph10 227
1688 ph10 224 case OP_TYPEUPTO:
1689     case OP_TYPEMINUPTO:
1690     case OP_TYPEPOSUPTO:
1691     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1692     break;
1693 nigel 77
1694     /* End of branch */
1695    
1696     case OP_KET:
1697     case OP_KETRMAX:
1698     case OP_KETRMIN:
1699     case OP_ALT:
1700     return TRUE;
1701    
1702 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1703     MINUPTO, and POSUPTO may be followed by a multibyte character */
1704 nigel 77
1705     #ifdef SUPPORT_UTF8
1706     case OP_STAR:
1707     case OP_MINSTAR:
1708 nigel 93 case OP_POSSTAR:
1709 nigel 77 case OP_QUERY:
1710     case OP_MINQUERY:
1711 nigel 93 case OP_POSQUERY:
1712 nigel 77 case OP_UPTO:
1713     case OP_MINUPTO:
1714 nigel 93 case OP_POSUPTO:
1715 nigel 77 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1716     break;
1717     #endif
1718     }
1719     }
1720    
1721     return TRUE;
1722     }
1723    
1724    
1725    
1726     /*************************************************
1727     * Scan compiled regex for non-emptiness *
1728     *************************************************/
1729    
1730     /* This function is called to check for left recursive calls. We want to check
1731     the current branch of the current pattern to see if it could match the empty
1732     string. If it could, we must look outwards for branches at other levels,
1733     stopping when we pass beyond the bracket which is the subject of the recursion.
1734    
1735     Arguments:
1736     code points to start of the recursion
1737     endcode points to where to stop (current RECURSE item)
1738     bcptr points to the chain of current (unclosed) branch starts
1739     utf8 TRUE if in UTF-8 mode
1740    
1741     Returns: TRUE if what is matched could be empty
1742     */
1743    
1744     static BOOL
1745     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1746     BOOL utf8)
1747     {
1748     while (bcptr != NULL && bcptr->current >= code)
1749     {
1750     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1751     bcptr = bcptr->outer;
1752     }
1753     return TRUE;
1754     }
1755    
1756    
1757    
1758     /*************************************************
1759     * Check for POSIX class syntax *
1760     *************************************************/
1761    
1762     /* This function is called when the sequence "[:" or "[." or "[=" is
1763 ph10 295 encountered in a character class. It checks whether this is followed by a
1764 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1765 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
1766 nigel 77
1767 ph10 298 Originally, this function only recognized a sequence of letters between the
1768     terminators, but it seems that Perl recognizes any sequence of characters,
1769     though of course unknown POSIX names are subsequently rejected. Perl gives an
1770     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1771     didn't consider this to be a POSIX class. Likewise for [:1234:].
1772 ph10 295
1773 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
1774     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1775     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1776     below handles the special case of \], but does not try to do any other escape
1777     processing. This makes it different from Perl for cases such as [:l\ower:]
1778 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1779 ph10 298 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
1780 ph10 295 I think.
1781    
1782     Arguments:
1783 nigel 77 ptr pointer to the initial [
1784     endptr where to return the end pointer
1785    
1786     Returns: TRUE or FALSE
1787     */
1788    
1789     static BOOL
1790 ph10 295 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1791 nigel 77 {
1792     int terminator; /* Don't combine these lines; the Solaris cc */
1793     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1794 ph10 295 for (++ptr; *ptr != 0; ptr++)
1795 nigel 77 {
1796 ph10 295 if (*ptr == '\\' && ptr[1] == ']') ptr++; else
1797 ph10 298 {
1798     if (*ptr == ']') return FALSE;
1799 ph10 295 if (*ptr == terminator && ptr[1] == ']')
1800     {
1801     *endptr = ptr;
1802     return TRUE;
1803 ph10 298 }
1804     }
1805     }
1806 nigel 77 return FALSE;
1807     }
1808    
1809    
1810    
1811    
1812     /*************************************************
1813     * Check POSIX class name *
1814     *************************************************/
1815    
1816     /* This function is called to check the name given in a POSIX-style class entry
1817     such as [:alnum:].
1818    
1819     Arguments:
1820     ptr points to the first letter
1821     len the length of the name
1822    
1823     Returns: a value representing the name, or -1 if unknown
1824     */
1825    
1826     static int
1827     check_posix_name(const uschar *ptr, int len)
1828     {
1829 ph10 240 const char *pn = posix_names;
1830 nigel 77 register int yield = 0;
1831     while (posix_name_lengths[yield] != 0)
1832     {
1833     if (len == posix_name_lengths[yield] &&
1834 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
1835 ph10 243 pn += posix_name_lengths[yield] + 1;
1836 nigel 77 yield++;
1837     }
1838     return -1;
1839     }
1840    
1841    
1842     /*************************************************
1843     * Adjust OP_RECURSE items in repeated group *
1844     *************************************************/
1845    
1846     /* OP_RECURSE items contain an offset from the start of the regex to the group
1847     that is referenced. This means that groups can be replicated for fixed
1848     repetition simply by copying (because the recursion is allowed to refer to
1849     earlier groups that are outside the current group). However, when a group is
1850     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1851     it, after it has been compiled. This means that any OP_RECURSE items within it
1852     that refer to the group itself or any contained groups have to have their
1853 nigel 93 offsets adjusted. That one of the jobs of this function. Before it is called,
1854     the partially compiled regex must be temporarily terminated with OP_END.
1855 nigel 77
1856 nigel 93 This function has been extended with the possibility of forward references for
1857     recursions and subroutine calls. It must also check the list of such references
1858     for the group we are dealing with. If it finds that one of the recursions in
1859     the current group is on this list, it adjusts the offset in the list, not the
1860     value in the reference (which is a group number).
1861    
1862 nigel 77 Arguments:
1863     group points to the start of the group
1864     adjust the amount by which the group is to be moved
1865     utf8 TRUE in UTF-8 mode
1866     cd contains pointers to tables etc.
1867 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
1868 nigel 77
1869     Returns: nothing
1870     */
1871    
1872     static void
1873 nigel 93 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1874     uschar *save_hwm)
1875 nigel 77 {
1876     uschar *ptr = group;
1877 ph10 224
1878 nigel 77 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1879     {
1880 nigel 93 int offset;
1881     uschar *hc;
1882    
1883     /* See if this recursion is on the forward reference list. If so, adjust the
1884     reference. */
1885    
1886     for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1887     {
1888     offset = GET(hc, 0);
1889     if (cd->start_code + offset == ptr + 1)
1890     {
1891     PUT(hc, 0, offset + adjust);
1892     break;
1893     }
1894     }
1895    
1896     /* Otherwise, adjust the recursion offset if it's after the start of this
1897     group. */
1898    
1899     if (hc >= cd->hwm)
1900     {
1901     offset = GET(ptr, 1);
1902     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1903     }
1904    
1905 nigel 77 ptr += 1 + LINK_SIZE;
1906     }
1907     }
1908    
1909    
1910    
1911     /*************************************************
1912     * Insert an automatic callout point *
1913     *************************************************/
1914    
1915     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1916     callout points before each pattern item.
1917    
1918     Arguments:
1919     code current code pointer
1920     ptr current pattern pointer
1921     cd pointers to tables etc
1922    
1923     Returns: new code pointer
1924     */
1925    
1926     static uschar *
1927     auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1928     {
1929     *code++ = OP_CALLOUT;
1930     *code++ = 255;
1931     PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1932     PUT(code, LINK_SIZE, 0); /* Default length */
1933     return code + 2*LINK_SIZE;
1934     }
1935    
1936    
1937    
1938     /*************************************************
1939     * Complete a callout item *
1940     *************************************************/
1941    
1942     /* A callout item contains the length of the next item in the pattern, which
1943     we can't fill in till after we have reached the relevant point. This is used
1944     for both automatic and manual callouts.
1945    
1946     Arguments:
1947     previous_callout points to previous callout item
1948     ptr current pattern pointer
1949     cd pointers to tables etc
1950    
1951     Returns: nothing
1952     */
1953    
1954     static void
1955     complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1956     {
1957     int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1958     PUT(previous_callout, 2 + LINK_SIZE, length);
1959     }
1960    
1961    
1962    
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches upwards from the start for the first character
that has an "other" case, and then extends the result for as long as the
following characters have other cases that are consecutive. Each call returns
one such range and updates the start value so that the next call carries on
from where this one stopped.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more (in which case
              none of the output values has been written)
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c, first, expected;

/* Find the first character in the range that has an other case. */

c = *cptr;
while (c <= d)
  {
  first = _pcre_ucp_othercase(c);
  if (first != NOTACHAR) break;
  c++;
  }

if (c > d) return FALSE;

*ocptr = first;

/* Extend while successive characters map to successive other-case values. */

expected = first + 1;
c++;
while (c <= d && _pcre_ucp_othercase(c) == expected)
  {
  expected++;
  c++;
  }

*odptr = expected - 1;
*cptr = c;

return TRUE;
}
#endif  /* SUPPORT_UCP */
2008    
2009    
2010 nigel 93
2011 nigel 77 /*************************************************
2012 nigel 93 * Check if auto-possessifying is possible *
2013     *************************************************/
2014    
2015     /* This function is called for unlimited repeats of certain items, to see
2016     whether the next thing could possibly match the repeated item. If not, it makes
2017     sense to automatically possessify the repeated item.
2018    
2019     Arguments:
2020     op_code the repeated op code
2021     this data for this item, depends on the opcode
2022     utf8 TRUE in UTF-8 mode
2023     utf8_char used for utf8 character bytes, NULL if not relevant
2024     ptr next character in pattern
2025     options options bits
2026     cd contains pointers to tables etc.
2027    
2028     Returns: TRUE if possessifying is wanted
2029     */
2030    
2031     static BOOL
2032     check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2033     const uschar *ptr, int options, compile_data *cd)
2034     {
2035     int next;
2036    
2037     /* Skip whitespace and comments in extended mode */
2038    
2039     if ((options & PCRE_EXTENDED) != 0)
2040     {
2041     for (;;)
2042     {
2043     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2044     if (*ptr == '#')
2045     {
2046     while (*(++ptr) != 0)
2047     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2048     }
2049     else break;
2050     }
2051     }
2052    
2053     /* If the next item is one that we can handle, get its value. A non-negative
2054     value is a character, a negative value is an escape value. */
2055    
2056     if (*ptr == '\\')
2057     {
2058     int temperrorcode = 0;
2059     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2060     if (temperrorcode != 0) return FALSE;
2061     ptr++; /* Point after the escape sequence */
2062     }
2063    
2064     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2065     {
2066     #ifdef SUPPORT_UTF8
2067     if (utf8) { GETCHARINC(next, ptr); } else
2068     #endif
2069     next = *ptr++;
2070     }
2071    
2072     else return FALSE;
2073    
2074     /* Skip whitespace and comments in extended mode */
2075    
2076     if ((options & PCRE_EXTENDED) != 0)
2077     {
2078     for (;;)
2079     {
2080     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2081     if (*ptr == '#')
2082     {
2083     while (*(++ptr) != 0)
2084     if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2085     }
2086     else break;
2087     }
2088     }
2089    
2090     /* If the next thing is itself optional, we have to give up. */
2091    
2092     if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2093     return FALSE;
2094    
2095     /* Now compare the next item with the previous opcode. If the previous is a
2096     positive single character match, "item" either contains the character or, if
2097     "item" is greater than 127 in utf8 mode, the character's bytes are in
2098     utf8_char. */
2099    
2100    
2101     /* Handle cases when the next item is a character. */
2102    
2103     if (next >= 0) switch(op_code)
2104     {
2105     case OP_CHAR:
2106     #ifdef SUPPORT_UTF8
2107     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2108     #endif
2109     return item != next;
2110    
2111     /* For CHARNC (caseless character) we must check the other case. If we have
2112     Unicode property support, we can use it to test the other case of
2113     high-valued characters. */
2114    
2115     case OP_CHARNC:
2116     #ifdef SUPPORT_UTF8
2117     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2118     #endif
2119     if (item == next) return FALSE;
2120     #ifdef SUPPORT_UTF8
2121     if (utf8)
2122     {
2123     unsigned int othercase;
2124     if (next < 128) othercase = cd->fcc[next]; else
2125     #ifdef SUPPORT_UCP
2126     othercase = _pcre_ucp_othercase((unsigned int)next);
2127     #else
2128     othercase = NOTACHAR;
2129     #endif
2130     return (unsigned int)item != othercase;
2131     }
2132     else
2133     #endif /* SUPPORT_UTF8 */
2134     return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2135    
2136     /* For OP_NOT, "item" must be a single-byte character. */
2137    
2138     case OP_NOT:
2139     if (item == next) return TRUE;
2140     if ((options & PCRE_CASELESS) == 0) return FALSE;
2141     #ifdef SUPPORT_UTF8
2142     if (utf8)
2143     {
2144     unsigned int othercase;
2145     if (next < 128) othercase = cd->fcc[next]; else
2146     #ifdef SUPPORT_UCP
2147     othercase = _pcre_ucp_othercase(next);
2148     #else
2149     othercase = NOTACHAR;
2150     #endif
2151     return (unsigned int)item == othercase;
2152     }
2153     else
2154     #endif /* SUPPORT_UTF8 */
2155     return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2156    
2157     case OP_DIGIT:
2158     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2159    
2160     case OP_NOT_DIGIT:
2161     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2162    
2163     case OP_WHITESPACE:
2164     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2165    
2166     case OP_NOT_WHITESPACE:
2167     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2168    
2169     case OP_WORDCHAR:
2170     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2171    
2172     case OP_NOT_WORDCHAR:
2173     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2174    
2175 ph10 180 case OP_HSPACE:
2176     case OP_NOT_HSPACE:
2177     switch(next)
2178     {
2179     case 0x09:
2180     case 0x20:
2181     case 0xa0:
2182     case 0x1680:
2183     case 0x180e:
2184     case 0x2000:
2185     case 0x2001:
2186     case 0x2002:
2187     case 0x2003:
2188     case 0x2004:
2189     case 0x2005:
2190     case 0x2006:
2191     case 0x2007:
2192     case 0x2008:
2193     case 0x2009:
2194     case 0x200A:
2195     case 0x202f:
2196     case 0x205f:
2197     case 0x3000:
2198     return op_code != OP_HSPACE;
2199     default:
2200     return op_code == OP_HSPACE;
2201     }
2202    
2203     case OP_VSPACE:
2204     case OP_NOT_VSPACE:
2205     switch(next)
2206     {
2207     case 0x0a:
2208     case 0x0b:
2209     case 0x0c:
2210     case 0x0d:
2211     case 0x85:
2212     case 0x2028:
2213     case 0x2029:
2214     return op_code != OP_VSPACE;
2215     default:
2216     return op_code == OP_VSPACE;
2217     }
2218    
2219 nigel 93 default:
2220     return FALSE;
2221     }
2222    
2223    
2224     /* Handle the case when the next item is \d, \s, etc. */
2225    
2226     switch(op_code)
2227     {
2228     case OP_CHAR:
2229     case OP_CHARNC:
2230     #ifdef SUPPORT_UTF8
2231     if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2232     #endif
2233     switch(-next)
2234     {
2235     case ESC_d:
2236     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2237    
2238     case ESC_D:
2239     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2240    
2241     case ESC_s:
2242     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2243    
2244     case ESC_S:
2245     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2246    
2247     case ESC_w:
2248     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2249    
2250     case ESC_W:
2251     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2252 ph10 182
2253 ph10 180 case ESC_h:
2254     case ESC_H:
2255     switch(item)
2256     {
2257     case 0x09:
2258     case 0x20:
2259     case 0xa0:
2260     case 0x1680:
2261     case 0x180e:
2262     case 0x2000:
2263     case 0x2001:
2264     case 0x2002:
2265     case 0x2003:
2266     case 0x2004:
2267     case 0x2005:
2268     case 0x2006:
2269     case 0x2007:
2270     case 0x2008:
2271     case 0x2009:
2272     case 0x200A:
2273     case 0x202f:
2274     case 0x205f:
2275     case 0x3000:
2276     return -next != ESC_h;
2277     default:
2278     return -next == ESC_h;
2279 ph10 182 }
2280    
2281 ph10 180 case ESC_v:
2282     case ESC_V:
2283     switch(item)
2284     {
2285     case 0x0a:
2286     case 0x0b:
2287     case 0x0c:
2288     case 0x0d:
2289     case 0x85:
2290     case 0x2028:
2291     case 0x2029:
2292     return -next != ESC_v;
2293     default:
2294     return -next == ESC_v;
2295 ph10 182 }
2296 nigel 93
2297     default:
2298     return FALSE;
2299     }
2300    
2301     case OP_DIGIT:
2302 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2303     next == -ESC_h || next == -ESC_v;
2304 nigel 93
2305     case OP_NOT_DIGIT:
2306     return next == -ESC_d;
2307    
2308     case OP_WHITESPACE:
2309     return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2310    
2311     case OP_NOT_WHITESPACE:
2312 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2313 nigel 93
2314 ph10 180 case OP_HSPACE:
2315     return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2316    
2317     case OP_NOT_HSPACE:
2318     return next == -ESC_h;
2319 ph10 182
2320 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2321 ph10 182 case OP_VSPACE:
2322 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2323    
2324     case OP_NOT_VSPACE:
2325 ph10 182 return next == -ESC_v;
2326 ph10 180
2327 nigel 93 case OP_WORDCHAR:
2328 ph10 180 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2329 nigel 93
2330     case OP_NOT_WORDCHAR:
2331     return next == -ESC_w || next == -ESC_d;
2332 ph10 182
2333 nigel 93 default:
2334     return FALSE;
2335     }
2336    
2337     /* Control does not reach here */
2338     }
2339    
2340    
2341    
2342     /*************************************************
2343 nigel 77 * Compile one branch *
2344     *************************************************/
2345    
2346 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
2347 nigel 77 changed during the branch, the pointer is used to change the external options
2348 nigel 93 bits. This function is used during the pre-compile phase when we are trying
2349     to find out the amount of memory needed, as well as during the real compile
2350     phase. The value of lengthptr distinguishes the two phases.
2351 nigel 77
2352     Arguments:
2353     optionsptr pointer to the option bits
2354     codeptr points to the pointer to the current code point
2355     ptrptr points to the current pattern pointer
2356     errorcodeptr points to error code variable
2357     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2358     reqbyteptr set to the last literal character required, else < 0
2359     bcptr points to current branch chain
2360     cd contains pointers to tables etc.
2361 nigel 93 lengthptr NULL during the real compile phase
2362     points to length accumulator during pre-compile phase
2363 nigel 77
2364     Returns: TRUE on success
2365     FALSE, with *errorcodeptr set non-zero on error
2366     */
2367    
2368     static BOOL
2369 nigel 93 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2370     int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2371     compile_data *cd, int *lengthptr)
2372 nigel 77 {
2373     int repeat_type, op_type;
2374     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2375     int bravalue = 0;
2376     int greedy_default, greedy_non_default;
2377     int firstbyte, reqbyte;
2378     int zeroreqbyte, zerofirstbyte;
2379     int req_caseopt, reqvary, tempreqvary;
2380     int options = *optionsptr;
2381     int after_manual_callout = 0;
2382 nigel 93 int length_prevgroup = 0;
2383 nigel 77 register int c;
2384     register uschar *code = *codeptr;
2385 nigel 93 uschar *last_code = code;
2386     uschar *orig_code = code;
2387 nigel 77 uschar *tempcode;
2388     BOOL inescq = FALSE;
2389     BOOL groupsetfirstbyte = FALSE;
2390     const uschar *ptr = *ptrptr;
2391     const uschar *tempptr;
2392     uschar *previous = NULL;
2393     uschar *previous_callout = NULL;
2394 nigel 93 uschar *save_hwm = NULL;
2395 nigel 77 uschar classbits[32];
2396    
2397     #ifdef SUPPORT_UTF8
2398     BOOL class_utf8;
2399     BOOL utf8 = (options & PCRE_UTF8) != 0;
2400     uschar *class_utf8data;
2401 ph10 300 uschar *class_utf8data_base;
2402 nigel 77 uschar utf8_char[6];
2403     #else
2404     BOOL utf8 = FALSE;
2405 nigel 93 uschar *utf8_char = NULL;
2406 nigel 77 #endif
2407    
2408 nigel 93 #ifdef DEBUG
2409     if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2410     #endif
2411    
2412 nigel 77 /* Set up the default and non-default settings for greediness */
2413    
2414     greedy_default = ((options & PCRE_UNGREEDY) != 0);
2415     greedy_non_default = greedy_default ^ 1;
2416    
2417     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2418     matching encountered yet". It gets changed to REQ_NONE if we hit something that
2419     matches a non-fixed char first char; reqbyte just remains unset if we never
2420     find one.
2421    
2422     When we hit a repeat whose minimum is zero, we may have to adjust these values
2423     to take the zero repeat into account. This is implemented by setting them to
2424     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2425     item types that can be repeated set these backoff variables appropriately. */
2426    
2427     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2428    
2429     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2430     according to the current setting of the caseless flag. REQ_CASELESS is a bit
2431     value > 255. It is added into the firstbyte or reqbyte variables to record the
2432     case status of the value. This is used only for ASCII characters. */
2433    
2434     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2435    
2436     /* Switch on next character until the end of the branch */
2437    
2438     for (;; ptr++)
2439     {
2440     BOOL negate_class;
2441 ph10 286 BOOL should_flip_negation;
2442 nigel 77 BOOL possessive_quantifier;
2443     BOOL is_quantifier;
2444 nigel 93 BOOL is_recurse;
2445 ph10 180 BOOL reset_bracount;
2446 nigel 77 int class_charcount;
2447     int class_lastchar;
2448     int newoptions;
2449     int recno;
2450 ph10 172 int refsign;
2451 nigel 77 int skipbytes;
2452     int subreqbyte;
2453     int subfirstbyte;
2454 nigel 93 int terminator;
2455 nigel 77 int mclength;
2456     uschar mcbuffer[8];
2457    
2458 nigel 93 /* Get next byte in the pattern */
2459 nigel 77
2460     c = *ptr;
2461    
2462 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
2463     previous cycle of this loop. */
2464    
2465     if (lengthptr != NULL)
2466     {
2467     #ifdef DEBUG
2468     if (code > cd->hwm) cd->hwm = code; /* High water info */
2469     #endif
2470     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2471     {
2472     *errorcodeptr = ERR52;
2473     goto FAILED;
2474     }
2475    
2476     /* There is at least one situation where code goes backwards: this is the
2477     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2478     the class is simply eliminated. However, it is created first, so we have to
2479     allow memory for it. Therefore, don't ever reduce the length at this point.
2480     */
2481    
2482     if (code < last_code) code = last_code;
2483 ph10 202
2484     /* Paranoid check for integer overflow */
2485    
2486     if (OFLOW_MAX - *lengthptr < code - last_code)
2487     {
2488     *errorcodeptr = ERR20;
2489     goto FAILED;
2490     }
2491    
2492 nigel 93 *lengthptr += code - last_code;
2493     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2494    
2495     /* If "previous" is set and it is not at the start of the work space, move
2496     it back to there, in order to avoid filling up the work space. Otherwise,
2497     if "previous" is NULL, reset the current code pointer to the start. */
2498    
2499     if (previous != NULL)
2500     {
2501     if (previous > orig_code)
2502     {
2503     memmove(orig_code, previous, code - previous);
2504     code -= previous - orig_code;
2505     previous = orig_code;
2506     }
2507     }
2508     else code = orig_code;
2509    
2510     /* Remember where this code item starts so we can pick up the length
2511     next time round. */
2512    
2513     last_code = code;
2514     }
2515    
2516     /* In the real compile phase, just check the workspace used by the forward
2517     reference list. */
2518    
2519     else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2520     {
2521     *errorcodeptr = ERR52;
2522     goto FAILED;
2523     }
2524    
2525 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
2526    
2527     if (inescq && c != 0)
2528     {
2529     if (c == '\\' && ptr[1] == 'E')
2530     {
2531     inescq = FALSE;
2532     ptr++;
2533     continue;
2534     }
2535     else
2536     {
2537     if (previous_callout != NULL)
2538     {
2539 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2540     complete_callout(previous_callout, ptr, cd);
2541 nigel 77 previous_callout = NULL;
2542     }
2543     if ((options & PCRE_AUTO_CALLOUT) != 0)
2544     {
2545     previous_callout = code;
2546     code = auto_callout(code, ptr, cd);
2547     }
2548     goto NORMAL_CHAR;
2549     }
2550     }
2551    
2552     /* Fill in length of a previous callout, except when the next thing is
2553     a quantifier. */
2554    
2555     is_quantifier = c == '*' || c == '+' || c == '?' ||
2556     (c == '{' && is_counted_repeat(ptr+1));
2557    
2558     if (!is_quantifier && previous_callout != NULL &&
2559     after_manual_callout-- <= 0)
2560     {
2561 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2562     complete_callout(previous_callout, ptr, cd);
2563 nigel 77 previous_callout = NULL;
2564     }
2565    
2566     /* In extended mode, skip white space and comments */
2567    
2568     if ((options & PCRE_EXTENDED) != 0)
2569     {
2570     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2571     if (c == '#')
2572     {
2573 nigel 93 while (*(++ptr) != 0)
2574 nigel 91 {
2575 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2576 nigel 91 }
2577 nigel 93 if (*ptr != 0) continue;
2578    
2579 nigel 91 /* Else fall through to handle end of string */
2580     c = 0;
2581 nigel 77 }
2582     }
2583    
2584     /* No auto callout for quantifiers. */
2585    
2586     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2587     {
2588     previous_callout = code;
2589     code = auto_callout(code, ptr, cd);
2590     }
2591    
2592     switch(c)
2593     {
2594 nigel 93 /* ===================================================================*/
2595     case 0: /* The branch terminates at string end */
2596     case '|': /* or | or ) */
2597 nigel 77 case ')':
2598     *firstbyteptr = firstbyte;
2599     *reqbyteptr = reqbyte;
2600     *codeptr = code;
2601     *ptrptr = ptr;
2602 nigel 93 if (lengthptr != NULL)
2603     {
2604 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
2605     {
2606     *errorcodeptr = ERR20;
2607     goto FAILED;
2608     }
2609 nigel 93 *lengthptr += code - last_code; /* To include callout length */
2610     DPRINTF((">> end branch\n"));
2611     }
2612 nigel 77 return TRUE;
2613    
2614 nigel 93
2615     /* ===================================================================*/
2616 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
2617     the setting of any following char as a first character. */
2618    
2619     case '^':
2620     if ((options & PCRE_MULTILINE) != 0)
2621     {
2622     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2623     }
2624     previous = NULL;
2625     *code++ = OP_CIRC;
2626     break;
2627    
2628     case '$':
2629     previous = NULL;
2630     *code++ = OP_DOLL;
2631     break;
2632    
2633     /* There can never be a first char if '.' is first, whatever happens about
2634     repeats. The value of reqbyte doesn't change either. */
2635    
2636     case '.':
2637     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2638     zerofirstbyte = firstbyte;
2639     zeroreqbyte = reqbyte;
2640     previous = code;
2641     *code++ = OP_ANY;
2642     break;
2643    
2644 nigel 93
2645     /* ===================================================================*/
2646 nigel 87 /* Character classes. If the included characters are all < 256, we build a
2647     32-byte bitmap of the permitted characters, except in the special case
2648     where there is only one such character. For negated classes, we build the
2649     map as usual, then invert it at the end. However, we use a different opcode
2650     so that data characters > 255 can be handled correctly.
2651 nigel 77
2652     If the class contains characters outside the 0-255 range, a different
2653     opcode is compiled. It may optionally have a bit map for characters < 256,
2654     but those above are explicitly listed afterwards. A flag byte tells
2655     whether the bitmap is present, and whether this is a negated class or not.
2656     */
2657    
2658     case '[':
2659     previous = code;
2660    
2661     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2662     they are encountered at the top level, so we'll do that too. */
2663    
2664     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2665 ph10 295 check_posix_syntax(ptr, &tempptr))
2666 nigel 77 {
2667     *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2668     goto FAILED;
2669     }
2670    
2671 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
2672 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
2673 ph10 205 skip them too. This makes for compatibility with Perl. */
2674 ph10 208
2675 ph10 205 negate_class = FALSE;
2676     for (;;)
2677 nigel 77 {
2678     c = *(++ptr);
2679 ph10 205 if (c == '\\')
2680     {
2681 ph10 208 if (ptr[1] == 'E') ptr++;
2682 ph10 205 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2683 ph10 208 else break;
2684 ph10 205 }
2685     else if (!negate_class && c == '^')
2686     negate_class = TRUE;
2687     else break;
2688 ph10 208 }
2689 nigel 77
2690 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
2691     negation flag at the end, so that support for characters > 255 works
2692 ph10 264 correctly (they are all included in the class). */
2693    
2694     should_flip_negation = FALSE;
2695    
2696 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
2697 nigel 93 of just a single character (as long as it's < 256). However, for higher
2698     valued UTF-8 characters, we don't yet do any optimization. */
2699 nigel 77
2700     class_charcount = 0;
2701     class_lastchar = -1;
2702    
2703 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
2704     temporary bit of memory, in case the class contains only 1 character (less
2705     than 256), because in that case the compiled code doesn't use the bit map.
2706     */
2707    
2708     memset(classbits, 0, 32 * sizeof(uschar));
2709    
2710 nigel 77 #ifdef SUPPORT_UTF8
2711     class_utf8 = FALSE; /* No chars >= 256 */
2712 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2713 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2714 nigel 77 #endif
2715    
2716     /* Process characters until ] is reached. By writing this as a "do" it
2717 nigel 93 means that an initial ] is taken as a data character. At the start of the
2718     loop, c contains the first byte of the character. */
2719 nigel 77
2720 nigel 93 if (c != 0) do
2721 nigel 77 {
2722 nigel 93 const uschar *oldptr;
2723    
2724 nigel 77 #ifdef SUPPORT_UTF8
2725     if (utf8 && c > 127)
2726     { /* Braces are required because the */
2727     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2728     }
2729 ph10 309
2730 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2731 ph10 309 data and reset the pointer. This is so that very large classes that
2732 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
2733 ph10 309 (which is on the stack). */
2734    
2735 ph10 300 if (lengthptr != NULL)
2736     {
2737     *lengthptr += class_utf8data - class_utf8data_base;
2738 ph10 309 class_utf8data = class_utf8data_base;
2739     }
2740    
2741 nigel 77 #endif
2742    
2743     /* Inside \Q...\E everything is literal except \E */
2744    
2745     if (inescq)
2746     {
2747 nigel 93 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2748 nigel 77 {
2749 nigel 93 inescq = FALSE; /* Reset literal state */
2750     ptr++; /* Skip the 'E' */
2751     continue; /* Carry on with next */
2752 nigel 77 }
2753 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
2754 nigel 77 }
2755    
2756     /* Handle POSIX class names. Perl allows a negation extension of the
2757     form [:^name:]. A square bracket that doesn't match the syntax is
2758     treated as a literal. We also recognize the POSIX constructions
2759     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2760     5.6 and 5.8 do. */
2761    
2762     if (c == '[' &&
2763     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2764 ph10 295 check_posix_syntax(ptr, &tempptr))
2765 nigel 77 {
2766     BOOL local_negate = FALSE;
2767 nigel 87 int posix_class, taboffset, tabopt;
2768 nigel 77 register const uschar *cbits = cd->cbits;
2769 nigel 87 uschar pbits[32];
2770 nigel 77
2771     if (ptr[1] != ':')
2772     {
2773     *errorcodeptr = ERR31;
2774     goto FAILED;
2775     }
2776    
2777     ptr += 2;
2778     if (*ptr == '^')
2779     {
2780     local_negate = TRUE;
2781 ph10 286 should_flip_negation = TRUE; /* Note negative special */
2782 nigel 77 ptr++;
2783     }
2784    
2785     posix_class = check_posix_name(ptr, tempptr - ptr);
2786     if (posix_class < 0)
2787     {
2788     *errorcodeptr = ERR30;
2789     goto FAILED;
2790     }
2791    
2792     /* If matching is caseless, upper and lower are converted to
2793     alpha. This relies on the fact that the class table starts with
2794     alpha, lower, upper as the first 3 entries. */
2795    
2796     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2797     posix_class = 0;
2798    
2799 nigel 87 /* We build the bit map for the POSIX class in a chunk of local store
2800     because we may be adding and subtracting from it, and we don't want to
2801     subtract bits that may be in the main map already. At the end we or the
2802     result into the bit map that is being built. */
2803 nigel 77
2804     posix_class *= 3;
2805 nigel 87
2806     /* Copy in the first table (always present) */
2807    
2808     memcpy(pbits, cbits + posix_class_maps[posix_class],
2809     32 * sizeof(uschar));
2810    
2811     /* If there is a second table, add or remove it as required. */
2812    
2813     taboffset = posix_class_maps[posix_class + 1];
2814     tabopt = posix_class_maps[posix_class + 2];
2815    
2816     if (taboffset >= 0)
2817 nigel 77 {
2818 nigel 87 if (tabopt >= 0)
2819     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2820 nigel 77 else
2821 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2822 nigel 77 }
2823    
2824 nigel 87 /* Now see if we need to remove any special characters. An option
2825     value of 1 removes vertical space and 2 removes underscore. */
2826    
2827     if (tabopt < 0) tabopt = -tabopt;
2828     if (tabopt == 1) pbits[1] &= ~0x3c;
2829     else if (tabopt == 2) pbits[11] &= 0x7f;
2830    
2831     /* Add the POSIX table or its complement into the main table that is
2832     being built and we are done. */
2833    
2834     if (local_negate)
2835     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2836     else
2837     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2838    
2839 nigel 77 ptr = tempptr + 1;
2840     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2841     continue; /* End of POSIX syntax handling */
2842     }
2843    
2844     /* Backslash may introduce a single character, or it may introduce one
2845 nigel 93 of the specials, which just set a flag. The sequence \b is a special
2846     case. Inside a class (and only there) it is treated as backspace.
2847     Elsewhere it marks a word boundary. Other escapes have preset maps ready
2848 ph10 205 to 'or' into the one we are building. We assume they have more than one
2849 nigel 77 character in them, so set class_charcount bigger than one. */
2850    
2851     if (c == '\\')
2852     {
2853 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2854     if (*errorcodeptr != 0) goto FAILED;
2855 nigel 77
2856 ph10 275 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2857 nigel 77 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2858 nigel 93 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2859 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
2860     {
2861     if (ptr[1] == '\\' && ptr[2] == 'E')
2862     {
2863     ptr += 2; /* avoid empty string */
2864     }
2865     else inescq = TRUE;
2866     continue;
2867     }
2868 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2869 nigel 77
2870     if (c < 0)
2871     {
2872     register const uschar *cbits = cd->cbits;
2873     class_charcount += 2; /* Greater than 1 is what matters */
2874 nigel 93
2875     /* Save time by not doing this in the pre-compile phase. */
2876    
2877     if (lengthptr == NULL) switch (-c)
2878 nigel 77 {
2879     case ESC_d:
2880     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2881     continue;
2882    
2883     case ESC_D:
2884 ph10 286 should_flip_negation = TRUE;
2885 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2886     continue;
2887    
2888     case ESC_w:
2889     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2890     continue;
2891    
2892     case ESC_W:
2893 ph10 286 should_flip_negation = TRUE;
2894 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2895     continue;
2896    
2897     case ESC_s:
2898     for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2899     classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2900     continue;
2901    
2902     case ESC_S:
2903 ph10 286 should_flip_negation = TRUE;
2904 nigel 77 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2905     classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2906     continue;
2907    
2908 nigel 93 default: /* Not recognized; fall through */
2909     break; /* Need "default" setting to stop compiler warning. */
2910     }
2911    
2912     /* In the pre-compile phase, just do the recognition. */
2913    
2914     else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2915     c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2916 ph10 180
2917 ph10 178 /* We need to deal with \H, \h, \V, and \v in both phases because
2918     they use extra memory. */
2919 ph10 180
2920 ph10 178 if (-c == ESC_h)
2921     {
2922     SETBIT(classbits, 0x09); /* HT */
2923     SETBIT(classbits, 0x20); /* SPACE */
2924 ph10 180 SETBIT(classbits, 0xa0); /* NBSP */
2925 ph10 178 #ifdef SUPPORT_UTF8
2926     if (utf8)
2927 ph10 180 {
2928 ph10 178 class_utf8 = TRUE;
2929     *class_utf8data++ = XCL_SINGLE;
2930 ph10 180 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2931 ph10 178 *class_utf8data++ = XCL_SINGLE;
2932 ph10 180 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2933     *class_utf8data++ = XCL_RANGE;
2934     class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2935     class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2936 ph10 178 *class_utf8data++ = XCL_SINGLE;
2937 ph10 180 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2938 ph10 178 *class_utf8data++ = XCL_SINGLE;
2939 ph10 180 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2940 ph10 178 *class_utf8data++ = XCL_SINGLE;
2941 ph10 180 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2942     }
2943     #endif
2944     continue;
2945     }
2946 nigel 93
2947 ph10 178 if (-c == ESC_H)
2948     {
2949     for (c = 0; c < 32; c++)
2950     {
2951     int x = 0xff;
2952     switch (c)
2953 ph10 180 {
2954 ph10 178 case 0x09/8: x ^= 1 << (0x09%8); break;
2955     case 0x20/8: x ^= 1 << (0x20%8); break;
2956     case 0xa0/8: x ^= 1 << (0xa0%8); break;
2957     default: break;
2958     }
2959     classbits[c] |= x;
2960 ph10 180 }
2961    
2962 ph10 178 #ifdef SUPPORT_UTF8
2963     if (utf8)
2964 ph10 180 {
2965 ph10 178 class_utf8 = TRUE;
2966 ph10 180 *class_utf8data++ = XCL_RANGE;
2967     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2968     class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2969     *class_utf8data++ = XCL_RANGE;
2970     class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2971     class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2972     *class_utf8data++ = XCL_RANGE;
2973     class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2974     class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2975     *class_utf8data++ = XCL_RANGE;
2976     class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2977     class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2978     *class_utf8data++ = XCL_RANGE;
2979     class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2980     class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2981     *class_utf8data++ = XCL_RANGE;
2982     class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2983     class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2984     *class_utf8data++ = XCL_RANGE;
2985     class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2986     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2987     }
2988     #endif
2989     continue;
2990     }
2991 ph10 178
2992     if (-c == ESC_v)
2993     {
2994     SETBIT(classbits, 0x0a); /* LF */
2995     SETBIT(classbits, 0x0b); /* VT */
2996 ph10 180 SETBIT(classbits, 0x0c); /* FF */
2997     SETBIT(classbits, 0x0d); /* CR */
2998     SETBIT(classbits, 0x85); /* NEL */
2999 ph10 178 #ifdef SUPPORT_UTF8
3000     if (utf8)
3001 ph10 180 {
3002 ph10 178 class_utf8 = TRUE;
3003 ph10 180 *class_utf8data++ = XCL_RANGE;
3004     class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3005     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3006     }
3007     #endif
3008     continue;
3009     }
3010 ph10 178
3011     if (-c == ESC_V)
3012     {
3013     for (c = 0; c < 32; c++)
3014     {
3015     int x = 0xff;
3016     switch (c)
3017 ph10 180 {
3018 ph10 178 case 0x0a/8: x ^= 1 << (0x0a%8);
3019     x ^= 1 << (0x0b%8);
3020     x ^= 1 << (0x0c%8);
3021 ph10 180 x ^= 1 << (0x0d%8);
3022 ph10 178 break;
3023     case 0x85/8: x ^= 1 << (0x85%8); break;
3024     default: break;
3025     }
3026     classbits[c] |= x;
3027 ph10 180 }
3028    
3029 ph10 178 #ifdef SUPPORT_UTF8
3030     if (utf8)
3031 ph10 180 {
3032 ph10 178 class_utf8 = TRUE;
3033 ph10 180 *class_utf8data++ = XCL_RANGE;
3034     class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3035     class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3036     *class_utf8data++ = XCL_RANGE;
3037     class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3038     class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3039     }
3040     #endif
3041     continue;
3042     }
3043 ph10 178
3044 nigel 93 /* We need to deal with \P and \p in both phases. */
3045    
3046 nigel 77 #ifdef SUPPORT_UCP
3047 nigel 93 if (-c == ESC_p || -c == ESC_P)
3048     {
3049     BOOL negated;
3050     int pdata;
3051     int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3052     if (ptype < 0) goto FAILED;
3053     class_utf8 = TRUE;
3054     *class_utf8data++ = ((-c == ESC_p) != negated)?
3055     XCL_PROP : XCL_NOTPROP;
3056     *class_utf8data++ = ptype;
3057     *class_utf8data++ = pdata;
3058     class_charcount -= 2; /* Not a < 256 character */
3059 nigel 77 continue;
3060 nigel 93 }
3061 nigel 77 #endif
3062 nigel 93 /* Unrecognized escapes are faulted if PCRE is running in its
3063     strict mode. By default, for compatibility with Perl, they are
3064     treated as literals. */
3065 nigel 77
3066 nigel 93 if ((options & PCRE_EXTRA) != 0)
3067     {
3068     *errorcodeptr = ERR7;
3069     goto FAILED;
3070     }
3071 nigel 77
3072 nigel 93 class_charcount -= 2; /* Undo the default count from above */
3073     c = *ptr; /* Get the final character and fall through */
3074 nigel 77 }
3075    
3076     /* Fall through if we have a single character (c >= 0). This may be
3077 nigel 93 greater than 256 in UTF-8 mode. */
3078 nigel 77
3079     } /* End of backslash handling */
3080    
3081     /* A single character may be followed by '-' to form a range. However,
3082     Perl does not permit ']' to be the end of the range. A '-' character
3083 nigel 93 at the end is treated as a literal. Perl ignores orphaned \E sequences
3084     entirely. The code for handling \Q and \E is messy. */
3085 nigel 77
3086 nigel 93 CHECK_RANGE:
3087     while (ptr[1] == '\\' && ptr[2] == 'E')
3088 nigel 77 {
3089 nigel 93 inescq = FALSE;
3090     ptr += 2;
3091     }
3092    
3093     oldptr = ptr;
3094 ph10 231
3095 ph10 230 /* Remember \r or \n */
3096 ph10 231
3097     if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3098    
3099 ph10 230 /* Check for range */
3100 nigel 93
3101     if (!inescq && ptr[1] == '-')
3102     {
3103 nigel 77 int d;
3104     ptr += 2;
3105 nigel 93 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3106 nigel 77
3107 nigel 93 /* If we hit \Q (not followed by \E) at this point, go into escaped
3108     mode. */
3109    
3110     while (*ptr == '\\' && ptr[1] == 'Q')
3111     {
3112     ptr += 2;
3113     if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3114     inescq = TRUE;
3115     break;
3116     }
3117    
3118     if (*ptr == 0 || (!inescq && *ptr == ']'))
3119     {
3120     ptr = oldptr;
3121     goto LONE_SINGLE_CHARACTER;
3122     }
3123    
3124 nigel 77 #ifdef SUPPORT_UTF8
3125     if (utf8)
3126     { /* Braces are required because the */
3127     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3128     }
3129     else
3130     #endif
3131     d = *ptr; /* Not UTF-8 mode */
3132    
3133     /* The second part of a range can be a single-character escape, but
3134     not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3135     in such circumstances. */
3136    
3137 nigel 93 if (!inescq && d == '\\')
3138 nigel 77 {
3139 nigel 93 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3140     if (*errorcodeptr != 0) goto FAILED;
3141 nigel 77
3142 ph10 275 /* \b is backspace; \X is literal X; \R is literal R; any other
3143 nigel 93 special means the '-' was literal */
3144 nigel 77
3145     if (d < 0)
3146     {
3147     if (d == -ESC_b) d = '\b';
3148 nigel 93 else if (d == -ESC_X) d = 'X';
3149     else if (d == -ESC_R) d = 'R'; else
3150 nigel 77 {
3151 nigel 93 ptr = oldptr;
3152 nigel 77 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3153     }
3154     }
3155     }
3156    
3157 nigel 93 /* Check that the two values are in the correct order. Optimize
3158     one-character ranges */
3159 nigel 77
3160 nigel 93 if (d < c)
3161     {
3162     *errorcodeptr = ERR8;
3163     goto FAILED;
3164     }
3165    
3166 nigel 77 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3167    
3168 ph10 230 /* Remember \r or \n */
3169 ph10 231
3170     if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3171    
3172 nigel 77 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3173     matching, we have to use an XCLASS with extra data items. Caseless
3174     matching for characters > 127 is available only if UCP support is
3175     available. */
3176    
3177     #ifdef SUPPORT_UTF8
3178     if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3179     {
3180     class_utf8 = TRUE;
3181    
3182     /* With UCP support, we can find the other case equivalents of
3183     the relevant characters. There may be several ranges. Optimize how
3184     they fit with the basic range. */
3185    
3186     #ifdef SUPPORT_UCP
3187     if ((options & PCRE_CASELESS) != 0)
3188     {
3189 nigel 93 unsigned int occ, ocd;
3190     unsigned int cc = c;
3191     unsigned int origd = d;
3192 nigel 77 while (get_othercase_range(&cc, origd, &occ, &ocd))
3193     {
3194 ph10 180 if (occ >= (unsigned int)c &&
3195     ocd <= (unsigned int)d)
3196 ph10 176 continue; /* Skip embedded ranges */
3197 nigel 77
3198 ph10 180 if (occ < (unsigned int)c &&
3199 ph10 176 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3200 nigel 77 { /* if there is overlap, */
3201     c = occ; /* noting that if occ < c */
3202     continue; /* we can't have ocd > d */
3203     } /* because a subrange is */
3204 ph10 180 if (ocd > (unsigned int)d &&
3205 ph10 176 occ <= (unsigned int)d + 1) /* always shorter than */
3206 nigel 77 { /* the basic range. */
3207     d = ocd;
3208     continue;
3209     }
3210    
3211     if (occ == ocd)
3212     {
3213     *class_utf8data++ = XCL_SINGLE;
3214     }
3215     else
3216     {
3217     *class_utf8data++ = XCL_RANGE;
3218     class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3219     }
3220     class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3221     }
3222     }
3223     #endif /* SUPPORT_UCP */
3224    
3225     /* Now record the original range, possibly modified for UCP caseless
3226     overlapping ranges. */
3227    
3228     *class_utf8data++ = XCL_RANGE;
3229     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3230     class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3231    
3232     /* With UCP support, we are done. Without UCP support, there is no
3233     caseless matching for UTF-8 characters > 127; we can use the bit map
3234     for the smaller ones. */
3235    
3236     #ifdef SUPPORT_UCP
3237     continue; /* With next character in the class */
3238     #else
3239     if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3240    
3241     /* Adjust upper limit and fall through to set up the map */
3242    
3243     d = 127;
3244    
3245     #endif /* SUPPORT_UCP */
3246     }
3247     #endif /* SUPPORT_UTF8 */
3248    
3249     /* We use the bit map for all cases when not in UTF-8 mode; else
3250     ranges that lie entirely within 0-127 when there is UCP support; else
3251     for partial ranges without UCP support. */
3252    
3253 nigel 93 class_charcount += d - c + 1;
3254     class_lastchar = d;
3255    
3256     /* We can save a bit of time by skipping this in the pre-compile. */
3257    
3258     if (lengthptr == NULL) for (; c <= d; c++)
3259 nigel 77 {
3260     classbits[c/8] |= (1 << (c&7));
3261     if ((options & PCRE_CASELESS) != 0)
3262     {
3263     int uc = cd->fcc[c]; /* flip case */
3264     classbits[uc/8] |= (1 << (uc&7));
3265     }
3266     }
3267    
3268     continue; /* Go get the next char in the class */
3269     }
3270    
3271     /* Handle a lone single character - we can get here for a normal
3272     non-escape char, or after \ that introduces a single character or for an
3273     apparent range that isn't. */
3274    
3275     LONE_SINGLE_CHARACTER:
3276 ph10 231
3277 nigel 77 /* Handle a character that cannot go in the bit map */
3278    
3279     #ifdef SUPPORT_UTF8
3280     if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3281     {
3282     class_utf8 = TRUE;
3283     *class_utf8data++ = XCL_SINGLE;
3284     class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3285    
3286     #ifdef SUPPORT_UCP
3287     if ((options & PCRE_CASELESS) != 0)
3288     {
3289 nigel 93 unsigned int othercase;
3290     if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3291 nigel 77 {
3292     *class_utf8data++ = XCL_SINGLE;
3293     class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3294     }
3295     }
3296     #endif /* SUPPORT_UCP */
3297    
3298     }
3299     else
3300     #endif /* SUPPORT_UTF8 */
3301    
3302     /* Handle a single-byte character */
3303     {
3304     classbits[c/8] |= (1 << (c&7));
3305     if ((options & PCRE_CASELESS) != 0)
3306     {
3307     c = cd->fcc[c]; /* flip case */
3308     classbits[c/8] |= (1 << (c&7));
3309     }
3310     class_charcount++;
3311     class_lastchar = c;
3312     }
3313     }
3314    
3315 nigel 93 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3316 nigel 77
3317 nigel 93 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3318 nigel 77
3319 nigel 93 if (c == 0) /* Missing terminating ']' */
3320     {
3321     *errorcodeptr = ERR6;
3322     goto FAILED;
3323     }
3324 ph10 231
3325    
3326 ph10 230 /* This code has been disabled because it would mean that \s counts as
3327     an explicit \r or \n reference, and that's not really what is wanted. Now
3328     we set the flag only if there is a literal "\r" or "\n" in the class. */
3329 ph10 227
3330 ph10 230 #if 0
3331 ph10 226 /* Remember whether \r or \n are in this class */
3332 ph10 227
3333 ph10 226 if (negate_class)
3334     {
3335 ph10 230 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3336 ph10 226 }
3337     else
3338     {
3339 ph10 230 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3340 ph10 227 }
3341 ph10 230 #endif
3342 ph10 227
3343 ph10 231
3344 nigel 77 /* If class_charcount is 1, we saw precisely one character whose value is
3345 ph10 227 less than 256. As long as there were no characters >= 128 and there was no
3346     use of \p or \P, in other words, no use of any XCLASS features, we can
3347     optimize.
3348    
3349 ph10 223 In UTF-8 mode, we can optimize the negative case only if there were no
3350     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3351     operate on single-bytes only. This is an historical hangover. Maybe one day
3352     we can tidy these opcodes to handle multi-byte characters.
3353 nigel 77
3354     The optimization throws away the bit map. We turn the item into a
3355     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3356     that OP_NOT does not support multibyte characters. In the positive case, it
3357     can cause firstbyte to be set. Otherwise, there can be no first char if
3358     this item is first, whatever repeat count may follow. In the case of
3359     reqbyte, save the previous value for reinstating. */
3360    
3361     #ifdef SUPPORT_UTF8
3362 ph10 227 if (class_charcount == 1 && !class_utf8 &&
3363 ph10 223 (!utf8 || !negate_class || class_lastchar < 128))
3364 nigel 77 #else
3365     if (class_charcount == 1)
3366     #endif
3367     {
3368     zeroreqbyte = reqbyte;
3369    
3370     /* The OP_NOT opcode works on one-byte characters only. */
3371    
3372     if (negate_class)
3373     {
3374     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3375     zerofirstbyte = firstbyte;
3376     *code++ = OP_NOT;
3377     *code++ = class_lastchar;
3378     break;
3379     }
3380    
3381     /* For a single, positive character, get the value into mcbuffer, and
3382     then we can handle this with the normal one-character code. */
3383    
3384     #ifdef SUPPORT_UTF8
3385     if (utf8 && class_lastchar > 127)
3386     mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3387     else
3388     #endif
3389     {
3390     mcbuffer[0] = class_lastchar;
3391     mclength = 1;
3392     }
3393     goto ONE_CHAR;
3394     } /* End of 1-char optimization */
3395    
3396     /* The general case - not the one-char optimization. If this is the first
3397     thing in the branch, there can be no first char setting, whatever the
3398     repeat count. Any reqbyte setting must remain unchanged after any kind of
3399     repeat. */
3400    
3401     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3402     zerofirstbyte = firstbyte;
3403     zeroreqbyte = reqbyte;
3404    
3405     /* If there are characters with values > 255, we have to compile an
3406 ph10 286 extended class, with its own opcode, unless there was a negated special
3407     such as \S in the class, because in that case all characters > 255 are in
3408     the class, so any that were explicitly given as well can be ignored. If
3409 ph10 264 (when there are explicit characters > 255 that must be listed) there are no
3410     characters < 256, we can omit the bitmap in the actual compiled code. */
3411 nigel 77
3412     #ifdef SUPPORT_UTF8
3413 ph10 264 if (class_utf8 && !should_flip_negation)
3414 nigel 77 {
3415     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3416     *code++ = OP_XCLASS;
3417     code += LINK_SIZE;
3418     *code = negate_class? XCL_NOT : 0;
3419    
3420 nigel 93 /* If the map is required, move up the extra data to make room for it;
3421     otherwise just move the code pointer to the end of the extra data. */
3422 nigel 77
3423     if (class_charcount > 0)
3424     {
3425     *code++ |= XCL_MAP;
3426 nigel 93 memmove(code + 32, code, class_utf8data - code);
3427 nigel 77 memcpy(code, classbits, 32);
3428 nigel 93 code = class_utf8data + 32;
3429 nigel 77 }
3430 nigel 93 else code = class_utf8data;
3431 nigel 77
3432     /* Now fill in the complete length of the item */
3433    
3434     PUT(previous, 1, code - previous);
3435     break; /* End of class handling */
3436     }
3437     #endif
3438    
3439 ph10 286 /* If there are no characters > 255, set the opcode to OP_CLASS or
3440     OP_NCLASS, depending on whether the whole class was negated and whether
3441     there were negative specials such as \S in the class. Then copy the 32-byte
3442 ph10 264 map into the code vector, negating it if necessary. */
3443 ph10 286
3444 ph10 264 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3445 nigel 77 if (negate_class)
3446     {
3447 nigel 93 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3448     for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3449 nigel 77 }
3450     else
3451     {
3452     memcpy(code, classbits, 32);
3453     }
3454     code += 32;
3455     break;
3456    
3457 nigel 93
3458     /* ===================================================================*/
3459 nigel 77 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3460     has been tested above. */
3461    
3462     case '{':
3463     if (!is_quantifier) goto NORMAL_CHAR;
3464     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3465     if (*errorcodeptr != 0) goto FAILED;
3466     goto REPEAT;
3467    
3468     case '*':
3469     repeat_min = 0;
3470     repeat_max = -1;
3471     goto REPEAT;
3472    
3473     case '+':
3474     repeat_min = 1;
3475     repeat_max = -1;
3476     goto REPEAT;
3477    
3478     case '?':
3479     repeat_min = 0;
3480     repeat_max = 1;
3481    
3482     REPEAT:
3483     if (previous == NULL)
3484     {
3485     *errorcodeptr = ERR9;
3486     goto FAILED;
3487     }
3488    
3489     if (repeat_min == 0)
3490     {
3491     firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3492     reqbyte = zeroreqbyte; /* Ditto */
3493     }
3494    
3495     /* Remember whether this is a variable length repeat */
3496    
3497     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3498    
3499     op_type = 0; /* Default single-char op codes */
3500     possessive_quantifier = FALSE; /* Default not possessive quantifier */
3501    
3502     /* Save start of previous item, in case we have to move it up to make space
3503     for an inserted OP_ONCE for the additional '+' extension. */
3504    
3505     tempcode = previous;
3506    
3507     /* If the next character is '+', we have a possessive quantifier. This
3508     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3509     If the next character is '?' this is a minimizing repeat, by default,
3510     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3511     repeat type to the non-default. */
3512    
3513     if (ptr[1] == '+')
3514     {
3515     repeat_type = 0; /* Force greedy */
3516     possessive_quantifier = TRUE;
3517     ptr++;
3518     }
3519     else if (ptr[1] == '?')
3520     {
3521     repeat_type = greedy_non_default;
3522     ptr++;
3523     }
3524     else repeat_type = greedy_default;
3525    
3526     /* If previous was a character match, abolish the item and generate a
3527     repeat item instead. If a char item has a minimum of more than one, ensure
3528     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3529     the first thing in a branch because the x will have gone into firstbyte
3530     instead. */
3531    
3532     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3533     {
3534     /* Deal with UTF-8 characters that take up more than one byte. It's
3535     easier to write this out separately than try to macrify it. Use c to
3536     hold the length of the character in bytes, plus 0x80 to flag that it's a
3537     length rather than a small character. */
3538    
3539     #ifdef SUPPORT_UTF8
3540     if (utf8 && (code[-1] & 0x80) != 0)
3541     {
3542     uschar *lastchar = code - 1;
3543     while((*lastchar & 0xc0) == 0x80) lastchar--;
3544     c = code - lastchar; /* Length of UTF-8 character */
3545     memcpy(utf8_char, lastchar, c); /* Save the char */
3546     c |= 0x80; /* Flag c as a length */
3547     }
3548     else
3549     #endif
3550    
3551     /* Handle the case of a single byte - either with no UTF8 support, or
3552     with UTF-8 disabled, or for a UTF-8 character < 128. */
3553    
3554     {
3555     c = code[-1];
3556     if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3557     }
3558    
3559 nigel 93 /* If the repetition is unlimited, it pays to see if the next thing on
3560     the line is something that cannot possibly match this character. If so,
3561     automatically possessifying this item gains some performance in the case
3562     where the match fails. */
3563    
3564     if (!possessive_quantifier &&
3565     repeat_max < 0 &&
3566     check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3567     options, cd))
3568     {
3569     repeat_type = 0; /* Force greedy */
3570     possessive_quantifier = TRUE;
3571     }
3572    
3573 nigel 77 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3574     }
3575    
3576     /* If previous was a single negated character ([^a] or similar), we use
3577     one of the special opcodes, replacing it. The code is shared with single-
3578     character repeats by setting op_type to add a suitable offset into
3579 nigel 93 repeat_type. We can also test for auto-possessification. OP_NOT is
3580     currently used only for single-byte chars. */
3581 nigel 77
3582     else if (*previous == OP_NOT)
3583     {
3584     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3585     c = previous[1];
3586 nigel 93 if (!possessive_quantifier &&
3587     repeat_max < 0 &&
3588     check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3589     {
3590     repeat_type = 0; /* Force greedy */
3591     possessive_quantifier = TRUE;
3592     }
3593 nigel 77 goto OUTPUT_SINGLE_REPEAT;
3594     }
3595    
3596     /* If previous was a character type match (\d or similar), abolish it and
3597     create a suitable repeat item. The code is shared with single-character
3598     repeats by setting op_type to add a suitable offset into repeat_type. Note
3599     that the Unicode property types will be present only when SUPPORT_UCP is
3600     defined, but we don't wrap the little bits of code here because it just
3601     makes it horribly messy. */
3602    
3603     else if (*previous < OP_EODN)
3604     {
3605     uschar *oldcode;
3606 nigel 87 int prop_type, prop_value;
3607 nigel 77 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3608     c = *previous;
3609    
3610 nigel 93 if (!possessive_quantifier &&
3611     repeat_max < 0 &&
3612     check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3613     {
3614     repeat_type = 0; /* Force greedy */
3615     possessive_quantifier = TRUE;
3616     }
3617    
3618 nigel 77 OUTPUT_SINGLE_REPEAT:
3619 nigel 87 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3620     {
3621     prop_type = previous[1];
3622     prop_value = previous[2];
3623     }
3624     else prop_type = prop_value = -1;
3625 nigel 77
3626     oldcode = code;
3627     code = previous; /* Usually overwrite previous item */
3628    
3629     /* If the maximum is zero then the minimum must also be zero; Perl allows
3630     this case, so we do too - by simply omitting the item altogether. */
3631    
3632     if (repeat_max == 0) goto END_REPEAT;
3633    
3634     /* All real repeats make it impossible to handle partial matching (maybe
3635     one day we will be able to remove this restriction). */
3636    
3637 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3638 nigel 77
3639     /* Combine the op_type with the repeat_type */
3640    
3641     repeat_type += op_type;
3642    
3643     /* A minimum of zero is handled either as the special case * or ?, or as
3644     an UPTO, with the maximum given. */
3645    
3646     if (repeat_min == 0)
3647     {
3648     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3649     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3650     else
3651     {
3652     *code++ = OP_UPTO + repeat_type;
3653     PUT2INC(code, 0, repeat_max);
3654     }
3655     }
3656    
3657     /* A repeat minimum of 1 is optimized into some special cases. If the
3658 nigel 93 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3659 nigel 77 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3660     one less than the maximum. */
3661    
3662     else if (repeat_min == 1)
3663     {
3664     if (repeat_max == -1)
3665     *code++ = OP_PLUS + repeat_type;
3666     else
3667     {
3668     code = oldcode; /* leave previous item in place */
3669     if (repeat_max == 1) goto END_REPEAT;
3670     *code++ = OP_UPTO + repeat_type;
3671     PUT2INC(code, 0, repeat_max - 1);
3672     }
3673     }
3674    
3675     /* The case {n,n} is just an EXACT, while the general case {n,m} is
3676     handled as an EXACT followed by an UPTO. */
3677    
3678     else
3679     {
3680     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3681     PUT2INC(code, 0, repeat_min);
3682    
3683     /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3684     we have to insert the character for the previous code. For a repeated
3685 nigel 87 Unicode property match, there are two extra bytes that define the
3686 nigel 77 required property. In UTF-8 mode, long characters have their length in
3687     c, with the 0x80 bit as a flag. */
3688    
3689     if (repeat_max < 0)
3690     {
3691     #ifdef SUPPORT_UTF8
3692     if (utf8 && c >= 128)
3693     {
3694     memcpy(code, utf8_char, c & 7);
3695     code += c & 7;
3696     }
3697     else
3698     #endif
3699     {
3700     *code++ = c;
3701 nigel 87 if (prop_type >= 0)
3702     {
3703     *code++ = prop_type;
3704     *code++ = prop_value;
3705     }
3706 nigel 77 }
3707     *code++ = OP_STAR + repeat_type;
3708     }
3709    
3710     /* Else insert an UPTO if the max is greater than the min, again
3711 nigel 93 preceded by the character, for the previously inserted code. If the
3712     UPTO is just for 1 instance, we can use QUERY instead. */
3713 nigel 77
3714     else if (repeat_max != repeat_min)
3715     {
3716     #ifdef SUPPORT_UTF8
3717     if (utf8 && c >= 128)
3718     {
3719     memcpy(code, utf8_char, c & 7);
3720     code += c & 7;
3721     }
3722     else
3723     #endif
3724     *code++ = c;
3725 nigel 87 if (prop_type >= 0)
3726     {
3727     *code++ = prop_type;
3728     *code++ = prop_value;
3729     }
3730 nigel 77 repeat_max -= repeat_min;
3731 nigel 93
3732     if (repeat_max == 1)
3733     {
3734     *code++ = OP_QUERY + repeat_type;
3735     }
3736     else
3737     {
3738     *code++ = OP_UPTO + repeat_type;
3739     PUT2INC(code, 0, repeat_max);
3740     }
3741 nigel 77 }
3742     }
3743    
3744     /* The character or character type itself comes last in all cases. */
3745    
3746     #ifdef SUPPORT_UTF8
3747     if (utf8 && c >= 128)
3748     {
3749     memcpy(code, utf8_char, c & 7);
3750     code += c & 7;
3751     }
3752     else
3753     #endif
3754     *code++ = c;
3755    
3756 nigel 87 /* For a repeated Unicode property match, there are two extra bytes that
3757     define the required property. */
3758 nigel 77
3759     #ifdef SUPPORT_UCP
3760 nigel 87 if (prop_type >= 0)
3761     {
3762     *code++ = prop_type;
3763     *code++ = prop_value;
3764     }
3765 nigel 77 #endif
3766     }
3767    
3768     /* If previous was a character class or a back reference, we put the repeat
3769     stuff after it, but just skip the item if the repeat was {0,0}. */
3770    
3771     else if (*previous == OP_CLASS ||
3772     *previous == OP_NCLASS ||
3773     #ifdef SUPPORT_UTF8
3774     *previous == OP_XCLASS ||
3775     #endif
3776     *previous == OP_REF)
3777     {
3778     if (repeat_max == 0)
3779     {
3780     code = previous;
3781     goto END_REPEAT;
3782     }
3783    
3784     /* All real repeats make it impossible to handle partial matching (maybe
3785     one day we will be able to remove this restriction). */
3786    
3787 ph10 230 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3788 nigel 77
3789     if (repeat_min == 0 && repeat_max == -1)
3790     *code++ = OP_CRSTAR + repeat_type;
3791     else if (repeat_min == 1 && repeat_max == -1)
3792     *code++ = OP_CRPLUS + repeat_type;
3793     else if (repeat_min == 0 && repeat_max == 1)
3794     *code++ = OP_CRQUERY + repeat_type;
3795     else
3796     {
3797     *code++ = OP_CRRANGE + repeat_type;
3798     PUT2INC(code, 0, repeat_min);
3799     if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3800     PUT2INC(code, 0, repeat_max);
3801     }
3802     }
3803    
3804     /* If previous was a bracket group, we may have to replicate it in certain
3805     cases. */
3806    
3807 nigel 93 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3808     *previous == OP_ONCE || *previous == OP_COND)
3809 nigel 77 {
3810     register int i;
3811     int ketoffset = 0;
3812     int len = code - previous;
3813     uschar *bralink = NULL;
3814    
3815 nigel 93 /* Repeating a DEFINE group is pointless */
3816    
3817     if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3818     {
3819     *errorcodeptr = ERR55;
3820     goto FAILED;
3821     }
3822    
3823 nigel 77 /* If the maximum repeat count is unlimited, find the end of the bracket
3824     by scanning through from the start, and compute the offset back to it
3825     from the current code pointer. There may be an OP_OPT setting following
3826     the final KET, so we can't find the end just by going back from the code
3827     pointer. */
3828    
3829     if (repeat_max == -1)
3830     {
3831     register uschar *ket = previous;
3832     do ket += GET(ket, 1); while (*ket != OP_KET);
3833     ketoffset = code - ket;
3834     }
3835    
3836     /* The case of a zero minimum is special because of the need to stick
3837     OP_BRAZERO in front of it, and because the group appears once in the
3838     data, whereas in other cases it appears the minimum number of times. For
3839     this reason, it is simplest to treat this case separately, as otherwise
3840     the code gets far too messy. There are several special subcases when the
3841     minimum is zero. */
3842    
3843     if (repeat_min == 0)
3844     {
3845     /* If the maximum is also zero, we just omit the group from the output
3846     altogether. */
3847    
3848     if (repeat_max == 0)
3849     {
3850     code = previous;
3851     goto END_REPEAT;
3852     }
3853    
3854     /* If the maximum is 1 or unlimited, we just have to stick in the
3855     BRAZERO and do no more at this point. However, we do need to adjust
3856     any OP_RECURSE calls inside the group that refer to the group itself or
3857 nigel 93 any internal or forward referenced group, because the offset is from
3858     the start of the whole regex. Temporarily terminate the pattern while
3859     doing this. */
3860 nigel 77
3861     if (repeat_max <= 1)
3862     {
3863     *code = OP_END;
3864 nigel 93 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3865 nigel 77 memmove(previous+1, previous, len);
3866     code++;
3867     *previous++ = OP_BRAZERO + repeat_type;
3868     }
3869    
3870     /* If the maximum is greater than 1 and limited, we have to replicate
3871     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3872     The first one has to be handled carefully because it's the original
3873     copy, which has to be moved up. The remainder can be handled by code
3874     that is common with the non-zero minimum case below. We have to
3875     adjust the value of repeat_max, since one less copy is required. Once
3876     again, we may have to adjust any OP_RECURSE calls inside the group. */
3877    
3878     else
3879     {
3880     int offset;
3881     *code = OP_END;
3882 nigel 93 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3883 nigel 77 memmove(previous + 2 + LINK_SIZE, previous, len);
3884     code += 2 + LINK_SIZE;
3885     *previous++ = OP_BRAZERO + repeat_type;
3886     *previous++ = OP_BRA;
3887    
3888     /* We chain together the bracket offset fields that have to be
3889     filled in later when the ends of the brackets are reached. */
3890    
3891     offset = (bralink == NULL)? 0 : previous - bralink;
3892     bralink = previous;
3893     PUTINC(previous, 0, offset);
3894     }
3895    
3896     repeat_max--;
3897     }
3898    
3899     /* If the minimum is greater than zero, replicate the group as many
3900     times as necessary, and adjust the maximum to the number of subsequent
3901     copies that we need. If we set a first char from the group, and didn't
3902 nigel 93 set a required char, copy the latter from the former. If there are any
3903     forward reference subroutine calls in the group, there will be entries on
3904     the workspace list; replicate these with an appropriate increment. */
3905 nigel 77
3906     else
3907     {
3908     if (repeat_min > 1)
3909     {
3910 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3911 ph10 202 just adjust the length as if we had. Do some paranoid checks for
3912     potential integer overflow. */
3913 nigel 93
3914     if (lengthptr != NULL)
3915 ph10 202 {
3916     int delta = (repeat_min - 1)*length_prevgroup;
3917     if ((double)(repeat_min - 1)*(double)length_prevgroup >
3918     (double)INT_MAX ||
3919     OFLOW_MAX - *lengthptr < delta)
3920     {
3921     *errorcodeptr = ERR20;
3922     goto FAILED;
3923     }
3924     *lengthptr += delta;
3925     }
3926 nigel 93
3927     /* This is compiling for real */
3928    
3929     else
3930 nigel 77 {
3931 nigel 93 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3932     for (i = 1; i < repeat_min; i++)
3933     {
3934     uschar *hc;
3935     uschar *this_hwm = cd->hwm;
3936     memcpy(code, previous, len);
3937     for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3938     {
3939     PUT(cd->hwm, 0, GET(hc, 0) + len);
3940     cd->hwm += LINK_SIZE;
3941     }
3942     save_hwm = this_hwm;
3943     code += len;
3944     }
3945 nigel 77 }
3946     }
3947 nigel 93
3948 nigel 77 if (repeat_max > 0) repeat_max -= repeat_min;
3949     }
3950    
3951     /* This code is common to both the zero and non-zero minimum cases. If
3952     the maximum is limited, it replicates the group in a nested fashion,
3953     remembering the bracket starts on a stack. In the case of a zero minimum,
3954     the first one was set up above. In all cases the repeat_max now specifies
3955 nigel 93 the number of additional copies needed. Again, we must remember to
3956     replicate entries on the forward reference list. */
3957 nigel 77
3958     if (repeat_max >= 0)
3959     {
3960 nigel 93 /* In the pre-compile phase, we don't actually do the replication. We
3961     just adjust the length as if we had. For each repetition we must add 1
3962     to the length for BRAZERO and for all but the last repetition we must
3963 ph10 202 add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3964     paranoid checks to avoid integer overflow. */
3965 nigel 93
3966     if (lengthptr != NULL && repeat_max > 0)
3967 ph10 202 {
3968     int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3969     2 - 2*LINK_SIZE; /* Last one doesn't nest */
3970     if ((double)repeat_max *
3971     (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3972     > (double)INT_MAX ||
3973     OFLOW_MAX - *lengthptr < delta)
3974     {
3975     *errorcodeptr = ERR20;
3976     goto FAILED;
3977     }
3978     *lengthptr += delta;
3979     }
3980 nigel 93
3981     /* This is compiling for real */
3982    
3983     else for (i = repeat_max - 1; i >= 0; i--)
3984 nigel 77 {
3985 nigel 93 uschar *hc;
3986     uschar *this_hwm = cd->hwm;
3987    
3988 nigel 77 *code++ = OP_BRAZERO + repeat_type;
3989    
3990     /* All but the final copy start a new nesting, maintaining the
3991     chain of brackets outstanding. */
3992    
3993     if (i != 0)
3994     {
3995     int offset;
3996     *code++ = OP_BRA;
3997     offset = (bralink == NULL)? 0 : code - bralink;
3998     bralink = code;
3999     PUTINC(code, 0, offset);
4000     }
4001    
4002     memcpy(code, previous, len);
4003 nigel 93 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4004     {
4005     PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4006     cd->hwm += LINK_SIZE;
4007     }
4008     save_hwm = this_hwm;
4009 nigel 77 code += len;
4010     }
4011    
4012     /* Now chain through the pending brackets, and fill in their length
4013     fields (which are holding the chain links pro tem). */
4014    
4015     while (bralink != NULL)
4016     {
4017     int oldlinkoffset;
4018     int offset = code - bralink + 1;
4019     uschar *bra = code - offset;
4020     oldlinkoffset = GET(bra, 1);
4021     bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4022     *code++ = OP_KET;
4023     PUTINC(code, 0, offset);
4024     PUT(bra, 1, offset);
4025     }
4026     }
4027    
4028     /* If the maximum is unlimited, set a repeater in the final copy. We
4029     can't just offset backwards from the current code point, because we
4030     don't know if there's been an options resetting after the ket. The
4031 nigel 93 correct offset was computed above.
4032 nigel 77
4033 nigel 93 Then, when we are doing the actual compile phase, check to see whether
4034     this group is a non-atomic one that could match an empty string. If so,
4035     convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4036     that runtime checking can be done. [This check is also applied to
4037     atomic groups at runtime, but in a different way.] */
4038    
4039     else
4040     {
4041     uschar *ketcode = code - ketoffset;
4042     uschar *bracode = ketcode - GET(ketcode, 1);
4043     *ketcode = OP_KETRMAX + repeat_type;
4044     if (lengthptr == NULL && *bracode != OP_ONCE)
4045     {
4046     uschar *scode = bracode;
4047     do
4048     {
4049     if (could_be_empty_branch(scode, ketcode, utf8))
4050     {
4051     *bracode += OP_SBRA - OP_BRA;
4052     break;
4053     }
4054     scode += GET(scode, 1);
4055     }
4056     while (*scode == OP_ALT);
4057     }
4058     }
4059 nigel 77 }
4060    
4061     /* Else there's some kind of shambles */
4062    
4063     else
4064     {
4065     *errorcodeptr = ERR11;
4066     goto FAILED;
4067     }
4068    
4069 nigel 93 /* If the character following a repeat is '+', or if certain optimization
4070     tests above succeeded, possessive_quantifier is TRUE. For some of the
4071     simpler opcodes, there is a special alternative opcode for this. For
4072     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4073     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4074     but the special opcodes can optimize it a bit. The repeated item starts at
4075     tempcode, not at previous, which might be the first part of a string whose
4076     (former) last char we repeated.
4077 nigel 77
4078 nigel 93 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4079     an 'upto' may follow. We skip over an 'exact' item, and then test the
4080     length of what remains before proceeding. */
4081    
4082 nigel 77 if (possessive_quantifier)
4083     {
4084 nigel 93 int len;
4085     if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4086     *tempcode == OP_NOTEXACT)
4087 ph10 285 tempcode += _pcre_OP_lengths[*tempcode] +
4088 ph10 286 ((*tempcode == OP_TYPEEXACT &&
4089     (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4090 nigel 93 len = code - tempcode;
4091     if (len > 0) switch (*tempcode)
4092     {
4093     case OP_STAR: *tempcode = OP_POSSTAR; break;
4094     case OP_PLUS: *tempcode = OP_POSPLUS; break;
4095     case OP_QUERY: *tempcode = OP_POSQUERY; break;
4096     case OP_UPTO: *tempcode = OP_POSUPTO; break;
4097    
4098     case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4099     case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4100     case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4101     case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4102    
4103     case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4104     case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4105     case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4106     case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4107    
4108     default:
4109     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4110     code += 1 + LINK_SIZE;
4111     len += 1 + LINK_SIZE;