/[pcre]/code/trunk/pcre.c
ViewVC logotype

Contents of /code/trunk/pcre.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (hide annotations) (download)
Sat Feb 24 21:40:30 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 255554 byte(s)
Load pcre-4.5 into code/trunk.

1 nigel 3 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /*
6     This is a library of functions to support regular expressions whose syntax
7     and semantics are as close as possible to those of the Perl 5 language. See
8     the file Tech.Notes for some information on the internals.
9    
10     Written by: Philip Hazel <ph10@cam.ac.uk>
11    
12 nigel 63 Copyright (c) 1997-2003 University of Cambridge
13 nigel 3
14     -----------------------------------------------------------------------------
15     Permission is granted to anyone to use this software for any purpose on any
16     computer system, and to redistribute it freely, subject to the following
17     restrictions:
18    
19     1. This software is distributed in the hope that it will be useful,
20     but WITHOUT ANY WARRANTY; without even the implied warranty of
21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22    
23     2. The origin of this software must not be misrepresented, either by
24     explicit claim or by omission.
25    
26     3. Altered versions must be plainly marked as such, and must not be
27     misrepresented as being the original software.
28 nigel 29
29     4. If PCRE is embedded in any software that is released under the GNU
30     General Purpose Licence (GPL), then the terms of that licence shall
31     supersede any condition above with which it is incompatible.
32 nigel 3 -----------------------------------------------------------------------------
33     */
34    
35 nigel 73
36 nigel 3 /* Define DEBUG to get debugging output on stdout. */
37     /* #define DEBUG */
38    
39 nigel 23 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
40     inline, and there are *still* stupid compilers about that don't like indented
41     pre-processor statements. I suppose it's only been 10 years... */
42 nigel 3
43 nigel 9 #ifdef DEBUG
44     #define DPRINTF(p) printf p
45     #else
46     #define DPRINTF(p) /*nothing*/
47     #endif
48    
49 nigel 73 /* Include the internals header, which itself includes "config.h", the Standard
50     C headers, and the external pcre header. */
51 nigel 3
52     #include "internal.h"
53    
54    
55 nigel 15 /* Allow compilation as C++ source code, should anybody want to do that. */
56    
57     #ifdef __cplusplus
58     #define class pcre_class
59     #endif
60    
61    
62 nigel 53 /* Maximum number of items on the nested bracket stacks at compile time. This
63     applies to the nesting of all kinds of parentheses. It does not limit
64     un-nested, non-capturing parentheses. This number can be made bigger if
65     necessary - it is used to dimension one int and one unsigned char vector at
66     compile time. */
67 nigel 23
68     #define BRASTACK_SIZE 200
69    
70    
71 nigel 63 /* Maximum number of ints of offset to save on the stack for recursive calls.
72     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73     because the offset vector is always a multiple of 3 long. */
74    
75     #define REC_STACK_SAVE_MAX 30
76    
77    
78 nigel 49 /* The number of bytes in a literal character string above which we can't add
79 nigel 63 any more is set at 250 in order to allow for UTF-8 characters. (In theory it
80     could be 255 when UTF-8 support is excluded, but that means that some of the
81     test output would be different, which just complicates things.) */
82 nigel 49
83     #define MAXLIT 250
84    
85    
86 nigel 65 /* The maximum remaining length of subject we are prepared to search for a
87     req_byte match. */
88    
89     #define REQ_BYTE_MAX 1000
90    
91    
92 nigel 63 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93     the definition is next to the definition of the opcodes in internal.h. */
94    
95 nigel 73 static const uschar OP_lengths[] = { OP_LENGTHS };
96 nigel 63
97 nigel 3 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
98    
99 nigel 15 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101 nigel 3
102     /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103     are simple data values; negative values are for special things like \d and so
104     on. Zero means further processing is needed (for things like \x), or the escape
105     is invalid. */
106    
107 nigel 73 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 nigel 15 static const short int escapes[] = {
109 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110     0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 nigel 63 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 nigel 3 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 nigel 63 0, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 nigel 3 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 nigel 63 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116     0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 nigel 69 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 nigel 23 0, 0, -ESC_z /* x - z */
119 nigel 3 };
120    
121 nigel 73 #else /* This is the "abnormal" table for EBCDIC systems */
122     static const short int escapes[] = {
123     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131     /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132     /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, 0,
133     /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134     /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135     /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139     /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140     /* D0 */ '}', 0, 0, 0, 0, 0, 0, 0,
141     /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142     /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, 0,
143     /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
146     };
147     #endif
148    
149    
150 nigel 43 /* Tables of names of POSIX character classes and their lengths. The list is
151     terminated by a zero length entry. The first three must be alpha, lower, upper,
152     as this is assumed for handling case independence. */
153    
/* Class names. The order here is the same as the order of the rows in
posix_class_maps (see the per-row comments there). */
154 nigel 73 static const char *const posix_names[] = {
155 nigel 43 "alpha", "lower", "upper",
156 nigel 63 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 nigel 43 "print", "punct", "space", "word", "xdigit" };
158    
/* Length of each name above, in the same order (twelve names of 5 characters,
then "word" = 4 and "xdigit" = 6), followed by a 0 entry that terminates the
list. There are therefore 15 lengths for 14 names. */
159     static const uschar posix_name_lengths[] = {
160 nigel 63 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161 nigel 43
162     /* Table of class bit maps for each POSIX class; up to three may be combined
163 nigel 63 to form the class. The table for [:blank:] is dynamically modified to remove
164     the vertical space characters. */
165 nigel 43
/* One row of three entries per POSIX class, in the same order as posix_names.
Each entry is a cbit_xxx value or -1. NOTE(review): -1 presumably marks an
unused slot in a row that combines fewer than three bit maps - confirm against
the code that reads this table. */
166     static const int posix_class_maps[] = {
167     cbit_lower, cbit_upper, -1, /* alpha */
168     cbit_lower, -1, -1, /* lower */
169     cbit_upper, -1, -1, /* upper */
170     cbit_digit, cbit_lower, cbit_upper, /* alnum */
171     cbit_print, cbit_cntrl, -1, /* ascii */
172 nigel 63 cbit_space, -1, -1, /* blank - a GNU extension */
173 nigel 43 cbit_cntrl, -1, -1, /* cntrl */
174     cbit_digit, -1, -1, /* digit */
175     cbit_graph, -1, -1, /* graph */
176     cbit_print, -1, -1, /* print */
177     cbit_punct, -1, -1, /* punct */
178     cbit_space, -1, -1, /* space */
179 nigel 63 cbit_word, -1, -1, /* word - a Perl extension */
180 nigel 43 cbit_xdigit,-1, -1 /* xdigit */
181     };
182    
183 nigel 73 /* Table to identify digits and hex digits. This is used when compiling
184 nigel 69 patterns. Note that the tables in chartables are dependent on the locale, and
185     may mark arbitrary characters as digits - but the PCRE compiling code expects
186     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187     a private table here. It costs 256 bytes, but it is a lot faster than doing
188     character value tests (at least in some simple cases I timed), and in some
189     applications one wants PCRE to compile efficiently as well as match
190     efficiently.
191 nigel 43
192 nigel 69 For convenience, we use the same bit definitions as in chartables:
193    
194     0x04 decimal digit
195     0x08 hexadecimal digit
196    
197     Then we can use ctype_digit and ctype_xdigit in the code. */
198    
199 nigel 73 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
200 nigel 69 static const unsigned char digitab[] =
201     {
202     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
234    
235 nigel 73 #else /* This is the "abnormal" case, for EBCDIC systems */
236     static const unsigned char digitab[] =
237     {
238     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- */
250     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
270    
271     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283     0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- */
284     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
304     #endif
305    
306    
307 nigel 3 /* Definition to allow mutual recursion */
308    
309 nigel 13 static BOOL
310 nigel 23 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 nigel 63 BOOL, int, int *, int *, branch_chain *, compile_data *);
312 nigel 3
313 nigel 47 /* Structure for building a chain of data that actually lives on the
314     stack, for holding the values of the subject pointer at the start of each
315     subpattern, so as to detect when an empty string has been matched by a
316 nigel 73 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317     are on the heap, not on the stack. */
318 nigel 3
/* One link in the chain of saved subject pointers described above. */
319 nigel 47 typedef struct eptrblock {
320 nigel 73 struct eptrblock *epb_prev; /* Another block in the chain; NOTE(review): presumably the enclosing subpattern's block - confirm in match() */
321     const uschar *epb_saved_eptr; /* A saved subject pointer value */
322 nigel 47 } eptrblock;
323 nigel 3
324 nigel 47 /* Flag bits for the match() function */
325    
326     #define match_condassert 0x01 /* Called to check a condition assertion */
327     #define match_isgroup 0x02 /* Set if start of bracketed group */
328    
329 nigel 63 /* Non-error returns from the match() function. Error returns are externally
330     defined PCRE_ERROR_xxx codes, which are all negative. */
331 nigel 47
332 nigel 63 #define MATCH_MATCH 1
333     #define MATCH_NOMATCH 0
334 nigel 47
335 nigel 63
336    
337 nigel 3 /*************************************************
338     * Global variables *
339     *************************************************/
340    
341     /* PCRE is thread-clean and doesn't use any global variables in the normal
342 nigel 73 sense. However, it calls memory allocation and free functions via the four
343 nigel 63 indirections below, and it can optionally do callouts. These values can be
344     changed by the caller, but are shared between all threads. However, when
345     compiling for Virtual Pascal, things are done differently (see pcre.in). */
346 nigel 3
/* Definitions of the five caller-replaceable hooks. Each is a function pointer
whose initial value is the C library routine named in its initializer, except
pcre_callout, which starts as NULL. None of these definitions is compiled when
VPCOMPAT is defined. */
347 nigel 63 #ifndef VPCOMPAT
/* C++ build: the same five definitions, given extern "C" linkage. */
348 nigel 71 #ifdef __cplusplus
349     extern "C" void *(*pcre_malloc)(size_t) = malloc;
350     extern "C" void (*pcre_free)(void *) = free;
351 nigel 73 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352     extern "C" void (*pcre_stack_free)(void *) = free;
353 nigel 71 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
354     #else
/* Plain C build. */
355 nigel 3 void *(*pcre_malloc)(size_t) = malloc;
356     void (*pcre_free)(void *) = free;
357 nigel 73 void *(*pcre_stack_malloc)(size_t) = malloc;
358     void (*pcre_stack_free)(void *) = free;
359 nigel 63 int (*pcre_callout)(pcre_callout_block *) = NULL;
360     #endif
361 nigel 71 #endif
362 nigel 3
363    
364 nigel 49 /*************************************************
365     * Macros and tables for character handling *
366     *************************************************/
367 nigel 3
368 nigel 49 /* When UTF-8 encoding is being used, a character is no longer just a single
369     byte. The macros for character handling generate simple sequences when used in
370     byte-mode, and more complicated ones for UTF-8 characters. */
371    
/* Character-fetch macros. Two complete sets are defined: single-byte versions
when SUPPORT_UTF8 is not defined, and UTF-8 decoding versions otherwise.

Points to be aware of when using them (all visible in the definitions below):

  . The byte-mode versions already end in a semicolon, and the UTF-8 versions
    expand to an assignment followed by an "if" block. None is wrapped in
    do { } while (0), so a macro call must not be used as the unbraced body of
    an if/else or a loop.

  . The arguments are substituted unparenthesized and, in the UTF-8 versions,
    more than once, so they must be plain variables without side effects.

  . The UTF-8 versions read the continuation bytes (via eptr[gcii] or *eptr++)
    without any end-of-buffer test, and mask each one with 0x3f without
    checking that it has the 10xxxxxx form.

  . GETCHARINCTEST in the UTF-8 set refers to md->utf8, so a variable called
    "md" must be in scope wherever it is used.

  . In byte mode BACKCHAR expands to nothing and GETCHARLEN leaves "len"
    untouched. */
372     #ifndef SUPPORT_UTF8
373 nigel 63 #define GETCHAR(c, eptr) c = *eptr;
374 nigel 49 #define GETCHARINC(c, eptr) c = *eptr++;
375 nigel 63 #define GETCHARINCTEST(c, eptr) c = *eptr++;
376 nigel 49 #define GETCHARLEN(c, eptr, len) c = *eptr;
377     #define BACKCHAR(eptr)
378    
379     #else /* SUPPORT_UTF8 */
380    
381 nigel 63 /* Get the next UTF-8 character, not advancing the pointer. This is called when
382     we know we are in UTF-8 mode. */
383 nigel 49
384 nigel 63 #define GETCHAR(c, eptr) \
385     c = *eptr; \
386     if ((c & 0xc0) == 0xc0) \
387     { \
388 nigel 67 int gcii; \
389     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
390     int gcss = 6*gcaa; \
391     c = (c & utf8_table3[gcaa]) << gcss; \
392     for (gcii = 1; gcii <= gcaa; gcii++) \
393 nigel 63 { \
394 nigel 67 gcss -= 6; \
395     c |= (eptr[gcii] & 0x3f) << gcss; \
396 nigel 63 } \
397     }
398    
399     /* Get the next UTF-8 character, advancing the pointer. This is called when we
400     know we are in UTF-8 mode. */
401    
402 nigel 49 #define GETCHARINC(c, eptr) \
403     c = *eptr++; \
404 nigel 63 if ((c & 0xc0) == 0xc0) \
405     { \
406 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
407     int gcss = 6*gcaa; \
408     c = (c & utf8_table3[gcaa]) << gcss; \
409     while (gcaa-- > 0) \
410 nigel 63 { \
411 nigel 67 gcss -= 6; \
412     c |= (*eptr++ & 0x3f) << gcss; \
413 nigel 63 } \
414     }
415    
416     /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
417    
418     #define GETCHARINCTEST(c, eptr) \
419     c = *eptr++; \
420 nigel 49 if (md->utf8 && (c & 0xc0) == 0xc0) \
421     { \
422 nigel 67 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
423     int gcss = 6*gcaa; \
424     c = (c & utf8_table3[gcaa]) << gcss; \
425     while (gcaa-- > 0) \
426 nigel 49 { \
427 nigel 67 gcss -= 6; \
428     c |= (*eptr++ & 0x3f) << gcss; \
429 nigel 49 } \
430     }
431    
432 nigel 63 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
433     if there are extra bytes. This is called when we know we are in UTF-8 mode. */
434 nigel 49
435     #define GETCHARLEN(c, eptr, len) \
436     c = *eptr; \
437 nigel 63 if ((c & 0xc0) == 0xc0) \
438 nigel 49 { \
439 nigel 67 int gcii; \
440     int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
441     int gcss = 6*gcaa; \
442     c = (c & utf8_table3[gcaa]) << gcss; \
443     for (gcii = 1; gcii <= gcaa; gcii++) \
444 nigel 49 { \
445 nigel 67 gcss -= 6; \
446     c |= (eptr[gcii] & 0x3f) << gcss; \
447 nigel 49 } \
448 nigel 67 len += gcaa; \
449 nigel 49 }
450    
451     /* If the pointer is not at the start of a character, move it back until
452 nigel 63 it is. Called only in UTF-8 mode. */
453 nigel 49
454     #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
455    
456     #endif
457    
458    
459    
460 nigel 3 /*************************************************
461 nigel 25 * Default character tables *
462     *************************************************/
463    
464     /* A default set of character tables is included in the PCRE binary. Its source
465     is built by the maketables auxiliary program, which uses the default C ctypes
466     functions, and put in the file chartables.c. These tables are used by PCRE
467     whenever the caller of pcre_compile() does not provide an alternate set of
468     tables. */
469    
470     #include "chartables.c"
471    
472    
473    
474 nigel 49 #ifdef SUPPORT_UTF8
475 nigel 25 /*************************************************
476 nigel 49 * Tables for UTF-8 support *
477     *************************************************/
478    
479     /* These are the breakpoints for different numbers of bytes in a UTF-8
480     character. */
481    
482 nigel 69 static const int utf8_table1[] =
483     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
484 nigel 49
485     /* These are the indicator bits and the mask for the data bits to set in the
486     first byte of a character, indexed by the number of additional bytes. */
487    
488 nigel 69 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489     static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
490 nigel 49
491     /* Table of the number of extra bytes, indexed by the first byte of the
492     character masked with 0x3f. The highest index for a valid UTF-8 character is
493     in fact 0x3d. */
494    
495 nigel 69 static const uschar utf8_table4[] = {
496 nigel 49 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499     3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
500    
501    
502     /*************************************************
503     * Convert character value to UTF-8 *
504     *************************************************/
505    
506     /* This function takes an integer value in the range 0 - 0x7fffffff
507     and encodes it as a UTF-8 character in 0 to 6 bytes.
508    
509     Arguments:
510     cvalue the character value
511     buffer pointer to buffer for result - at least 6 bytes long
512    
513     Returns: number of characters placed in the buffer
514     */
515    
516     static int
517     ord2utf8(int cvalue, uschar *buffer)
518     {
519     register int i, j;
520     for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521     if (cvalue <= utf8_table1[i]) break;
522 nigel 59 buffer += i;
523     for (j = i; j > 0; j--)
524     {
525     *buffer-- = 0x80 | (cvalue & 0x3f);
526     cvalue >>= 6;
527     }
528     *buffer = utf8_table2[i] | cvalue;
529 nigel 49 return i + 1;
530     }
531     #endif
532    
533    
534    
535     /*************************************************
536 nigel 63 * Print compiled regex *
537     *************************************************/
538    
539     /* The code for doing this is held in a separate file that is also included in
540     pcretest.c. It defines a function called print_internals(). */
541    
542     #ifdef DEBUG
543     #include "printint.c"
544     #endif
545    
546    
547    
548     /*************************************************
549 nigel 3 * Return version string *
550     *************************************************/
551    
552 nigel 39 #define STRING(a) # a
553     #define XSTRING(s) STRING(s)
554    
555 nigel 73 EXPORT const char *
556 nigel 3 pcre_version(void)
557     {
558 nigel 39 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
559 nigel 3 }
560    
561    
562    
563    
564     /*************************************************
565 nigel 43 * (Obsolete) Return info about compiled pattern *
566 nigel 3 *************************************************/
567    
568 nigel 43 /* This is the original "info" function. It picks potentially useful data out
569     of the private structure, but its interface was too rigid. It remains for
570     backwards compatibility. The public options are passed back in an int - though
571     the re->options field has been expanded to a long int, all the public options
572 nigel 37 at the low end of it, and so even on 16-bit systems this will still be OK.
573     Therefore, I haven't changed the API for pcre_info().
574 nigel 3
575     Arguments:
576     external_re points to compiled code
577     optptr where to pass back the options
578 nigel 63 first_byte where to pass back the first character,
579 nigel 3 or -1 if multiline and all branches start ^,
580     or -2 otherwise
581    
582 nigel 43 Returns: number of capturing subpatterns
583 nigel 3 or negative values on error
584     */
585    
586 nigel 73 EXPORT int
587 nigel 63 pcre_info(const pcre *external_re, int *optptr, int *first_byte)
588 nigel 3 {
589 nigel 7 const real_pcre *re = (const real_pcre *)external_re;
590 nigel 3 if (re == NULL) return PCRE_ERROR_NULL;
591     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
592 nigel 37 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
593 nigel 63 if (first_byte != NULL)
594     *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
595 nigel 3 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
596     return re->top_bracket;
597     }
598    
599    
600    
601 nigel 43 /*************************************************
602     * Return info about compiled pattern *
603     *************************************************/
604 nigel 3
605 nigel 43 /* This is a newer "info" function which has an extensible interface so
606     that additional items can be added compatibly.
607    
608     Arguments:
609     external_re points to compiled code
610 nigel 63 extra_data points extra data, or NULL
611 nigel 43 what what information is required
612     where where to put the information
613    
614     Returns: 0 if data returned, negative on error
615     */
616    
617 nigel 73 EXPORT int
618 nigel 63 pcre_fullinfo(const pcre *external_re, const pcre_extra *extra_data, int what,
619 nigel 43 void *where)
620     {
621     const real_pcre *re = (const real_pcre *)external_re;
622 nigel 63 const pcre_study_data *study = NULL;
623 nigel 43
624     if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
625     if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
626    
627 nigel 63 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
628 nigel 71 study = (const pcre_study_data *)extra_data->study_data;
629 nigel 63
630 nigel 43 switch (what)
631     {
632     case PCRE_INFO_OPTIONS:
633     *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
634     break;
635    
636     case PCRE_INFO_SIZE:
637     *((size_t *)where) = re->size;
638     break;
639    
640 nigel 63 case PCRE_INFO_STUDYSIZE:
641     *((size_t *)where) = (study == NULL)? 0 : study->size;
642     break;
643    
644 nigel 43 case PCRE_INFO_CAPTURECOUNT:
645     *((int *)where) = re->top_bracket;
646     break;
647    
648     case PCRE_INFO_BACKREFMAX:
649     *((int *)where) = re->top_backref;
650     break;
651    
652 nigel 63 case PCRE_INFO_FIRSTBYTE:
653 nigel 43 *((int *)where) =
654 nigel 63 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
655 nigel 43 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
656     break;
657    
658     case PCRE_INFO_FIRSTTABLE:
659     *((const uschar **)where) =
660     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
661     study->start_bits : NULL;
662     break;
663    
664     case PCRE_INFO_LASTLITERAL:
665     *((int *)where) =
666 nigel 63 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
667 nigel 43 break;
668    
669 nigel 63 case PCRE_INFO_NAMEENTRYSIZE:
670     *((int *)where) = re->name_entry_size;
671     break;
672    
673     case PCRE_INFO_NAMECOUNT:
674     *((int *)where) = re->name_count;
675     break;
676    
677     case PCRE_INFO_NAMETABLE:
678     *((const uschar **)where) = (const uschar *)re + sizeof(real_pcre);
679     break;
680    
681 nigel 43 default: return PCRE_ERROR_BADOPTION;
682     }
683    
684     return 0;
685     }
686    
687    
688    
689 nigel 63 /*************************************************
690     * Return info about what features are configured *
691     *************************************************/
692    
693     /* This is function which has an extensible interface so that additional items
694     can be added compatibly.
695    
696     Arguments:
697     what what information is required
698     where where to put the information
699    
700     Returns: 0 if data returned, negative on error
701     */
702    
703 nigel 73 EXPORT int
704 nigel 63 pcre_config(int what, void *where)
705     {
706     switch (what)
707     {
708     case PCRE_CONFIG_UTF8:
709 nigel 71 #ifdef SUPPORT_UTF8
710 nigel 63 *((int *)where) = 1;
711 nigel 71 #else
712 nigel 63 *((int *)where) = 0;
713 nigel 71 #endif
714 nigel 63 break;
715    
716     case PCRE_CONFIG_NEWLINE:
717     *((int *)where) = NEWLINE;
718     break;
719    
720     case PCRE_CONFIG_LINK_SIZE:
721     *((int *)where) = LINK_SIZE;
722     break;
723    
724     case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
725     *((int *)where) = POSIX_MALLOC_THRESHOLD;
726     break;
727    
728     case PCRE_CONFIG_MATCH_LIMIT:
729     *((unsigned int *)where) = MATCH_LIMIT;
730     break;
731    
732 nigel 73 case PCRE_CONFIG_STACKRECURSE:
733     #ifdef NO_RECURSE
734     *((int *)where) = 0;
735     #else
736     *((int *)where) = 1;
737     #endif
738     break;
739    
740 nigel 63 default: return PCRE_ERROR_BADOPTION;
741     }
742    
743     return 0;
744     }
745    
746    
747    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Characters for which isprint() is false are shown as
\x followed by two hex digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int ch;
int left;

/* When printing from the subject, never run past md->end_subject. */

if (is_subject)
  {
  if (length > md->end_subject - p) length = md->end_subject - p;
  }

for (left = length; left > 0; left--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
774    
775    
776    
777    
778     /*************************************************
779     * Handle escapes *
780     *************************************************/
781    
782     /* This function is called when a \ has been encountered. It either returns a
783     positive value for a simple escape such as \n, or a negative value which
784 nigel 49 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
785     a positive value greater than 255 may be returned. On entry, ptr is pointing at
786     the \. On exit, it is on the final character of the escape sequence.
787 nigel 3
788     Arguments:
789     ptrptr points to the pattern position pointer
790     errorptr points to the pointer to the error message
791     bracount number of previous extracting brackets
792     options the options bits
793     isclass TRUE if inside a character class
794    
795     Returns: zero or positive => a data character
796     negative => a special escape sequence
797     on error, errorptr is set
798     */
799    
800     static int
801 nigel 7 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
802 nigel 71 int options, BOOL isclass)
803 nigel 3 {
804 nigel 7 const uschar *ptr = *ptrptr;
805 nigel 43 int c, i;
806 nigel 3
807 nigel 49 /* If backslash is at the end of the pattern, it's an error. */
808    
809     c = *(++ptr);
810 nigel 3 if (c == 0) *errorptr = ERR1;
811    
812 nigel 73 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
813     a table. A non-zero result is something that can be returned immediately.
814     Otherwise further processing may be required. */
815 nigel 3
816 nigel 73 #if !EBCDIC /* ASCII coding */
817     else if (c < '0' || c > 'z') {} /* Not alphameric */
818     else if ((i = escapes[c - '0']) != 0) c = i;
819 nigel 3
820 nigel 73 #else /* EBCDIC coding */
821     else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
822     else if ((i = escapes[c - 0x48]) != 0) c = i;
823     #endif
824 nigel 3
825     /* Escapes that need further processing, or are illegal. */
826    
827     else
828     {
829 nigel 7 const uschar *oldptr;
830 nigel 3 switch (c)
831     {
832 nigel 63 /* A number of Perl escapes are not handled by PCRE. We give an explicit
833     error. */
834    
835     case 'l':
836     case 'L':
837     case 'N':
838     case 'p':
839     case 'P':
840     case 'u':
841     case 'U':
842     case 'X':
843     *errorptr = ERR37;
844     break;
845    
846 nigel 3 /* The handling of escape sequences consisting of a string of digits
847     starting with one that is not zero is not straightforward. By experiment,
848     the way Perl works seems to be as follows:
849    
850     Outside a character class, the digits are read as a decimal number. If the
851     number is less than 10, or if there are that many previous extracting
852     left brackets, then it is a back reference. Otherwise, up to three octal
853     digits are read to form an escaped byte. Thus \123 is likely to be octal
854     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
855     value is greater than 377, the least significant 8 bits are taken. Inside a
856     character class, \ followed by a digit is always an octal number. */
857    
858     case '1': case '2': case '3': case '4': case '5':
859     case '6': case '7': case '8': case '9':
860    
861     if (!isclass)
862     {
863     oldptr = ptr;
864     c -= '0';
865 nigel 69 while ((digitab[ptr[1]] & ctype_digit) != 0)
866 nigel 3 c = c * 10 + *(++ptr) - '0';
867     if (c < 10 || c <= bracount)
868     {
869     c = -(ESC_REF + c);
870     break;
871     }
872     ptr = oldptr; /* Put the pointer back and fall through */
873     }
874    
875     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
876     generates a binary zero byte and treats the digit as a following literal.
877     Thus we have to pull back the pointer by one. */
878    
879     if ((c = *ptr) >= '8')
880     {
881     ptr--;
882     c = 0;
883     break;
884     }
885    
886     /* \0 always starts an octal number, but we may drop through to here with a
887 nigel 49 larger first octal digit. */
888 nigel 3
889     case '0':
890     c -= '0';
891 nigel 69 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
892 nigel 3 c = c * 8 + *(++ptr) - '0';
893 nigel 49 c &= 255; /* Take least significant 8 bits */
894 nigel 3 break;
895    
896 nigel 49 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
897     which can be greater than 0xff, but only if the ddd are hex digits. */
898 nigel 3
899     case 'x':
900 nigel 49 #ifdef SUPPORT_UTF8
901     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
902     {
903     const uschar *pt = ptr + 2;
904     register int count = 0;
905     c = 0;
906 nigel 69 while ((digitab[*pt] & ctype_xdigit) != 0)
907 nigel 49 {
908 nigel 69 int cc = *pt++;
909 nigel 49 count++;
910 nigel 73 #if !EBCDIC /* ASCII coding */
911     if (cc >= 'a') cc -= 32; /* Convert to upper case */
912 nigel 69 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
913 nigel 73 #else /* EBCDIC coding */
914     if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
915     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
916     #endif
917 nigel 49 }
918     if (*pt == '}')
919     {
920     if (c < 0 || count > 8) *errorptr = ERR34;
921     ptr = pt;
922     break;
923     }
924     /* If the sequence of hex digits does not end with '}', then we don't
925     recognize this construct; fall through to the normal \x handling. */
926     }
927     #endif
928    
929     /* Read just a single hex char */
930    
931 nigel 3 c = 0;
932 nigel 69 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
933 nigel 3 {
934 nigel 71 int cc; /* Some compilers don't like ++ */
935     cc = *(++ptr); /* in initializers */
936 nigel 73 #if !EBCDIC /* ASCII coding */
937 nigel 69 if (cc >= 'a') cc -= 32; /* Convert to upper case */
938     c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
939 nigel 73 #else /* EBCDIC coding */
940     if (cc <= 'z') cc += 64; /* Convert to upper case */
941     c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
942     #endif
943 nigel 3 }
944     break;
945    
946 nigel 49 /* Other special escapes not starting with a digit are straightforward */
947    
948 nigel 3 case 'c':
949     c = *(++ptr);
950     if (c == 0)
951     {
952     *errorptr = ERR2;
953     return 0;
954     }
955    
956 nigel 69 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
957 nigel 73 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
958     (However, an EBCDIC equivalent has now been added.) */
959 nigel 3
960 nigel 73 #if !EBCDIC /* ASCII coding */
961 nigel 69 if (c >= 'a' && c <= 'z') c -= 32;
962 nigel 3 c ^= 0x40;
963 nigel 73 #else /* EBCDIC coding */
964     if (c >= 'a' && c <= 'z') c += 64;
965     c ^= 0xC0;
966     #endif
967 nigel 3 break;
968    
969     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
970     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
971 nigel 25 for Perl compatibility, it is a literal. This code looks a bit odd, but
972     there used to be some cases other than the default, and there may be again
973     in future, so I haven't "optimized" it. */
974 nigel 3
975     default:
976     if ((options & PCRE_EXTRA) != 0) switch(c)
977     {
978     default:
979     *errorptr = ERR3;
980     break;
981     }
982     break;
983     }
984     }
985    
986     *ptrptr = ptr;
987     return c;
988     }
989    
990    
991    
992     /*************************************************
993     * Check for counted repeat *
994     *************************************************/
995    
996     /* This function is called when a '{' is encountered in a place where it might
997     start a quantifier. It looks ahead to see if it really is a quantifier or not.
998     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
999     where the ddds are digits.
1000    
1001     Arguments:
1002     p pointer to the first char after '{'
1003    
1004     Returns: TRUE or FALSE
1005     */
1006    
1007     static BOOL
1008 nigel 71 is_counted_repeat(const uschar *p)
1009 nigel 3 {
1010 nigel 73 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1011 nigel 69 while ((digitab[*p] & ctype_digit) != 0) p++;
1012 nigel 3 if (*p == '}') return TRUE;
1013    
1014     if (*p++ != ',') return FALSE;
1015     if (*p == '}') return TRUE;
1016    
1017 nigel 73 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018 nigel 69 while ((digitab[*p] & ctype_digit) != 0) p++;
1019    
1020 nigel 3 return (*p == '}');
1021     }
1022    
1023    
1024    
1025     /*************************************************
1026     * Read repeat counts *
1027     *************************************************/
1028    
1029     /* Read an item of the form {n,m} and return the values. This is called only
1030     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1031     so the syntax is guaranteed to be correct, but we need to check the values.
1032    
1033     Arguments:
1034     p pointer to first char after '{'
1035     minp pointer to int for min
1036     maxp pointer to int for max
1037     returned as -1 if no max
1038     errorptr points to pointer to error message
1039    
1040     Returns: pointer to '}' on success;
1041     current ptr on error, with errorptr set
1042     */
1043    
1044 nigel 7 static const uschar *
1045 nigel 71 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1046 nigel 3 {
1047     int min = 0;
1048     int max = -1;
1049    
1050 nigel 69 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1051 nigel 3
1052     if (*p == '}') max = min; else
1053     {
1054     if (*(++p) != '}')
1055     {
1056     max = 0;
1057 nigel 69 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1058 nigel 3 if (max < min)
1059     {
1060     *errorptr = ERR4;
1061     return p;
1062     }
1063     }
1064     }
1065    
1066     /* Do paranoid checks, then fill in the required variables, and pass back the
1067     pointer to the terminating '}'. */
1068    
1069     if (min > 65535 || max > 65535)
1070     *errorptr = ERR5;
1071     else
1072     {
1073     *minp = min;
1074     *maxp = max;
1075     }
1076     return p;
1077     }
1078    
1079    
1080    
1081     /*************************************************
1082 nigel 63 * Find first significant op code *
1083     *************************************************/
1084    
1085     /* This is called by several functions that scan a compiled expression looking
1086     for a fixed first character, or an anchoring op code etc. It skips over things
1087     that do not influence this. For some calls, a change of option is important.
1088    
1089     Arguments:
1090     code pointer to the start of the group
1091     options pointer to external options
1092     optbit the option bit whose changing is significant, or
1093     zero if none are
1094    
1095     Returns: pointer to the first significant opcode
1096     */
1097    
1098     static const uschar*
1099     first_significant_code(const uschar *code, int *options, int optbit)
1100     {
1101     for (;;)
1102     {
1103     switch ((int)*code)
1104     {
1105     case OP_OPT:
1106     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1107     *options = (int)code[1];
1108     code += 2;
1109     break;
1110    
1111     case OP_ASSERT_NOT:
1112     case OP_ASSERTBACK:
1113     case OP_ASSERTBACK_NOT:
1114     do code += GET(code, 1); while (*code == OP_ALT);
1115     /* Fall through */
1116    
1117     case OP_CALLOUT:
1118     case OP_CREF:
1119     case OP_BRANUMBER:
1120     case OP_WORD_BOUNDARY:
1121     case OP_NOT_WORD_BOUNDARY:
1122     code += OP_lengths[*code];
1123     break;
1124    
1125     default:
1126     return code;
1127     }
1128     }
1129     /* Control never reaches here */
1130     }
1131    
1132    
1133    
1134    
1135     /*************************************************
1136 nigel 23 * Find the fixed length of a pattern *
1137     *************************************************/
1138    
1139     /* Scan a pattern and compute the fixed length of subject that will match it,
1140     if the length is fixed. This is needed for dealing with backward assertions.
1141 nigel 63 In UTF8 mode, the result is in characters rather than bytes.
1142 nigel 23
1143     Arguments:
1144     code points to the start of the pattern (the bracket)
1145 nigel 49 options the compiling options
1146 nigel 23
1147 nigel 63 Returns: the fixed length, or -1 if there is no fixed length,
1148     or -2 if \C was encountered
1149 nigel 23 */
1150    
1151     static int
1152 nigel 49 find_fixedlength(uschar *code, int options)
1153 nigel 23 {
1154     int length = -1;
1155    
1156     register int branchlength = 0;
1157 nigel 63 register uschar *cc = code + 1 + LINK_SIZE;
1158 nigel 23
1159     /* Scan along the opcodes for this branch. If we get to the end of the
1160     branch, check the length against that of the other branches. */
1161    
1162     for (;;)
1163     {
1164     int d;
1165     register int op = *cc;
1166     if (op >= OP_BRA) op = OP_BRA;
1167    
1168     switch (op)
1169     {
1170     case OP_BRA:
1171     case OP_ONCE:
1172     case OP_COND:
1173 nigel 49 d = find_fixedlength(cc, options);
1174 nigel 63 if (d < 0) return d;
1175 nigel 23 branchlength += d;
1176 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1177     cc += 1 + LINK_SIZE;
1178 nigel 23 break;
1179    
1180     /* Reached end of a branch; if it's a ket it is the end of a nested
1181     call. If it's ALT it is an alternation in a nested call. If it is
1182     END it's the end of the outer call. All can be handled by the same code. */
1183    
1184     case OP_ALT:
1185     case OP_KET:
1186     case OP_KETRMAX:
1187     case OP_KETRMIN:
1188     case OP_END:
1189     if (length < 0) length = branchlength;
1190     else if (length != branchlength) return -1;
1191     if (*cc != OP_ALT) return length;
1192 nigel 63 cc += 1 + LINK_SIZE;
1193 nigel 23 branchlength = 0;
1194     break;
1195    
1196     /* Skip over assertive subpatterns */
1197    
1198     case OP_ASSERT:
1199     case OP_ASSERT_NOT:
1200     case OP_ASSERTBACK:
1201     case OP_ASSERTBACK_NOT:
1202 nigel 63 do cc += GET(cc, 1); while (*cc == OP_ALT);
1203     /* Fall through */
1204 nigel 23
1205     /* Skip over things that don't match chars */
1206    
1207     case OP_REVERSE:
1208 nigel 53 case OP_BRANUMBER:
1209     case OP_CREF:
1210 nigel 23 case OP_OPT:
1211 nigel 63 case OP_CALLOUT:
1212 nigel 23 case OP_SOD:
1213 nigel 63 case OP_SOM:
1214 nigel 23 case OP_EOD:
1215     case OP_EODN:
1216     case OP_CIRC:
1217     case OP_DOLL:
1218     case OP_NOT_WORD_BOUNDARY:
1219     case OP_WORD_BOUNDARY:
1220 nigel 63 cc += OP_lengths[*cc];
1221 nigel 23 break;
1222    
1223 nigel 49 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
1224     This requires a scan of the string, unfortunately. We assume valid UTF-8
1225 nigel 63 strings, so all we do is reduce the length by one for every byte whose bits
1226     are 10xxxxxx. */
1227 nigel 23
1228     case OP_CHARS:
1229     branchlength += *(++cc);
1230 nigel 49 #ifdef SUPPORT_UTF8
1231 nigel 63 if ((options & PCRE_UTF8) != 0)
1232     for (d = 1; d <= *cc; d++)
1233     if ((cc[d] & 0xc0) == 0x80) branchlength--;
1234 nigel 49 #endif
1235 nigel 23 cc += *cc + 1;
1236     break;
1237    
1238 nigel 63 /* Handle exact repetitions. The count is already in characters, but we
1239     need to skip over a multibyte character in UTF8 mode. */
1240 nigel 23
1241     case OP_EXACT:
1242 nigel 63 branchlength += GET2(cc,1);
1243     cc += 4;
1244     #ifdef SUPPORT_UTF8
1245     if ((options & PCRE_UTF8) != 0)
1246     {
1247     while((*cc & 0x80) == 0x80) cc++;
1248     }
1249     #endif
1250     break;
1251    
1252 nigel 23 case OP_TYPEEXACT:
1253 nigel 63 branchlength += GET2(cc,1);
1254 nigel 23 cc += 4;
1255     break;
1256    
1257     /* Handle single-char matchers */
1258    
1259     case OP_NOT_DIGIT:
1260     case OP_DIGIT:
1261     case OP_NOT_WHITESPACE:
1262     case OP_WHITESPACE:
1263     case OP_NOT_WORDCHAR:
1264     case OP_WORDCHAR:
1265     case OP_ANY:
1266     branchlength++;
1267     cc++;
1268     break;
1269    
1270 nigel 63 /* The single-byte matcher isn't allowed */
1271 nigel 23
1272 nigel 63 case OP_ANYBYTE:
1273     return -2;
1274    
1275 nigel 23 /* Check a class for variable quantification */
1276    
1277 nigel 63 #ifdef SUPPORT_UTF8
1278     case OP_XCLASS:
1279     cc += GET(cc, 1) - 33;
1280     /* Fall through */
1281     #endif
1282    
1283 nigel 23 case OP_CLASS:
1284 nigel 63 case OP_NCLASS:
1285 nigel 53 cc += 33;
1286 nigel 23
1287     switch (*cc)
1288     {
1289     case OP_CRSTAR:
1290     case OP_CRMINSTAR:
1291     case OP_CRQUERY:
1292     case OP_CRMINQUERY:
1293     return -1;
1294    
1295     case OP_CRRANGE:
1296     case OP_CRMINRANGE:
1297 nigel 63 if (GET2(cc,1) != GET2(cc,3)) return -1;
1298     branchlength += GET2(cc,1);
1299 nigel 23 cc += 5;
1300     break;
1301    
1302     default:
1303     branchlength++;
1304     }
1305     break;
1306    
1307     /* Anything else is variable length */
1308    
1309     default:
1310     return -1;
1311     }
1312     }
1313     /* Control never gets here */
1314     }
1315    
1316    
1317    
1318    
1319     /*************************************************
1320 nigel 63 * Scan compiled regex for numbered bracket *
1321     *************************************************/
1322    
1323     /* This little function scans through a compiled pattern until it finds a
1324     capturing bracket with the given number.
1325    
1326     Arguments:
1327     code points to start of expression
1328     utf8 TRUE in UTF-8 mode
1329     number the required bracket number
1330    
1331     Returns: pointer to the opcode for the bracket, or NULL if not found
1332     */
1333    
1334     static const uschar *
1335     find_bracket(const uschar *code, BOOL utf8, int number)
1336     {
1337 nigel 65 #ifndef SUPPORT_UTF8
1338     utf8 = utf8; /* Stop pedantic compilers complaining */
1339     #endif
1340    
1341 nigel 63 for (;;)
1342     {
1343     register int c = *code;
1344     if (c == OP_END) return NULL;
1345     else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1346     else if (c > OP_BRA)
1347     {
1348     int n = c - OP_BRA;
1349     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1350     if (n == number) return (uschar *)code;
1351     code += OP_lengths[OP_BRA];
1352     }
1353     else
1354     {
1355     code += OP_lengths[c];
1356    
1357 nigel 73 #ifdef SUPPORT_UTF8
1358    
1359 nigel 63 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1360     by a multi-byte character. The length in the table is a minimum, so we have
1361     to scan along to skip the extra characters. All opcodes are less than 128,
1362     so we can use relatively efficient code. */
1363    
1364 nigel 73 if (utf8) switch(c)
1365     {
1366     case OP_EXACT:
1367     case OP_UPTO:
1368     case OP_MINUPTO:
1369     case OP_STAR:
1370     case OP_MINSTAR:
1371     case OP_PLUS:
1372     case OP_MINPLUS:
1373     case OP_QUERY:
1374     case OP_MINQUERY:
1375     while ((*code & 0xc0) == 0x80) code++;
1376     break;
1377    
1378     /* XCLASS is used for classes that cannot be represented just by a bit
1379     map. This includes negated single high-valued characters. The length in
1380     the table is zero; the actual length is stored in the compled code. */
1381    
1382     case OP_XCLASS:
1383     code += GET(code, 1) + 1;
1384     break;
1385     }
1386     #endif
1387     }
1388     }
1389     }
1390    
1391    
1392    
1393     /*************************************************
1394     * Scan compiled regex for recursion reference *
1395     *************************************************/
1396    
1397     /* This little function scans through a compiled pattern until it finds an
1398     instance of OP_RECURSE.
1399    
1400     Arguments:
1401     code points to start of expression
1402     utf8 TRUE in UTF-8 mode
1403    
1404     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1405     */
1406    
1407     static const uschar *
1408     find_recurse(const uschar *code, BOOL utf8)
1409     {
1410     #ifndef SUPPORT_UTF8
1411     utf8 = utf8; /* Stop pedantic compilers complaining */
1412     #endif
1413    
1414     for (;;)
1415     {
1416     register int c = *code;
1417     if (c == OP_END) return NULL;
1418     else if (c == OP_RECURSE) return code;
1419     else if (c == OP_CHARS) code += code[1] + OP_lengths[c];
1420     else if (c > OP_BRA)
1421     {
1422     code += OP_lengths[OP_BRA];
1423     }
1424     else
1425     {
1426     code += OP_lengths[c];
1427    
1428 nigel 63 #ifdef SUPPORT_UTF8
1429 nigel 73
1430     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1431     by a multi-byte character. The length in the table is a minimum, so we have
1432     to scan along to skip the extra characters. All opcodes are less than 128,
1433     so we can use relatively efficient code. */
1434    
1435 nigel 63 if (utf8) switch(c)
1436     {
1437     case OP_EXACT:
1438     case OP_UPTO:
1439     case OP_MINUPTO:
1440     case OP_STAR:
1441     case OP_MINSTAR:
1442     case OP_PLUS:
1443     case OP_MINPLUS:
1444     case OP_QUERY:
1445     case OP_MINQUERY:
1446     while ((*code & 0xc0) == 0x80) code++;
1447     break;
1448 nigel 73
1449     /* XCLASS is used for classes that cannot be represented just by a bit
1450     map. This includes negated single high-valued characters. The length in
1451     the table is zero; the actual length is stored in the compled code. */
1452    
1453     case OP_XCLASS:
1454     code += GET(code, 1) + 1;
1455     break;
1456 nigel 63 }
1457     #endif
1458     }
1459     }
1460     }
1461    
1462    
1463    
1464     /*************************************************
1465     * Scan compiled branch for non-emptiness *
1466     *************************************************/
1467    
1468     /* This function scans through a branch of a compiled pattern to see whether it
1469     can match the empty string or not. It is called only from could_be_empty()
1470     below. Note that first_significant_code() skips over assertions. If we hit an
1471     unclosed bracket, we return "empty" - this means we've struck an inner bracket
1472     whose current branch will already have been scanned.
1473    
1474     Arguments:
1475     code points to start of search
1476     endcode points to where to stop
1477     utf8 TRUE if in UTF8 mode
1478    
1479     Returns: TRUE if what is matched could be empty
1480     */
1481    
1482     static BOOL
1483     could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1484     {
1485     register int c;
1486     for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0);
1487     code < endcode;
1488     code = first_significant_code(code + OP_lengths[c], NULL, 0))
1489     {
1490     const uschar *ccode;
1491    
1492     c = *code;
1493    
1494     if (c >= OP_BRA)
1495     {
1496     BOOL empty_branch;
1497     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1498    
1499     /* Scan a closed bracket */
1500    
1501     empty_branch = FALSE;
1502     do
1503     {
1504     if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1505     empty_branch = TRUE;
1506     code += GET(code, 1);
1507     }
1508     while (*code == OP_ALT);
1509     if (!empty_branch) return FALSE; /* All branches are non-empty */
1510     code += 1 + LINK_SIZE;
1511     c = *code;
1512     }
1513    
1514     else switch (c)
1515     {
1516     /* Check for quantifiers after a class */
1517    
1518     #ifdef SUPPORT_UTF8
1519     case OP_XCLASS:
1520     ccode = code + GET(code, 1);
1521     goto CHECK_CLASS_REPEAT;
1522     #endif
1523    
1524     case OP_CLASS:
1525     case OP_NCLASS:
1526     ccode = code + 33;
1527    
1528     #ifdef SUPPORT_UTF8
1529     CHECK_CLASS_REPEAT:
1530     #endif
1531    
1532     switch (*ccode)
1533     {
1534     case OP_CRSTAR: /* These could be empty; continue */
1535     case OP_CRMINSTAR:
1536     case OP_CRQUERY:
1537     case OP_CRMINQUERY:
1538     break;
1539    
1540     default: /* Non-repeat => class must match */
1541     case OP_CRPLUS: /* These repeats aren't empty */
1542     case OP_CRMINPLUS:
1543     return FALSE;
1544    
1545     case OP_CRRANGE:
1546     case OP_CRMINRANGE:
1547     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1548     break;
1549     }
1550     break;
1551    
1552     /* Opcodes that must match a character */
1553    
1554     case OP_NOT_DIGIT:
1555     case OP_DIGIT:
1556     case OP_NOT_WHITESPACE:
1557     case OP_WHITESPACE:
1558     case OP_NOT_WORDCHAR:
1559     case OP_WORDCHAR:
1560     case OP_ANY:
1561     case OP_ANYBYTE:
1562     case OP_CHARS:
1563     case OP_NOT:
1564     case OP_PLUS:
1565     case OP_MINPLUS:
1566     case OP_EXACT:
1567     case OP_NOTPLUS:
1568     case OP_NOTMINPLUS:
1569     case OP_NOTEXACT:
1570     case OP_TYPEPLUS:
1571     case OP_TYPEMINPLUS:
1572     case OP_TYPEEXACT:
1573     return FALSE;
1574    
1575     /* End of branch */
1576    
1577     case OP_KET:
1578     case OP_KETRMAX:
1579     case OP_KETRMIN:
1580     case OP_ALT:
1581     return TRUE;
1582    
1583     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1584     followed by a multibyte character */
1585    
1586     #ifdef SUPPORT_UTF8
1587     case OP_STAR:
1588     case OP_MINSTAR:
1589     case OP_QUERY:
1590     case OP_MINQUERY:
1591     case OP_UPTO:
1592     case OP_MINUPTO:
1593     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1594     break;
1595     #endif
1596     }
1597     }
1598    
1599     return TRUE;
1600     }
1601    
1602    
1603    
1604     /*************************************************
1605     * Scan compiled regex for non-emptiness *
1606     *************************************************/
1607    
1608     /* This function is called to check for left recursive calls. We want to check
1609     the current branch of the current pattern to see if it could match the empty
1610     string. If it could, we must look outwards for branches at other levels,
1611     stopping when we pass beyond the bracket which is the subject of the recursion.
1612    
1613     Arguments:
1614     code points to start of the recursion
1615     endcode points to where to stop (current RECURSE item)
1616     bcptr points to the chain of current (unclosed) branch starts
1617     utf8 TRUE if in UTF-8 mode
1618    
1619     Returns: TRUE if what is matched could be empty
1620     */
1621    
1622     static BOOL
1623     could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1624     BOOL utf8)
1625     {
1626     while (bcptr != NULL && bcptr->current >= code)
1627     {
1628     if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1629     bcptr = bcptr->outer;
1630     }
1631     return TRUE;
1632     }
1633    
1634    
1635    
1636     /*************************************************
1637 nigel 43 * Check for POSIX class syntax *
1638     *************************************************/
1639    
1640     /* This function is called when the sequence "[:" or "[." or "[=" is
1641     encountered in a character class. It checks whether this is followed by an
1642     optional ^ and then a sequence of letters, terminated by a matching ":]" or
1643     ".]" or "=]".
1644    
1645     Argument:
1646     ptr pointer to the initial [
1647     endptr where to return the end pointer
1648     cd pointer to compile data
1649    
1650     Returns: TRUE or FALSE
1651     */
1652    
1653     static BOOL
1654     check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1655     {
1656     int terminator; /* Don't combine these lines; the Solaris cc */
1657     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1658     if (*(++ptr) == '^') ptr++;
1659     while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1660     if (*ptr == terminator && ptr[1] == ']')
1661     {
1662     *endptr = ptr;
1663     return TRUE;
1664     }
1665     return FALSE;
1666     }
1667    
1668    
1669    
1670    
1671     /*************************************************
1672     * Check POSIX class name *
1673     *************************************************/
1674    
1675     /* This function is called to check the name given in a POSIX-style class entry
1676     such as [:alnum:].
1677    
1678     Arguments:
1679     ptr points to the first letter
1680     len the length of the name
1681    
1682     Returns: a value representing the name, or -1 if unknown
1683     */
1684    
1685     static int
1686     check_posix_name(const uschar *ptr, int len)
1687     {
1688     register int yield = 0;
1689     while (posix_name_lengths[yield] != 0)
1690     {
1691     if (len == posix_name_lengths[yield] &&
1692     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1693     yield++;
1694     }
1695     return -1;
1696     }
1697    
1698    
1699 nigel 73 /*************************************************
1700     * Adjust OP_RECURSE items in repeated group *
1701     *************************************************/
1702 nigel 43
1703 nigel 73 /* OP_RECURSE items contain an offset from the start of the regex to the group
1704     that is referenced. This means that groups can be replicated for fixed
1705     repetition simply by copying (because the recursion is allowed to refer to
1706     earlier groups that are outside the current group). However, when a group is
1707     optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1708     it, after it has been compiled. This means that any OP_RECURSE items within it
1709     that refer to the group itself or any contained groups have to have their
1710     offsets adjusted. That is the job of this function. Before it is called, the
1711     partially compiled regex must be temporarily terminated with OP_END.
1712 nigel 43
1713 nigel 73 Arguments:
1714     group points to the start of the group
1715     adjust the amount by which the group is to be moved
1716     utf8 TRUE in UTF-8 mode
1717     cd contains pointers to tables etc.
1718    
1719     Returns: nothing
1720     */
1721    
1722     static void
1723     adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1724     {
1725     uschar *ptr = group;
1726     while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1727     {
1728     int offset = GET(ptr, 1);
1729     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1730     ptr += 1 + LINK_SIZE;
1731     }
1732     }
1733    
1734    
1735    
1736 nigel 43 /*************************************************
1737 nigel 3 * Compile one branch *
1738     *************************************************/
1739    
1740 nigel 63 /* Scan the pattern, compiling it into the code vector. If the options are
1741     changed during the branch, the pointer is used to change the external options
1742     bits.
1743 nigel 3
1744     Arguments:
1745 nigel 63 optionsptr pointer to the option bits
1746     brackets points to number of extracting brackets used
1747     codeptr points to the pointer to the current code point
1748     ptrptr points to the current pattern pointer
1749     errorptr points to pointer to error message
1750     firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1751     reqbyteptr set to the last literal character required, else < 0
1752     bcptr points to current branch chain
1753     cd contains pointers to tables etc.
1754 nigel 3
1755 nigel 63 Returns: TRUE on success
1756     FALSE, with *errorptr set on error
1757 nigel 3 */
1758    
1759     static BOOL
1760 nigel 63 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1761     const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
1762     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1763 nigel 3 {
1764     int repeat_type, op_type;
1765 nigel 63 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1766     int bravalue = 0;
1767     int length;
1768 nigel 19 int greedy_default, greedy_non_default;
1769 nigel 63 int firstbyte, reqbyte;
1770     int zeroreqbyte, zerofirstbyte;
1771 nigel 65 int req_caseopt, reqvary, tempreqvary;
1772 nigel 37 int condcount = 0;
1773 nigel 63 int options = *optionsptr;
1774 nigel 3 register int c;
1775     register uschar *code = *codeptr;
1776 nigel 23 uschar *tempcode;
1777 nigel 63 BOOL inescq = FALSE;
1778     BOOL groupsetfirstbyte = FALSE;
1779 nigel 7 const uschar *ptr = *ptrptr;
1780 nigel 23 const uschar *tempptr;
1781 nigel 3 uschar *previous = NULL;
1782     uschar class[32];
1783    
1784 nigel 63 #ifdef SUPPORT_UTF8
1785     BOOL class_utf8;
1786     BOOL utf8 = (options & PCRE_UTF8) != 0;
1787     uschar *class_utf8data;
1788     uschar utf8_char[6];
1789     #else
1790     BOOL utf8 = FALSE;
1791     #endif
1792    
1793 nigel 19 /* Set up the default and non-default settings for greediness */
1794    
1795     greedy_default = ((options & PCRE_UNGREEDY) != 0);
1796     greedy_non_default = greedy_default ^ 1;
1797    
1798 nigel 63 /* Initialize no first char, no required char. REQ_UNSET means "no char
1799     matching encountered yet". It gets changed to REQ_NONE if we hit something that
1800     matches a non-fixed char first char; reqbyte just remains unset if we never
1801     find one.
1802 nigel 37
1803 nigel 63 When we hit a repeat whose minimum is zero, we may have to adjust these values
1804     to take the zero repeat into account. This is implemented by setting them to
1805     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1806     item types that can be repeated set these backoff variables appropriately. */
1807 nigel 37
1808 nigel 63 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1809    
1810     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1811     according to the current setting of the caseless flag. REQ_CASELESS is a bit
1812     value > 255. It is added into the firstbyte or reqbyte variables to record the
1813     case status of the value. */
1814    
1815     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1816    
1817 nigel 3 /* Switch on next character until the end of the branch */
1818    
1819     for (;; ptr++)
1820     {
1821     BOOL negate_class;
1822 nigel 63 BOOL possessive_quantifier;
1823 nigel 23 int class_charcount;
1824     int class_lastchar;
1825     int newoptions;
1826 nigel 63 int recno;
1827 nigel 53 int skipbytes;
1828 nigel 63 int subreqbyte;
1829     int subfirstbyte;
1830 nigel 3
1831     c = *ptr;
1832 nigel 63 if (inescq && c != 0) goto NORMAL_CHAR;
1833    
1834 nigel 3 if ((options & PCRE_EXTENDED) != 0)
1835     {
1836 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1837 nigel 3 if (c == '#')
1838     {
1839 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
1840     on the Macintosh. */
1841 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1842 nigel 63 if (c != 0) continue; /* Else fall through to handle end of string */
1843 nigel 3 }
1844     }
1845    
1846     switch(c)
1847     {
1848     /* The branch terminates at end of string, |, or ). */
1849    
1850     case 0:
1851     case '|':
1852     case ')':
1853 nigel 63 *firstbyteptr = firstbyte;
1854     *reqbyteptr = reqbyte;
1855 nigel 3 *codeptr = code;
1856     *ptrptr = ptr;
1857     return TRUE;
1858    
1859 nigel 63 /* Handle single-character metacharacters. In multiline mode, ^ disables
1860     the setting of any following char as a first character. */
1861 nigel 3
1862     case '^':
1863 nigel 63 if ((options & PCRE_MULTILINE) != 0)
1864     {
1865     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1866     }
1867 nigel 3 previous = NULL;
1868     *code++ = OP_CIRC;
1869     break;
1870    
1871     case '$':
1872     previous = NULL;
1873     *code++ = OP_DOLL;
1874     break;
1875    
1876 nigel 63 /* There can never be a first char if '.' is first, whatever happens about
1877     repeats. The value of reqbyte doesn't change either. */
1878    
1879 nigel 3 case '.':
1880 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1881     zerofirstbyte = firstbyte;
1882     zeroreqbyte = reqbyte;
1883 nigel 3 previous = code;
1884     *code++ = OP_ANY;
1885     break;
1886    
1887 nigel 63 /* Character classes. If the included characters are all < 255 in value, we
1888     build a 32-byte bitmap of the permitted characters, except in the special
1889     case where there is only one such character. For negated classes, we build
1890     the map as usual, then invert it at the end. However, we use a different
1891     opcode so that data characters > 255 can be handled correctly.
1892    
1893     If the class contains characters outside the 0-255 range, a different
1894     opcode is compiled. It may optionally have a bit map for characters < 256,
1895     but those above are explicitly listed afterwards. A flag byte tells
1896     whether the bitmap is present, and whether this is a negated class or not.
1897 nigel 3 */
1898    
1899     case '[':
1900     previous = code;
1901    
1902 nigel 63 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1903     they are encountered at the top level, so we'll do that too. */
1904    
1905     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1906     check_posix_syntax(ptr, &tempptr, cd))
1907     {
1908     *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
1909     goto FAILED;
1910     }
1911    
1912 nigel 23 /* If the first character is '^', set the negation flag and skip it. */
1913 nigel 3
1914     if ((c = *(++ptr)) == '^')
1915     {
1916     negate_class = TRUE;
1917     c = *(++ptr);
1918     }
1919 nigel 63 else
1920     {
1921     negate_class = FALSE;
1922     }
1923 nigel 3
1924 nigel 63 /* Keep a count of chars with values < 256 so that we can optimize the case
1925     of just a single character (as long as it's < 256). For higher valued UTF-8
1926     characters, we don't yet do any optimization. */
1927 nigel 3
1928     class_charcount = 0;
1929     class_lastchar = -1;
1930    
1931 nigel 63 #ifdef SUPPORT_UTF8
1932     class_utf8 = FALSE; /* No chars >= 256 */
1933     class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1934     #endif
1935    
1936 nigel 3 /* Initialize the 32-char bit map to all zeros. We have to build the
1937     map in a temporary bit of store, in case the class contains only 1
1938 nigel 63 character (< 256), because in that case the compiled code doesn't use the
1939 nigel 3 bit map. */
1940    
1941     memset(class, 0, 32 * sizeof(uschar));
1942    
1943     /* Process characters until ] is reached. By writing this as a "do" it
1944 nigel 63 means that an initial ] is taken as a data character. The first pass
1945     through the regex checked the overall syntax, so we don't need to be very
1946     strict here. At the start of the loop, c contains the first byte of the
1947     character. */
1948 nigel 3
1949     do
1950     {
1951 nigel 63 #ifdef SUPPORT_UTF8
1952 nigel 67 if (utf8 && c > 127)
1953     { /* Braces are required because the */
1954     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1955     }
1956 nigel 63 #endif
1957    
1958     /* Inside \Q...\E everything is literal except \E */
1959    
1960     if (inescq)
1961 nigel 3 {
1962 nigel 63 if (c == '\\' && ptr[1] == 'E')
1963     {
1964     inescq = FALSE;
1965     ptr++;
1966     continue;
1967     }
1968     else goto LONE_SINGLE_CHARACTER;
1969 nigel 3 }
1970    
1971 nigel 43 /* Handle POSIX class names. Perl allows a negation extension of the
1972 nigel 63 form [:^name:]. A square bracket that doesn't match the syntax is
1973 nigel 43 treated as a literal. We also recognize the POSIX constructions
1974     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1975 nigel 63 5.6 and 5.8 do. */
1976 nigel 43
1977     if (c == '[' &&
1978     (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1979     check_posix_syntax(ptr, &tempptr, cd))
1980     {
1981     BOOL local_negate = FALSE;
1982     int posix_class, i;
1983     register const uschar *cbits = cd->cbits;
1984    
1985     if (ptr[1] != ':')
1986     {
1987     *errorptr = ERR31;
1988     goto FAILED;
1989     }
1990    
1991     ptr += 2;
1992     if (*ptr == '^')
1993     {
1994     local_negate = TRUE;
1995     ptr++;
1996     }
1997    
1998     posix_class = check_posix_name(ptr, tempptr - ptr);
1999     if (posix_class < 0)
2000     {
2001     *errorptr = ERR30;
2002     goto FAILED;
2003     }
2004    
2005     /* If matching is caseless, upper and lower are converted to
2006     alpha. This relies on the fact that the class table starts with
2007     alpha, lower, upper as the first 3 entries. */
2008    
2009     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2010     posix_class = 0;
2011    
2012     /* Or into the map we are building up to 3 of the static class
2013 nigel 63 tables, or their negations. The [:blank:] class sets up the same
2014     chars as the [:space:] class (all white space). We remove the vertical
2015     white space chars afterwards. */
2016 nigel 43
2017     posix_class *= 3;
2018     for (i = 0; i < 3; i++)
2019     {
2020 nigel 73 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2021 nigel 43 int taboffset = posix_class_maps[posix_class + i];
2022     if (taboffset < 0) break;
2023     if (local_negate)
2024 nigel 63 {
2025 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
2026 nigel 73 if (blankclass) class[1] |= 0x3c;
2027 nigel 63 }
2028 nigel 43 else
2029 nigel 63 {
2030 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
2031 nigel 73 if (blankclass) class[1] &= ~0x3c;
2032 nigel 63 }
2033 nigel 43 }
2034    
2035     ptr = tempptr + 1;
2036     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2037 nigel 63 continue; /* End of POSIX syntax handling */
2038 nigel 43 }
2039    
2040 nigel 3 /* Backslash may introduce a single character, or it may introduce one
2041     of the specials, which just set a flag. Escaped items are checked for
2042     validity in the pre-compiling pass. The sequence \b is a special case.
2043 nigel 7 Inside a class (and only there) it is treated as backspace. Elsewhere
2044 nigel 3 it marks a word boundary. Other escapes have preset maps ready to
2045     or into the one we are building. We assume they have more than one
2046 nigel 63 character in them, so set class_charcount bigger than one. */
2047 nigel 3
2048     if (c == '\\')
2049     {
2050 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2051 nigel 63 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2052    
2053     if (-c == ESC_Q) /* Handle start of quoted string */
2054     {
2055     if (ptr[1] == '\\' && ptr[2] == 'E')
2056     {
2057     ptr += 2; /* avoid empty string */
2058     }
2059     else inescq = TRUE;
2060     continue;
2061     }
2062    
2063 nigel 3 else if (c < 0)
2064     {
2065 nigel 25 register const uschar *cbits = cd->cbits;
2066 nigel 63 class_charcount = 10; /* Greater than 1 is what matters */
2067 nigel 3 switch (-c)
2068     {
2069     case ESC_d:
2070 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
2071 nigel 3 continue;
2072    
2073     case ESC_D:
2074 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
2075 nigel 3 continue;
2076    
2077     case ESC_w:
2078 nigel 43 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
2079 nigel 3 continue;
2080    
2081     case ESC_W:
2082 nigel 43 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
2083 nigel 3 continue;
2084    
2085     case ESC_s:
2086 nigel 25 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
2087 nigel 63 class[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2088 nigel 3 continue;
2089    
2090     case ESC_S:
2091 nigel 25 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
2092 nigel 63 class[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2093 nigel 3 continue;
2094    
2095 nigel 63 /* Unrecognized escapes are faulted if PCRE is running in its
2096     strict mode. By default, for compatibility with Perl, they are
2097     treated as literals. */
2098    
2099 nigel 3 default:
2100 nigel 63 if ((options & PCRE_EXTRA) != 0)
2101     {
2102     *errorptr = ERR7;
2103     goto FAILED;
2104     }
2105     c = *ptr; /* The final character */
2106 nigel 3 }
2107     }
2108 nigel 49
2109 nigel 63 /* Fall through if we have a single character (c >= 0). This may be
2110     > 256 in UTF-8 mode. */
2111 nigel 49
2112 nigel 63 } /* End of backslash handling */
2113 nigel 3
2114     /* A single character may be followed by '-' to form a range. However,
2115     Perl does not permit ']' to be the end of the range. A '-' character
2116     here is treated as a literal. */
2117    
2118     if (ptr[1] == '-' && ptr[2] != ']')
2119     {
2120     int d;
2121     ptr += 2;
2122    
2123 nigel 63 #ifdef SUPPORT_UTF8
2124     if (utf8)
2125     { /* Braces are required because the */
2126     GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2127 nigel 3 }
2128 nigel 63 else
2129     #endif
2130     d = *ptr;
2131 nigel 3
2132     /* The second part of a range can be a single-character escape, but
2133 nigel 49 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2134     in such circumstances. */
2135 nigel 3
2136     if (d == '\\')
2137     {
2138 nigel 49 const uschar *oldptr = ptr;
2139 nigel 71 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2140 nigel 49
2141     /* \b is backspace; any other special means the '-' was literal */
2142    
2143 nigel 3 if (d < 0)
2144     {
2145     if (d == -ESC_b) d = '\b'; else
2146     {
2147 nigel 49 ptr = oldptr - 2;
2148 nigel 63 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2149 nigel 3 }
2150     }
2151     }
2152    
2153 nigel 63 /* Check that the two values are in the correct order */
2154    
2155 nigel 3 if (d < c)
2156     {
2157     *errorptr = ERR8;
2158     goto FAILED;
2159     }
2160    
2161 nigel 63 /* If d is greater than 255, we can't just use the bit map, so set up
2162     for the UTF-8 supporting class type. If we are not caseless, we can
2163     just set up a single range. If we are caseless, the characters < 256
2164     are handled with a bitmap, in order to get the case-insensitive
2165     handling. */
2166    
2167     #ifdef SUPPORT_UTF8
2168     if (d > 255)
2169     {
2170     class_utf8 = TRUE;
2171     *class_utf8data++ = XCL_RANGE;
2172     if ((options & PCRE_CASELESS) == 0)
2173     {
2174     class_utf8data += ord2utf8(c, class_utf8data);
2175     class_utf8data += ord2utf8(d, class_utf8data);
2176     continue; /* Go get the next char in the class */
2177     }
2178     class_utf8data += ord2utf8(256, class_utf8data);
2179     class_utf8data += ord2utf8(d, class_utf8data);
2180     d = 255;
2181     /* Fall through */
2182     }
2183     #endif
2184     /* We use the bit map if the range is entirely < 255, or if part of it
2185     is < 255 and matching is caseless. */
2186    
2187 nigel 3 for (; c <= d; c++)
2188     {
2189     class[c/8] |= (1 << (c&7));
2190     if ((options & PCRE_CASELESS) != 0)
2191     {
2192 nigel 25 int uc = cd->fcc[c]; /* flip case */
2193 nigel 3 class[uc/8] |= (1 << (uc&7));
2194     }
2195     class_charcount++; /* in case a one-char range */
2196     class_lastchar = c;
2197     }
2198 nigel 63
2199 nigel 3 continue; /* Go get the next char in the class */
2200     }
2201    
2202     /* Handle a lone single character - we can get here for a normal
2203     non-escape char, or after \ that introduces a single character. */
2204    
2205 nigel 63 LONE_SINGLE_CHARACTER:
2206 nigel 49
2207 nigel 63 /* Handle a multibyte character */
2208    
2209     #ifdef SUPPORT_UTF8
2210     if (utf8 && c > 255)
2211 nigel 3 {
2212 nigel 63 class_utf8 = TRUE;
2213     *class_utf8data++ = XCL_SINGLE;
2214     class_utf8data += ord2utf8(c, class_utf8data);
2215 nigel 3 }
2216 nigel 63 else
2217     #endif
2218     /* Handle a single-byte character */
2219     {
2220     class [c/8] |= (1 << (c&7));
2221     if ((options & PCRE_CASELESS) != 0)
2222     {
2223     c = cd->fcc[c]; /* flip case */
2224     class[c/8] |= (1 << (c&7));
2225     }
2226     class_charcount++;
2227     class_lastchar = c;
2228     }
2229 nigel 3 }
2230    
2231     /* Loop until ']' reached; the check for end of string happens inside the
2232     loop. This "while" is the end of the "do" above. */
2233    
2234 nigel 63 while ((c = *(++ptr)) != ']' || inescq);
2235 nigel 3
2236 nigel 63 /* If class_charcount is 1, we saw precisely one character with a value <
2237     256. In UTF-8 mode, we can optimize if there were no characters >= 256 and
2238     the one character is < 128. In non-UTF-8 mode we can always optimize.
2239 nigel 3
2240 nigel 63 The optimization throws away the bit map. We turn the item into a
2241     1-character OP_CHARS if it's positive, or OP_NOT if it's negative. Note
2242     that OP_NOT does not support multibyte characters. In the positive case, it
2243     can cause firstbyte to be set. Otherwise, there can be no first char if
2244     this item is first, whatever repeat count may follow. In the case of
2245     reqbyte, save the previous value for reinstating. */
2246    
2247     #ifdef SUPPORT_UTF8
2248 nigel 67 if (class_charcount == 1 &&
2249     (!utf8 ||
2250     (!class_utf8 && class_lastchar < 128)))
2251 nigel 63 #else
2252     if (class_charcount == 1)
2253     #endif
2254 nigel 3 {
2255 nigel 63 zeroreqbyte = reqbyte;
2256 nigel 3 if (negate_class)
2257     {
2258 nigel 63 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2259     zerofirstbyte = firstbyte;
2260     *code++ = OP_NOT;
2261 nigel 3 }
2262     else
2263     {
2264 nigel 63 if (firstbyte == REQ_UNSET)
2265     {
2266     zerofirstbyte = REQ_NONE;
2267     firstbyte = class_lastchar | req_caseopt;
2268     }
2269     else
2270     {
2271     zerofirstbyte = firstbyte;
2272 nigel 65 reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
2273 nigel 63 }
2274     *code++ = OP_CHARS;
2275 nigel 3 *code++ = 1;
2276     }
2277     *code++ = class_lastchar;
2278 nigel 63 break; /* End of class handling */
2279     } /* End of 1-byte optimization */
2280    
2281     /* Otherwise, if this is the first thing in the branch, there can be no
2282     first char setting, whatever the repeat count. Any reqbyte setting must
2283     remain unchanged after any kind of repeat. */
2284    
2285     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2286     zerofirstbyte = firstbyte;
2287     zeroreqbyte = reqbyte;
2288    
2289     /* If there are characters with values > 255, we have to compile an
2290     extended class, with its own opcode. If there are no characters < 256,
2291     we can omit the bitmap. */
2292    
2293     #ifdef SUPPORT_UTF8
2294     if (class_utf8)
2295     {
2296     *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2297     *code++ = OP_XCLASS;
2298     code += LINK_SIZE;
2299     *code = negate_class? XCL_NOT : 0;
2300    
2301     /* If the map is required, install it, and move on to the end of
2302     the extra data */
2303    
2304     if (class_charcount > 0)
2305     {
2306     *code++ |= XCL_MAP;
2307     memcpy(code, class, 32);
2308     code = class_utf8data;
2309     }
2310    
2311     /* If the map is not required, slide down the extra data. */
2312    
2313     else
2314     {
2315     int len = class_utf8data - (code + 33);
2316     memmove(code + 1, code + 33, len);
2317     code += len + 1;
2318     }
2319    
2320     /* Now fill in the complete length of the item */
2321    
2322     PUT(previous, 1, code - previous);
2323     break; /* End of class handling */
2324 nigel 3 }
2325 nigel 63 #endif
2326 nigel 3
2327 nigel 63 /* If there are no characters > 255, negate the 32-byte map if necessary,
2328     and copy it into the code vector. If this is the first thing in the branch,
2329     there can be no first char setting, whatever the repeat count. Any reqbyte
2330     setting must remain unchanged after any kind of repeat. */
2331 nigel 3
2332 nigel 63 if (negate_class)
2333     {
2334     *code++ = OP_NCLASS;
2335     for (c = 0; c < 32; c++) code[c] = ~class[c];
2336     }
2337 nigel 3 else
2338     {
2339 nigel 63 *code++ = OP_CLASS;
2340     memcpy(code, class, 32);
2341 nigel 3 }
2342 nigel 63 code += 32;
2343 nigel 3 break;
2344    
2345     /* Various kinds of repeat */
2346    
2347     case '{':
2348 nigel 71 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
2349     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2350 nigel 3 if (*errorptr != NULL) goto FAILED;
2351     goto REPEAT;
2352    
2353     case '*':
2354     repeat_min = 0;
2355     repeat_max = -1;
2356     goto REPEAT;
2357    
2358     case '+':
2359     repeat_min = 1;
2360     repeat_max = -1;
2361     goto REPEAT;
2362    
2363     case '?':
2364     repeat_min = 0;
2365     repeat_max = 1;
2366    
2367     REPEAT:
2368     if (previous == NULL)
2369     {
2370     *errorptr = ERR9;
2371     goto FAILED;
2372     }
2373    
2374 nigel 63 if (repeat_min == 0)
2375     {
2376 nigel 65 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2377     reqbyte = zeroreqbyte; /* Ditto */
2378 nigel 63 }
2379 nigel 3
2380 nigel 65 /* Remember whether this is a variable length repeat */
2381    
2382     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2383    
2384 nigel 63 op_type = 0; /* Default single-char op codes */
2385     possessive_quantifier = FALSE; /* Default not possessive quantifier */
2386    
2387     /* Save start of previous item, in case we have to move it up to make space
2388     for an inserted OP_ONCE for the additional '+' extension. */
2389    
2390     tempcode = previous;
2391    
2392     /* If the next character is '+', we have a possessive quantifier. This
2393     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2394     If the next character is '?' this is a minimizing repeat, by default,
2395     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2396     repeat type to the non-default. */
2397    
2398     if (ptr[1] == '+')
2399     {
2400     repeat_type = 0; /* Force greedy */
2401     possessive_quantifier = TRUE;
2402     ptr++;
2403     }
2404     else if (ptr[1] == '?')
2405     {
2406     repeat_type = greedy_non_default;
2407     ptr++;
2408     }
2409 nigel 19 else repeat_type = greedy_default;
2410 nigel 3
2411 nigel 63 /* If previous was a recursion, we need to wrap it inside brackets so that
2412     it can be replicated if necessary. */
2413    
2414     if (*previous == OP_RECURSE)
2415     {
2416     memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2417     code += 1 + LINK_SIZE;
2418     *previous = OP_BRA;
2419     PUT(previous, 1, code - previous);
2420     *code = OP_KET;
2421     PUT(code, 1, code - previous);
2422     code += 1 + LINK_SIZE;
2423     }
2424    
2425 nigel 3 /* If previous was a string of characters, chop off the last one and use it
2426     as the subject of the repeat. If there was only one character, we can
2427 nigel 63 abolish the previous item altogether. If a one-char item has a minimum of
2428     more than one, ensure that it is set in reqbyte - it might not be if a
2429     sequence such as x{3} is the first thing in a branch because the x will
2430     have gone into firstbyte instead. */
2431 nigel 3
2432 nigel 37 if (*previous == OP_CHARS)
2433 nigel 3 {
2434 nigel 63 /* Deal with UTF-8 characters that take up more than one byte. It's
2435     easier to write this out separately than try to macrify it. Use c to
2436     hold the length of the character in bytes, plus 0x80 to flag that it's a
2437     length rather than a small character. */
2438 nigel 37
2439 nigel 63 #ifdef SUPPORT_UTF8
2440     if (utf8 && (code[-1] & 0x80) != 0)
2441 nigel 3 {
2442 nigel 63 uschar *lastchar = code - 1;
2443     while((*lastchar & 0xc0) == 0x80) lastchar--;
2444     c = code - lastchar; /* Length of UTF-8 character */
2445     memcpy(utf8_char, lastchar, c); /* Save the char */
2446     if (lastchar == previous + 2) /* There was only one character */
2447     {
2448     code = previous; /* Abolish the previous item */
2449     }
2450     else
2451     {
2452     previous[1] -= c; /* Adjust length of previous */
2453     code = lastchar; /* Lost char off the end */
2454     tempcode = code; /* Adjust position to be moved for '+' */
2455     }
2456     c |= 0x80; /* Flag c as a length */
2457 nigel 3 }
2458     else
2459 nigel 63 #endif
2460    
2461     /* Handle the case of a single byte - either with no UTF8 support, or
2462     with UTF-8 disabled, or for a UTF-8 character < 128. */
2463    
2464 nigel 3 {
2465 nigel 63 c = *(--code);
2466     if (code == previous + 2) /* There was only one character */
2467     {
2468     code = previous; /* Abolish the previous item */
2469 nigel 65 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2470 nigel 63 }
2471     else
2472     {
2473     previous[1]--; /* adjust length */
2474     tempcode = code; /* Adjust position to be moved for '+' */
2475     }
2476 nigel 3 }
2477 nigel 63
2478 nigel 3 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2479     }
2480    
2481     /* If previous was a single negated character ([^a] or similar), we use
2482     one of the special opcodes, replacing it. The code is shared with single-
2483 nigel 63 character repeats by setting op_type to add a suitable offset into
2484     repeat_type. OP_NOT is currently used only for single-byte chars. */
2485 nigel 3
2486 nigel 63 else if (*previous == OP_NOT)
2487 nigel 3 {
2488     op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2489     c = previous[1];
2490     code = previous;
2491     goto OUTPUT_SINGLE_REPEAT;
2492     }
2493    
2494     /* If previous was a character type match (\d or similar), abolish it and
2495     create a suitable repeat item. The code is shared with single-character
2496 nigel 63 repeats by setting op_type to add a suitable offset into repeat_type. */
2497 nigel 3
2498 nigel 63 else if (*previous < OP_EODN)
2499 nigel 3 {
2500     op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2501     c = *previous;
2502     code = previous;
2503    
2504     OUTPUT_SINGLE_REPEAT:
2505    
2506 nigel 37 /* If the maximum is zero then the minimum must also be zero; Perl allows
2507     this case, so we do too - by simply omitting the item altogether. */
2508    
2509     if (repeat_max == 0) goto END_REPEAT;
2510    
2511     /* Combine the op_type with the repeat_type */
2512    
2513     repeat_type += op_type;
2514    
2515 nigel 3 /* A minimum of zero is handled either as the special case * or ?, or as
2516     an UPTO, with the maximum given. */
2517    
2518     if (repeat_min == 0)
2519     {
2520     if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2521     else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2522     else
2523     {
2524     *code++ = OP_UPTO + repeat_type;
2525 nigel 63 PUT2INC(code, 0, repeat_max);
2526 nigel 3 }
2527     }
2528    
2529     /* The case {1,} is handled as the special case + */
2530    
2531     else if (repeat_min == 1 && repeat_max == -1)
2532     *code++ = OP_PLUS + repeat_type;
2533    
2534     /* The case {n,n} is just an EXACT, while the general case {n,m} is
2535     handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
2536    
2537     else
2538     {
2539     if (repeat_min != 1)
2540     {
2541     *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2542 nigel 63 PUT2INC(code, 0, repeat_min);
2543 nigel 3 }
2544    
2545     /* If the minimum is 1 and the previous item was a character string,
2546     we either have to put back the item that got cancelled if the string
2547     length was 1, or add the character back onto the end of a longer
2548 nigel 21 string. For a character type nothing need be done; it will just get
2549     put back naturally. Note that the final character is always going to
2550 nigel 63 get added below, so we leave code ready for its insertion. */
2551 nigel 3
2552     else if (*previous == OP_CHARS)
2553     {
2554 nigel 63 if (code == previous) code += 2; else
2555    
2556     /* In UTF-8 mode, a multibyte char has its length in c, with the 0x80
2557     bit set as a flag. The length will always be between 2 and 6. */
2558    
2559     #ifdef SUPPORT_UTF8
2560     if (utf8 && c >= 128) previous[1] += c & 7; else
2561     #endif
2562     previous[1]++;
2563 nigel 3 }
2564    
2565 nigel 21 /* For a single negated character we also have to put back the
2566 nigel 63 item that got cancelled. At present this applies only to single byte
2567     characters in any mode. */
2568 nigel 21
2569     else if (*previous == OP_NOT) code++;
2570    
2571 nigel 63 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2572     we have to insert the character for the previous code. In UTF-8 mode,
2573     long characters have their length in c, with the 0x80 bit as a flag. */
2574 nigel 3
2575 nigel 9 if (repeat_max < 0)
2576 nigel 3 {
2577 nigel 63 #ifdef SUPPORT_UTF8
2578     if (utf8 && c >= 128)
2579     {
2580     memcpy(code, utf8_char, c & 7);
2581     code += c & 7;
2582     }
2583     else
2584     #endif
2585 nigel 3 *code++ = c;
2586 nigel 9 *code++ = OP_STAR + repeat_type;
2587     }
2588    
2589 nigel 63 /* Else insert an UPTO if the max is greater than the min, again
2590     preceded by the character, for the previously inserted code. */
2591 nigel 9
2592     else if (repeat_max != repeat_min)
2593     {
2594 nigel 63 #ifdef SUPPORT_UTF8
2595     if (utf8 && c >= 128)
2596     {
2597     memcpy(code, utf8_char, c & 7);
2598     code += c & 7;
2599     }
2600     else
2601     #endif
2602 nigel 9 *code++ = c;
2603 nigel 3 repeat_max -= repeat_min;
2604     *code++ = OP_UPTO + repeat_type;
2605 nigel 63 PUT2INC(code, 0, repeat_max);
2606 nigel 3 }
2607     }
2608    
2609     /* The character or character type itself comes last in all cases. */
2610    
2611 nigel 63 #ifdef SUPPORT_UTF8
2612     if (utf8 && c >= 128)
2613     {
2614     memcpy(code, utf8_char, c & 7);
2615     code += c & 7;
2616     }
2617     else
2618     #endif
2619    
2620 nigel 3 *code++ = c;
2621     }
2622    
2623     /* If previous was a character class or a back reference, we put the repeat
2624 nigel 37 stuff after it, but just skip the item if the repeat was {0,0}. */
2625 nigel 3
2626 nigel 63 else if (*previous == OP_CLASS ||
2627     *previous == OP_NCLASS ||
2628     #ifdef SUPPORT_UTF8
2629     *previous == OP_XCLASS ||
2630     #endif
2631     *previous == OP_REF)
2632 nigel 3 {
2633 nigel 37 if (repeat_max == 0)
2634     {
2635     code = previous;
2636     goto END_REPEAT;
2637     }
2638 nigel 3 if (repeat_min == 0 && repeat_max == -1)
2639     *code++ = OP_CRSTAR + repeat_type;
2640     else if (repeat_min == 1 && repeat_max == -1)
2641     *code++ = OP_CRPLUS + repeat_type;
2642     else if (repeat_min == 0 && repeat_max == 1)
2643     *code++ = OP_CRQUERY + repeat_type;
2644     else
2645     {
2646     *code++ = OP_CRRANGE + repeat_type;
2647 nigel 63 PUT2INC(code, 0, repeat_min);
2648 nigel 3 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2649 nigel 63 PUT2INC(code, 0, repeat_max);
2650 nigel 3 }
2651     }
2652    
2653     /* If previous was a bracket group, we may have to replicate it in certain
2654 nigel 23 cases. */
2655 nigel 3
2656 nigel 63 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2657     *previous == OP_COND)
2658 nigel 3 {
2659 nigel 31 register int i;
2660     int ketoffset = 0;
2661 nigel 9 int len = code - previous;
2662 nigel 31 uschar *bralink = NULL;
2663 nigel 3
2664 nigel 23 /* If the maximum repeat count is unlimited, find the end of the bracket
2665     by scanning through from the start, and compute the offset back to it
2666     from the current code pointer. There may be an OP_OPT setting following
2667     the final KET, so we can't find the end just by going back from the code
2668     pointer. */
2669    
2670     if (repeat_max == -1)
2671 nigel 3 {
2672 nigel 23 register uschar *ket = previous;
2673 nigel 63 do ket += GET(ket, 1); while (*ket != OP_KET);
2674 nigel 23 ketoffset = code - ket;
2675 nigel 3 }
2676    
2677 nigel 31 /* The case of a zero minimum is special because of the need to stick
2678     OP_BRAZERO in front of it, and because the group appears once in the
2679     data, whereas in other cases it appears the minimum number of times. For
2680     this reason, it is simplest to treat this case separately, as otherwise
2681 nigel 53 the code gets far too messy. There are several special subcases when the
2682 nigel 31 minimum is zero. */
2683    
2684     if (repeat_min == 0)
2685     {
2686     /* If the maximum is also zero, we just omit the group from the output
2687     altogether. */
2688    
2689     if (repeat_max == 0)
2690     {
2691     code = previous;
2692 nigel 37 goto END_REPEAT;
2693 nigel 31 }
2694    
2695     /* If the maximum is 1 or unlimited, we just have to stick in the
2696 nigel 73 BRAZERO and do no more at this point. However, we do need to adjust
2697     any OP_RECURSE calls inside the group that refer to the group itself or
2698     any internal group, because the offset is from the start of the whole
2699     regex. Temporarily terminate the pattern while doing this. */
2700 nigel 31
2701     if (repeat_max <= 1)
2702     {
2703 nigel 73 *code = OP_END;
2704     adjust_recurse(previous, 1, utf8, cd);
2705 nigel 31 memmove(previous+1, previous, len);
2706     code++;
2707     *previous++ = OP_BRAZERO + repeat_type;
2708     }
2709    
2710     /* If the maximum is greater than 1 and limited, we have to replicate
2711     in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2712     The first one has to be handled carefully because it's the original
2713     copy, which has to be moved up. The remainder can be handled by code
2714 nigel 73 that is common with the non-zero minimum case below. We have to
2715     adjust the value of repeat_max, since one less copy is required. Once
2716     again, we may have to adjust any OP_RECURSE calls inside the group. */
2717 nigel 31
2718     else
2719     {
2720     int offset;
2721 nigel 73 *code = OP_END;
2722     adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2723 nigel 63 memmove(previous + 2 + LINK_SIZE, previous, len);
2724     code += 2 + LINK_SIZE;
2725 nigel 31 *previous++ = OP_BRAZERO + repeat_type;
2726     *previous++ = OP_BRA;
2727    
2728     /* We chain together the bracket offset fields that have to be
2729     filled in later when the ends of the brackets are reached. */
2730    
2731     offset = (bralink == NULL)? 0 : previous - bralink;
2732     bralink = previous;
2733 nigel 63 PUTINC(previous, 0, offset);
2734 nigel 31 }
2735    
2736     repeat_max--;
2737     }
2738    
2739     /* If the minimum is greater than zero, replicate the group as many
2740     times as necessary, and adjust the maximum to the number of subsequent
2741 nigel 63 copies that we need. If we set a first char from the group, and didn't
2742     set a required char, copy the latter from the former. */
2743 nigel 31
2744     else
2745     {
2746 nigel 63 if (repeat_min > 1)
2747 nigel 31 {
2748 nigel 63 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2749     for (i = 1; i < repeat_min; i++)
2750     {
2751     memcpy(code, previous, len);
2752     code += len;
2753     }
2754 nigel 31 }
2755     if (repeat_max > 0) repeat_max -= repeat_min;
2756     }
2757    
2758     /* This code is common to both the zero and non-zero minimum cases. If
2759     the maximum is limited, it replicates the group in a nested fashion,
2760     remembering the bracket starts on a stack. In the case of a zero minimum,
2761     the first one was set up above. In all cases the repeat_max now specifies
2762     the number of additional copies needed. */
2763    
2764     if (repeat_max >= 0)
2765     {
2766     for (i = repeat_max - 1; i >= 0; i--)
2767     {
2768     *code++ = OP_BRAZERO + repeat_type;
2769    
2770     /* All but the final copy start a new nesting, maintaining the
2771     chain of brackets outstanding. */
2772    
2773     if (i != 0)
2774     {
2775     int offset;
2776     *code++ = OP_BRA;
2777     offset = (bralink == NULL)? 0 : code - bralink;
2778     bralink = code;
2779 nigel 63 PUTINC(code, 0, offset);
2780 nigel 31 }
2781    
2782     memcpy(code, previous, len);
2783     code += len;
2784     }
2785    
2786     /* Now chain through the pending brackets, and fill in their length
2787     fields (which are holding the chain links pro tem). */
2788    
2789     while (bralink != NULL)
2790     {
2791     int oldlinkoffset;
2792     int offset = code - bralink + 1;
2793     uschar *bra = code - offset;
2794 nigel 63 oldlinkoffset = GET(bra, 1);
2795 nigel 31 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2796     *code++ = OP_KET;
2797 nigel 63 PUTINC(code, 0, offset);
2798     PUT(bra, 1, offset);
2799 nigel 31 }
2800     }
2801    
2802     /* If the maximum is unlimited, set a repeater in the final copy. We
2803     can't just offset backwards from the current code point, because we
2804     don't know if there's been an options resetting after the ket. The
2805     correct offset was computed above. */
2806    
2807     else code[-ketoffset] = OP_KETRMAX + repeat_type;
2808 nigel 3 }
2809    
2810     /* Else there's some kind of shambles */
2811    
2812     else
2813     {
2814     *errorptr = ERR11;
2815     goto FAILED;
2816     }
2817    
2818 nigel 63 /* If the character following a repeat is '+', we wrap the entire repeated
2819     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2820     Sun's Java package. The repeated item starts at tempcode, not at previous,
2821     which might be the first part of a string whose (former) last char we
2822     repeated. However, we don't support '+' after a greediness '?'. */
2823    
2824     if (possessive_quantifier)
2825     {
2826     int len = code - tempcode;
2827     memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2828     code += 1 + LINK_SIZE;
2829     len += 1 + LINK_SIZE;
2830     tempcode[0] = OP_ONCE;
2831     *code++ = OP_KET;
2832     PUTINC(code, 0, len);
2833     PUT(tempcode, 1, len);
2834     }
2835    
2836 nigel 65 /* In all cases we no longer have a previous item. We also set the
2837     "follows varying string" flag for subsequently encountered reqbytes if
2838     it isn't already set and we have just passed a varying length item. */
2839 nigel 3
2840 nigel 37 END_REPEAT:
2841 nigel 3 previous = NULL;
2842 nigel 65 cd->req_varyopt |= reqvary;
2843 nigel 3 break;
2844    
2845    
2846 nigel 23 /* Start of nested bracket sub-expression, or comment or lookahead or
2847     lookbehind or option setting or condition. First deal with special things
2848     that can come after a bracket; all are introduced by ?, and the appearance
2849     of any of them means that this is not a referencing group. They were
2850     checked for validity in the first pass over the string, so we don't have to
2851     check for syntax errors here. */
2852 nigel 3
2853     case '(':
2854 nigel 23 newoptions = options;
2855 nigel 53 skipbytes = 0;
2856 nigel 23
2857 nigel 3 if (*(++ptr) == '?')
2858     {
2859 nigel 23 int set, unset;
2860     int *optset;
2861 nigel 3
2862     switch (*(++ptr))
2863     {
2864 nigel 23 case '#': /* Comment; skip to ket */
2865 nigel 3 ptr++;
2866     while (*ptr != ')') ptr++;
2867     continue;
2868    
2869     case ':': /* Non-extracting bracket */
2870 nigel 23 bravalue = OP_BRA;
2871 nigel 3 ptr++;
2872     break;
2873    
2874 nigel 23 case '(':
2875     bravalue = OP_COND; /* Conditional group */
2876 nigel 63
2877     /* Condition to test for recursion */
2878    
2879     if (ptr[1] == 'R')
2880 nigel 23 {
2881 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2882     PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2883     skipbytes = 3;
2884     ptr += 3;
2885     }
2886    
2887 nigel 69 /* Condition to test for a numbered subpattern match. We know that
2888     if a digit follows ( then there will just be digits until ) because
2889     the syntax was checked in the first pass. */
2890 nigel 63
2891 nigel 69 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2892 nigel 63 {
2893 nigel 65 int condref; /* Don't amalgamate; some compilers */
2894     condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2895 nigel 23 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2896 nigel 51 if (condref == 0)
2897     {
2898     *errorptr = ERR35;
2899     goto FAILED;
2900     }
2901 nigel 23 ptr++;
2902 nigel 63 code[1+LINK_SIZE] = OP_CREF;
2903     PUT2(code, 2+LINK_SIZE, condref);
2904 nigel 53 skipbytes = 3;
2905 nigel 23 }
2906 nigel 63 /* For conditions that are assertions, we just fall through, having
2907     set bravalue above. */
2908 nigel 23 break;
2909    
2910     case '=': /* Positive lookahead */
2911 nigel 3 bravalue = OP_ASSERT;
2912     ptr++;
2913     break;
2914    
2915 nigel 23 case '!': /* Negative lookahead */
2916 nigel 3 bravalue = OP_ASSERT_NOT;
2917     ptr++;
2918     break;
2919    
2920 nigel 23 case '<': /* Lookbehinds */
2921     switch (*(++ptr))
2922 nigel 3 {
2923 nigel 23 case '=': /* Positive lookbehind */
2924     bravalue = OP_ASSERTBACK;
2925 nigel 3 ptr++;
2926     break;
2927 nigel 23
2928     case '!': /* Negative lookbehind */
2929     bravalue = OP_ASSERTBACK_NOT;
2930     ptr++;
2931     break;
2932 nigel 3 }
2933 nigel 23 break;
2934 nigel 3
2935 nigel 23 case '>': /* One-time brackets */
2936     bravalue = OP_ONCE;
2937     ptr++;
2938     break;
2939    
2940 nigel 63 case 'C': /* Callout - may be followed by digits */
2941     *code++ = OP_CALLOUT;
2942     {
2943     int n = 0;
2944 nigel 69 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2945 nigel 63 n = n * 10 + *ptr - '0';
2946     if (n > 255)
2947     {
2948     *errorptr = ERR38;
2949     goto FAILED;
2950     }
2951     *code++ = n;
2952     }
2953     previous = NULL;
2954     continue;
2955    
2956     case 'P': /* Named subpattern handling */
2957     if (*(++ptr) == '<') /* Definition */
2958     {
2959     int i, namelen;
2960     uschar *slot = cd->name_table;
2961 nigel 65 const uschar *name; /* Don't amalgamate; some compilers */
2962     name = ++ptr; /* grumble at autoincrement in declaration */
2963 nigel 63
2964     while (*ptr++ != '>');
2965     namelen = ptr - name - 1;
2966    
2967     for (i = 0; i < cd->names_found; i++)
2968     {
2969 nigel 67 int crc = memcmp(name, slot+2, namelen);
2970     if (crc == 0)
2971 nigel 63 {
2972 nigel 65 if (slot[2+namelen] == 0)
2973     {
2974     *errorptr = ERR43;
2975     goto FAILED;
2976     }
2977 nigel 67 crc = -1; /* Current name is substring */
2978 nigel 63 }
2979 nigel 67 if (crc < 0)
2980 nigel 63 {
2981     memmove(slot + cd->name_entry_size, slot,
2982     (cd->names_found - i) * cd->name_entry_size);
2983     break;
2984     }
2985     slot += cd->name_entry_size;
2986     }
2987    
2988     PUT2(slot, 0, *brackets + 1);
2989     memcpy(slot + 2, name, namelen);
2990     slot[2+namelen] = 0;
2991     cd->names_found++;
2992     goto NUMBERED_GROUP;
2993     }
2994    
2995     if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2996     {
2997     int i, namelen;
2998     int type = *ptr++;
2999     const uschar *name = ptr;
3000     uschar *slot = cd->name_table;
3001    
3002     while (*ptr != ')') ptr++;
3003     namelen = ptr - name;
3004    
3005     for (i = 0; i < cd->names_found; i++)
3006     {
3007 nigel 65 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3008 nigel 63 slot += cd->name_entry_size;
3009     }
3010     if (i >= cd->names_found)
3011     {
3012     *errorptr = ERR15;
3013     goto FAILED;
3014     }
3015    
3016     recno = GET2(slot, 0);
3017    
3018     if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3019    
3020     /* Back reference */
3021    
3022     previous = code;
3023     *code++ = OP_REF;
3024     PUT2INC(code, 0, recno);
3025     cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3026     if (recno > cd->top_backref) cd->top_backref = recno;
3027     continue;
3028     }
3029    
3030     /* Should never happen */
3031     break;
3032    
3033 nigel 43 case 'R': /* Pattern recursion */
3034 nigel 63 ptr++; /* Same as (?0) */
3035     /* Fall through */
3036    
3037     /* Recursion or "subroutine" call */
3038    
3039     case '0': case '1': case '2': case '3': case '4':
3040     case '5': case '6': case '7': case '8': case '9':
3041     {
3042     const uschar *called;
3043     recno = 0;
3044 nigel 69 while((digitab[*ptr] & ctype_digit) != 0)
3045 nigel 63 recno = recno * 10 + *ptr++ - '0';
3046    
3047     /* Come here from code above that handles a named recursion */
3048    
3049     HANDLE_RECURSION:
3050    
3051     previous = code;
3052    
3053     /* Find the bracket that is being referenced. Temporarily end the
3054     regex in case it doesn't exist. */
3055    
3056     *code = OP_END;
3057     called = (recno == 0)?
3058     cd->start_code : find_bracket(cd->start_code, utf8, recno);
3059    
3060     if (called == NULL)
3061     {
3062     *errorptr = ERR15;
3063     goto FAILED;
3064     }
3065    
3066     /* If the subpattern is still open, this is a recursive call. We
3067     check to see if this is a left recursion that could loop for ever,
3068     and diagnose that case. */
3069    
3070     if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3071     {
3072     *errorptr = ERR40;
3073     goto FAILED;
3074     }
3075    
3076     /* Insert the recursion/subroutine item */
3077    
3078     *code = OP_RECURSE;
3079     PUT(code, 1, called - cd->start_code);
3080     code += 1 + LINK_SIZE;
3081     }
3082 nigel 43 continue;
3083    
3084 nigel 63 /* Character after (? not specially recognized */
3085    
3086 nigel 23 default: /* Option setting */
3087     set = unset = 0;
3088     optset = &set;
3089    
3090     while (*ptr != ')' && *ptr != ':')
3091     {
3092     switch (*ptr++)
3093     {
3094     case '-': optset = &unset; break;
3095    
3096     case 'i': *optset |= PCRE_CASELESS; break;
3097     case 'm': *optset |= PCRE_MULTILINE; break;
3098     case 's': *optset |= PCRE_DOTALL; break;
3099     case 'x': *optset |= PCRE_EXTENDED; break;
3100     case 'U': *optset |= PCRE_UNGREEDY; break;
3101     case 'X': *optset |= PCRE_EXTRA; break;
3102     }
3103     }
3104    
3105     /* Set up the changed option bits, but don't change anything yet. */
3106    
3107     newoptions = (options | set) & (~unset);
3108    
3109     /* If the options ended with ')' this is not the start of a nested
3110 nigel 63 group with option changes, so the options change at this level. Compile
3111     code to change the ims options if this setting actually changes any of
3112     them. We also pass the new setting back so that it can be put at the
3113     start of any following branches, and when this group ends (if we are in
3114     a group), a resetting item can be compiled.
3115 nigel 23
3116 nigel 63 Note that if this item is right at the start of the pattern, the
3117     options will have been abstracted and made global, so there will be no
3118     change to compile. */
3119    
3120 nigel 23 if (*ptr == ')')
3121     {
3122 nigel 63 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3123 nigel 23 {
3124     *code++ = OP_OPT;
3125 nigel 63 *code++ = newoptions & PCRE_IMS;
3126 nigel 23 }
3127 nigel 63
3128     /* Change options at this level, and pass them back for use
3129     in subsequent branches. Reset the greedy defaults and the case
3130     value for firstbyte and reqbyte. */
3131    
3132     *optionsptr = options = newoptions;
3133     greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3134     greedy_non_default = greedy_default ^ 1;
3135     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3136    
3137 nigel 23 previous = NULL; /* This item can't be repeated */
3138     continue; /* It is complete */
3139     }
3140    
3141     /* If the options ended with ':' we are heading into a nested group
3142     with possible change of options. Such groups are non-capturing and are
3143     not assertions of any kind. All we need to do is skip over the ':';
3144     the newoptions value is handled below. */
3145    
3146     bravalue = OP_BRA;
3147     ptr++;
3148 nigel 3 }
3149     }
3150    
3151 nigel 63 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3152     non-capturing and behave like (?:...) brackets */
3153    
3154     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3155     {
3156     bravalue = OP_BRA;
3157     }
3158    
3159 nigel 53 /* Else we have a referencing group; adjust the opcode. If the bracket
3160     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3161     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3162 nigel 3
3163     else
3164     {
3165 nigel 63 NUMBERED_GROUP:
3166 nigel 53 if (++(*brackets) > EXTRACT_BASIC_MAX)
3167 nigel 3 {
3168 nigel 53 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3169 nigel 63 code[1+LINK_SIZE] = OP_BRANUMBER;
3170     PUT2(code, 2+LINK_SIZE, *brackets);
3171 nigel 53 skipbytes = 3;
3172 nigel 3 }
3173 nigel 53 else bravalue = OP_BRA + *brackets;
3174 nigel 3 }
3175    
3176 nigel 23 /* Process nested bracketed re. Assertions may not be repeated, but other
3177     kinds can be. We copy code into a non-register variable in order to be able
3178     to pass its address because some compilers complain otherwise. Pass in a
3179     new setting for the ims options if they have changed. */
3180 nigel 3
3181 nigel 23 previous = (bravalue >= OP_ONCE)? code : NULL;
3182 nigel 3 *code = bravalue;
3183 nigel 23 tempcode = code;
3184 nigel 65 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3185 nigel 23
3186     if (!compile_regex(
3187 nigel 63 newoptions, /* The complete new option state */
3188     options & PCRE_IMS, /* The previous ims option state */
3189 nigel 53 brackets, /* Extracting bracket count */
3190 nigel 23 &tempcode, /* Where to put code (updated) */
3191     &ptr, /* Input pointer (updated) */
3192     errorptr, /* Where to put an error message */
3193     (bravalue == OP_ASSERTBACK ||
3194     bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3195 nigel 53 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3196 nigel 63 &subfirstbyte, /* For possible first char */
3197     &subreqbyte, /* For possible last char */
3198     bcptr, /* Current branch chain */
3199 nigel 25 cd)) /* Tables block */
3200 nigel 23 goto FAILED;
3201    
3202     /* At the end of compiling, code is still pointing to the start of the
3203     group, while tempcode has been updated to point past the end of the group
3204     and any option resetting that may follow it. The pattern pointer (ptr)
3205     is on the bracket. */
3206    
3207     /* If this is a conditional bracket, check that there are no more than
3208     two branches in the group. */
3209    
3210 nigel 53 else if (bravalue == OP_COND)
3211 nigel 3 {
3212 nigel 23 uschar *tc = code;
3213 nigel 37 condcount = 0;
3214 nigel 23
3215     do {
3216 nigel 37 condcount++;
3217 nigel 63 tc += GET(tc,1);
3218 nigel 23 }
3219     while (*tc != OP_KET);
3220    
3221 nigel 37 if (condcount > 2)
3222 nigel 23 {
3223     *errorptr = ERR27;
3224 nigel 3 goto FAILED;
3225 nigel 23 }
3226 nigel 63
3227     /* If there is just one branch, we must not make use of its firstbyte or
3228     reqbyte, because this is equivalent to an empty second branch. */
3229    
3230     if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3231 nigel 3 }
3232    
3233 nigel 63 /* Handle updating of the required and first characters. Update for normal
3234     brackets of all kinds, and conditions with two branches (see code above).
3235     If the bracket is followed by a quantifier with zero repeat, we have to
3236     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3237     main loop so that they can be accessed for the back off. */
3238 nigel 37
3239 nigel 63 zeroreqbyte = reqbyte;
3240     zerofirstbyte = firstbyte;
3241     groupsetfirstbyte = FALSE;
3242    
3243     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3244 nigel 37 {
3245 nigel 63 /* If we have not yet set a firstbyte in this branch, take it from the
3246     subpattern, remembering that it was set here so that a repeat of more
3247     than one can replicate it as reqbyte if necessary. If the subpattern has
3248     no firstbyte, set "none" for the whole branch. In both cases, a zero
3249     repeat forces firstbyte to "none". */
3250    
3251     if (firstbyte == REQ_UNSET)
3252     {
3253     if (subfirstbyte >= 0)
3254     {
3255     firstbyte = subfirstbyte;
3256     groupsetfirstbyte = TRUE;
3257     }
3258     else firstbyte = REQ_NONE;
3259     zerofirstbyte = REQ_NONE;
3260     }
3261    
3262     /* If firstbyte was previously set, convert the subpattern's firstbyte
3263 nigel 65 into reqbyte if there wasn't one, using the vary flag that was in
3264     existence beforehand. */
3265 nigel 63
3266 nigel 65 else if (subfirstbyte >= 0 && subreqbyte < 0)
3267     subreqbyte = subfirstbyte | tempreqvary;
3268 nigel 63
3269 nigel 65 /* If the subpattern set a required byte (or set a first byte that isn't
3270     really the first byte - see above), set it. */
3271 nigel 63
3272     if (subreqbyte >= 0) reqbyte = subreqbyte;
3273 nigel 37 }
3274    
3275 nigel 63 /* For a forward assertion, we take the reqbyte, if set. This can be
3276     helpful if the pattern that follows the assertion doesn't set a different
3277     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3278     for an assertion, however, because it leads to incorrect effects for patterns
3279     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3280     of a firstbyte. This is overcome by a scan at the end if there's no
3281     firstbyte, looking for an asserted first char. */
3282    
3283     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3284    
3285 nigel 23 /* Now update the main code pointer to the end of the group. */
3286    
3287     code = tempcode;
3288    
3289     /* Error if hit end of pattern */
3290    
3291 nigel 3 if (*ptr != ')')
3292     {
3293     *errorptr = ERR14;
3294     goto FAILED;
3295     }
3296     break;
3297    
3298     /* Check \ for being a real metacharacter; if not, fall through and handle
3299     it as a data character at the start of a string. Escape items are checked
3300     for validity in the pre-compiling pass. */
3301    
3302     case '\\':
3303 nigel 23 tempptr = ptr;
3304 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3305 nigel 3
3306     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3307     are arranged to be the negation of the corresponding OP_values. For the
3308     back references, the values are ESC_REF plus the reference number. Only
3309     back references and those types that consume a character may be repeated.
3310     We can test for values between ESC_b and ESC_Z for the latter; this may
3311     have to change if any new ones are ever created. */
3312    
3313     if (c < 0)
3314     {
3315 nigel 63 if (-c == ESC_Q) /* Handle start of quoted string */
3316     {
3317     if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3318     else inescq = TRUE;
3319     continue;
3320     }
3321    
3322     /* For metasequences that actually match a character, we disable the
3323     setting of a first character if it hasn't already been set. */
3324    
3325     if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3326     firstbyte = REQ_NONE;
3327    
3328     /* Set values to reset to if this is followed by a zero repeat. */
3329    
3330     zerofirstbyte = firstbyte;
3331     zeroreqbyte = reqbyte;
3332    
3333     /* Back references are handled specially */
3334    
3335 nigel 3 if (-c >= ESC_REF)
3336     {
3337 nigel 53 int number = -c - ESC_REF;
3338 nigel 3 previous = code;
3339     *code++ = OP_REF;
3340 nigel 63 PUT2INC(code, 0, number);
3341 nigel 3 }
3342     else
3343     {
3344 nigel 23 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3345 nigel 3 *code++ = -c;
3346     }
3347     continue;
3348     }
3349    
3350 nigel 7 /* Data character: reset and fall through */
3351 nigel 3
3352 nigel 23 ptr = tempptr;
3353 nigel 3 c = '\\';
3354    
3355     /* Handle a run of data characters until a metacharacter is encountered.
3356     The first character is guaranteed not to be whitespace or # when the
3357     extended flag is set. */
3358    
3359     NORMAL_CHAR:
3360     default:
3361     previous = code;
3362     *code = OP_CHARS;
3363     code += 2;
3364     length = 0;
3365    
3366     do
3367     {
3368 nigel 63 /* If in \Q...\E, check for the end; if not, we always have a literal */
3369    
3370     if (inescq)
3371     {
3372     if (c == '\\' && ptr[1] == 'E')
3373     {
3374     inescq = FALSE;
3375     ptr++;
3376     }
3377     else
3378     {
3379     *code++ = c;
3380     length++;
3381     }
3382     continue;
3383     }
3384    
3385     /* Skip white space and comments for /x patterns */
3386    
3387 nigel 3 if ((options & PCRE_EXTENDED) != 0)
3388     {
3389 nigel 25 if ((cd->ctypes[c] & ctype_space) != 0) continue;
3390 nigel 3 if (c == '#')
3391     {
3392 nigel 47 /* The space before the ; is to avoid a warning on a silly compiler
3393     on the Macintosh. */
3394 nigel 53 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3395 nigel 3 if (c == 0) break;
3396     continue;
3397     }
3398     }
3399    
3400     /* Backslash may introduce a data char or a metacharacter. Escaped items
3401     are checked for validity in the pre-compiling pass. Stop the string
3402     before a metaitem. */
3403    
3404     if (c == '\\')
3405     {
3406 nigel 23 tempptr = ptr;
3407 nigel 71 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3408 nigel 23 if (c < 0) { ptr = tempptr; break; }
3409 nigel 49
3410     /* If a character is > 127 in UTF-8 mode, we have to turn it into
3411 nigel 73 two or more bytes in the UTF-8 encoding. */
3412 nigel 49
3413     #ifdef SUPPORT_UTF8
3414 nigel 63 if (utf8 && c > 127)
3415 nigel 49 {
3416     uschar buffer[8];
3417     int len = ord2utf8(c, buffer);
3418     for (c = 0; c < len; c++) *code++ = buffer[c];
3419     length += len;
3420     continue;
3421     }
3422     #endif
3423 nigel 3 }
3424    
3425     /* Ordinary character or single-char escape */
3426    
3427     *code++ = c;
3428     length++;
3429     }
3430    
3431     /* This "while" is the end of the "do" above. */
3432    
3433 nigel 49 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
3434 nigel 3
3435 nigel 63 /* Update the first and last requirements. These are always bytes, even in
3436     UTF-8 mode. However, there is a special case to be considered when there
3437     are only one or two characters. Because this gets messy in UTF-8 mode, the
3438     code is kept separate. When we get here "length" contains the number of
3439     bytes. */
3440 nigel 37
3441 nigel 63 #ifdef SUPPORT_UTF8
3442     if (utf8 && length > 1)
3443     {
3444     uschar *t = previous + 3; /* After this code, t */
3445     while (t < code && (*t & 0xc0) == 0x80) t++; /* follows the 1st char */
3446 nigel 37
3447 nigel 63 /* Handle the case when there is only one multibyte character. It must
3448     have at least two bytes because of the "length > 1" test above. */
3449 nigel 3
3450 nigel 63 if (t == code)
3451     {
3452     /* If no previous first byte, set it from this character, but revert to
3453     none on a zero repeat. */
3454    
3455     if (firstbyte == REQ_UNSET)
3456     {
3457     zerofirstbyte = REQ_NONE;
3458     firstbyte = previous[2];
3459     }
3460    
3461     /* Otherwise, leave the first byte value alone, and don't change it on
3462     a zero repeat */
3463    
3464     else zerofirstbyte = firstbyte;
3465    
3466     /* In both cases, a zero repeat resets the previous required byte */
3467    
3468     zeroreqbyte = reqbyte;
3469     }
3470    
3471     /* Handle the case when there is more than one character. These may be
3472     single-byte or multibyte characters */
3473    
3474     else
3475     {
3476 nigel 67 t = code - 1; /* After this code, t is at the */
3477 nigel 63 while ((*t & 0xc0) == 0x80) t--; /* start of the last character */
3478    
3479     /* If no previous first byte, set it from the first character, and
3480     retain it on a zero repeat (of the last character). The required byte
3481     is reset on a zero repeat to the byte before the last
3482     character, unless this is the first byte of the string. In that case,
3483     it reverts to its previous value. */
3484    
3485     if (firstbyte == REQ_UNSET)
3486     {
3487     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3488 nigel 65 zeroreqbyte = (t - 1 == previous + 2)?
3489     reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
3490 nigel 63 }
3491    
3492     /* If there was a previous first byte, leave it alone, and don't change
3493     it on a zero repeat. The required byte is reset on a zero repeat to the
3494     byte before the last character. */
3495    
3496     else
3497     {
3498     zerofirstbyte = firstbyte;
3499 nigel 65 zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
3500 nigel 63 }
3501     }
3502    
3503     /* In all cases (we know length > 1), the new required byte is the last
3504     byte of the string. */
3505    
3506 nigel 65 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3507 nigel 63 }
3508    
3509     else /* End of UTF-8 coding */
3510     #endif
3511    
3512     /* This is the code for non-UTF-8 operation, either without UTF-8 support,
3513     or when UTF-8 is not enabled. */
3514    
3515     {
3516     /* firstbyte was not previously set; take it from this string */
3517    
3518     if (firstbyte == REQ_UNSET)
3519     {
3520     if (length == 1)
3521     {
3522     zerofirstbyte = REQ_NONE;
3523     firstbyte = previous[2] | req_caseopt;
3524     zeroreqbyte = reqbyte;
3525     }
3526     else
3527     {
3528     zerofirstbyte = firstbyte = previous[2] | req_caseopt;
3529 nigel 65 zeroreqbyte = (length > 2)?
3530     (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
3531     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3532 nigel 63 }
3533     }
3534    
3535     /* firstbyte was previously set */
3536    
3537     else
3538     {
3539     zerofirstbyte = firstbyte;
3540 nigel 65 zeroreqbyte = (length == 1)? reqbyte :
3541     code[-2] | req_caseopt | cd->req_varyopt;
3542     reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3543 nigel 63 }
3544     }
3545    
3546     /* Set the length in the data vector, and advance to the next state. */
3547    
3548 nigel 3 previous[1] = length;
3549 nigel 49 if (length < MAXLIT) ptr--;
3550 nigel 3 break;
3551     }
3552     } /* end of big loop */
3553    
3554     /* Control never reaches here by falling through, only by a goto for all the
3555     error states. Pass back the position in the pattern so that it can be displayed
3556     to the user for diagnosing the error. */
3557    
3558     FAILED:
3559     *ptrptr = ptr;
3560     return FALSE;
3561     }
3562    
3563    
3564    
3565    
3566     /*************************************************
3567     * Compile sequence of alternatives *
3568     *************************************************/
3569    
3570     /* On entry, ptr is pointing past the bracket character, but on return
3571     it points to the closing bracket, or vertical bar, or end of string.
3572     The code variable is pointing at the byte into which the BRA operator has been
3573 nigel 23 stored. If the ims options are changed at the start (for a (?ims: group) or
3574     during any branch, we need to insert an OP_OPT item at the start of every
3575     following branch to ensure they get set correctly at run time, and also pass
3576     the new options into every subsequent branch compile.
3577 nigel 3
3578     Argument:
3579 nigel 63 options option bits, including any changes for this subpattern
3580     oldims previous settings of ims option bits
3581     brackets -> int containing the number of extracting brackets used
3582     codeptr -> the address of the current code pointer
3583     ptrptr -> the address of the current pattern pointer
3584     errorptr -> pointer to error message
3585     lookbehind TRUE if this is a lookbehind assertion
3586     skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3587     firstbyteptr place to put the first required character, or a negative number
3588     reqbyteptr place to put the last required character, or a negative number
3589     bcptr pointer to the chain of currently open branches
3590     cd points to the data block with tables pointers etc.
3591 nigel 3
3592 nigel 23 Returns: TRUE on success
3593 nigel 3 */
3594    
3595     static BOOL
3596 nigel 63 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3597 nigel 53 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3598 nigel 63 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3599 nigel 3 {
3600 nigel 7 const uschar *ptr = *ptrptr;
3601 nigel 3 uschar *code = *codeptr;
3602 nigel 23 uschar *last_branch = code;
3603 nigel 3 uschar *start_bracket = code;
3604 nigel 23 uschar *reverse_count = NULL;
3605 nigel 63 int firstbyte, reqbyte;
3606     int branchfirstbyte, branchreqbyte;
3607     branch_chain bc;
3608 nigel 3
3609 nigel 63 bc.outer = bcptr;
3610     bc.current = code;
3611 nigel 23
3612 nigel 63 firstbyte = reqbyte = REQ_UNSET;
3613    
3614     /* Offset is set zero to mark that this bracket is still open */
3615    
3616     PUT(code, 1, 0);
3617     code += 1 + LINK_SIZE + skipbytes;
3618    
3619 nigel 23 /* Loop for each alternative branch */
3620    
3621 nigel 3 for (;;)
3622     {
3623 nigel 63 /* Handle a change of ims options at the start of the branch */
3624 nigel 3
3625 nigel 63 if ((options & PCRE_IMS) != oldims)
3626 nigel 3 {
3627 nigel 23 *code++ = OP_OPT;
3628 nigel 63 *code++ = options & PCRE_IMS;
3629 nigel 23 }
3630    
3631     /* Set up dummy OP_REVERSE if lookbehind assertion */
3632    
3633     if (lookbehind)
3634     {
3635     *code++ = OP_REVERSE;
3636     reverse_count = code;
3637 nigel 63 PUTINC(code, 0, 0);
3638 nigel 23 }
3639    
3640     /* Now compile the branch */
3641    
3642 nigel 63 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
3643     &branchfirstbyte, &branchreqbyte, &bc, cd))
3644 nigel 23 {
3645 nigel 3 *ptrptr = ptr;
3646     return FALSE;
3647     }
3648    
3649 nigel 63 /* If this is the first branch, the firstbyte and reqbyte values for the
3650     branch become the values for the regex. */
3651 nigel 3
3652 nigel 63 if (*last_branch != OP_ALT)
3653     {
3654     firstbyte = branchfirstbyte;
3655     reqbyte = branchreqbyte;
3656     }
3657 nigel 3
3658 nigel 63 /* If this is not the first branch, the first char and reqbyte have to
3659 nigel 65 match the values from all the previous branches, except that if the previous
3660     value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3661     REQ_VARY for the regex. */
3662 nigel 37
3663 nigel 63 else
3664 nigel 37 {
3665 nigel 63 /* If we previously had a firstbyte, but it doesn't match the new branch,
3666     we have to abandon the firstbyte for the regex, but if there was previously
3667     no reqbyte, it takes on the value of the old firstbyte. */
3668    
3669     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3670 nigel 37 {
3671 nigel 63 if (reqbyte < 0) reqbyte = firstbyte;
3672     firstbyte = REQ_NONE;
3673 nigel 37 }
3674    
3675 nigel 63 /* If we (now or from before) have no firstbyte, a firstbyte from the
3676     branch becomes a reqbyte if there isn't a branch reqbyte. */
3677 nigel 37
3678 nigel 63 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3679     branchreqbyte = branchfirstbyte;
3680 nigel 37
3681 nigel 63 /* Now ensure that the reqbytes match */
3682    
3683 nigel 65 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3684     reqbyte = REQ_NONE;
3685     else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3686 nigel 63 }
3687    
3688 nigel 23 /* If lookbehind, check that this branch matches a fixed-length string,
3689     and put the length into the OP_REVERSE item. Temporarily mark the end of
3690     the branch with OP_END. */
3691    
3692     if (lookbehind)
3693     {
3694 nigel 63 int length;
3695 nigel 23 *code = OP_END;
3696 nigel 49 length = find_fixedlength(last_branch, options);
3697 nigel 23 DPRINTF(("fixed length = %d\n", length));
3698     if (length < 0)
3699     {
3700 nigel 63 *errorptr = (length == -2)? ERR36 : ERR25;
3701 nigel 23 *ptrptr = ptr;
3702     return FALSE;
3703     }
3704 nigel 63 PUT(reverse_count, 0, length);
3705 nigel 23 }
3706    
3707 nigel 63 /* Reached end of expression, either ')' or end of pattern. Go back through
3708     the alternative branches and reverse the chain of offsets, with the field in
3709     the BRA item now becoming an offset to the first alternative. If there are
3710     no alternatives, it points to the end of the group. The length in the
3711     terminating ket is always the length of the whole bracketed item. If any of
3712     the ims options were changed inside the group, compile a resetting op-code
3713     following, except at the very end of the pattern. Return leaving the pointer
3714     at the terminating char. */
3715 nigel 3
3716     if (*ptr != '|')
3717     {
3718 nigel 63 int length = code - last_branch;
3719     do
3720 nigel 23 {
3721 nigel 63 int prev_length = GET(last_branch, 1);
3722     PUT(last_branch, 1, length);
3723     length = prev_length;
3724     last_branch -= length;
3725 nigel 23 }
3726 nigel 63 while (length > 0);
3727 nigel 3
3728 nigel 63 /* Fill in the ket */
3729 nigel 3
3730 nigel 63 *code = OP_KET;
3731     PUT(code, 1, code - start_bracket);
3732     code += 1 + LINK_SIZE;
3733 nigel 3
3734 nigel 63 /* Resetting option if needed */
3735 nigel 3
3736 nigel 63 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3737 nigel 23 {
3738 nigel 63 *code++ = OP_OPT;
3739     *code++ = oldims;
3740 nigel 23 }
3741    
3742 nigel 63 /* Set values to pass back */
3743 nigel 23
3744 nigel 63 *codeptr = code;
3745     *ptrptr = ptr;
3746     *firstbyteptr = firstbyte;
3747     *reqbyteptr = reqbyte;
3748     return TRUE;
3749     }
3750 nigel 35
3751 nigel 63 /* Another branch follows; insert an "or" node. Its length field points back
3752     to the previous branch while the bracket remains open. At the end the chain
3753     is reversed. It's done like this so that the start of the bracket has a
3754     zero offset until it is closed, making it possible to detect recursion. */
3755 nigel 23
3756 nigel 63 *code = OP_ALT;
3757     PUT(code, 1, code - last_branch);
3758     bc.current = last_branch = code;
3759     code += 1 + LINK_SIZE;
3760     ptr++;
3761 nigel 23 }
3762     /* Control never reaches here */
3763     }
3764    
3765    
3766    
3767    
3768     /*************************************************
3769 nigel 3 * Check for anchored expression *
3770     *************************************************/
3771    
3772     /* Try to find out if this is an anchored regular expression. Consider each
3773     alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3774     all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3775     it's anchored. However, if this is a multiline pattern, then only OP_SOD
3776     counts, since OP_CIRC can match in the middle.
3777    
3778 nigel 63 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3779     This is the code for \G, which means "match at start of match position, taking
3780     into account the match offset".
3781    
3782 nigel 33 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3783     because that will try the rest of the pattern at all possible matching points,
3784 nigel 63 so there is no point trying again.... er ....
3785 nigel 3
3786 nigel 63 .... except when the .* appears inside capturing parentheses, and there is a
3787     subsequent back reference to those parentheses. We haven't enough information
3788     to catch that case precisely.
3789    
3790     At first, the best we could do was to detect when .* was in capturing brackets
3791     and the highest back reference was greater than or equal to that level.
3792     However, by keeping a bitmap of the first 31 back references, we can catch some
3793     of the more common cases more precisely.
3794    
3795 nigel 23 Arguments:
3796 nigel 63 code points to start of expression (the bracket)
3797     options points to the options setting
3798     bracket_map a bitmap of which brackets we are inside while testing; this
3799     handles up to substring 31; after that we just have to take
3800     the less precise approach
3801     backref_map the back reference bitmap
3802 nigel 23
3803     Returns: TRUE or FALSE
3804 nigel 3 */
3805    
3806     static BOOL
3807 nigel 63 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3808     unsigned int backref_map)
3809 nigel 3 {
3810     do {
3811 nigel 63 const uschar *scode =
3812     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE);
3813 nigel 23 register int op = *scode;
3814 nigel 63
3815     /* Capturing brackets */
3816    
3817     if (op > OP_BRA)
3818     {
3819     int new_map;
3820     op -= OP_BRA;
3821     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3822     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3823     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3824     }
3825    
3826     /* Other brackets */
3827    
3828     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3829     {
3830     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3831     }
3832    
3833     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3834     are or may be referenced. */
3835    
3836 nigel 33 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3837     (*options & PCRE_DOTALL) != 0)
3838 nigel 63 {
3839     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3840     }
3841    
3842     /* Check for explicit anchoring */
3843    
3844     else if (op != OP_SOD && op != OP_SOM &&
3845 nigel 23 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
3846     return FALSE;
3847 nigel 63 code += GET(code, 1);
3848 nigel 3 }
3849 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3850 nigel 3 return TRUE;
3851     }
3852    
3853    
3854    
3855     /*************************************************
3856 nigel 33 * Check for starting with ^ or .* *
3857 nigel 3 *************************************************/
3858    
3859 nigel 33 /* This is called to find out if every branch starts with ^ or .* so that
3860     "first char" processing can be done to speed things up in multiline
3861     matching and for non-DOTALL patterns that start with .* (which must start at
3862 nigel 63 the beginning or after \n). As in the case of is_anchored() (see above), we
3863     have to take account of back references to capturing brackets that contain .*
3864     because in that case we can't make the assumption.
3865 nigel 3
3866 nigel 63 Arguments:
3867     code points to start of expression (the bracket)
3868     bracket_map a bitmap of which brackets we are inside while testing; this
3869     handles up to substring 31; after that we just have to take
3870     the less precise approach
3871     backref_map the back reference bitmap
3872    
3873     Returns: TRUE or FALSE
3874 nigel 3 */
3875    
3876     static BOOL
3877 nigel 63 is_startline(const uschar *code, unsigned int bracket_map,
3878     unsigned int backref_map)
3879 nigel 3 {
3880     do {
3881 nigel 63 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0);
3882 nigel 23 register int op = *scode;
3883 nigel 63
3884     /* Capturing brackets */
3885    
3886     if (op > OP_BRA)
3887     {
3888     int new_map;
3889     op -= OP_BRA;
3890     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
3891     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3892     if (!is_startline(scode, new_map, backref_map)) return FALSE;
3893     }
3894    
3895     /* Other brackets */
3896    
3897     else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3898     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3899    
3900     /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3901     may be referenced. */
3902    
3903 nigel 33 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3904 nigel 63 {
3905     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3906     }
3907    
3908     /* Check for explicit circumflex */
3909    
3910 nigel 23 else if (op != OP_CIRC) return FALSE;
3911 nigel 63 code += GET(code, 1);
3912 nigel 3 }
3913 nigel 63 while (*code == OP_ALT); /* Loop for each alternative */
3914 nigel 3 return TRUE;
3915     }
3916    
3917    
3918    
3919     /*************************************************
3920 nigel 63 * Check for asserted fixed first char *
3921 nigel 3 *************************************************/
3922    
3923 nigel 63 /* During compilation, the "first char" settings from forward assertions are
3924     discarded, because they can cause conflicts with actual literals that follow.
3925     However, if we end up without a first char setting for an unanchored pattern,
3926     it is worth scanning the regex to see if there is an initial asserted first
3927     char. If all branches start with the same asserted char, or with a bracket all
3928     of whose alternatives start with the same asserted char (recurse ad lib), then
3929     we return that char, otherwise -1.
3930 nigel 3
3931 nigel 23 Arguments:
3932     code points to start of expression (the bracket)
3933     options pointer to the options (used to check casing changes)
3934 nigel 63 inassert TRUE if in an assertion
3935 nigel 23
3936     Returns: -1 or the fixed first char
3937 nigel 3 */
3938    
3939     static int
3940 nigel 63 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3941 nigel 3 {
3942     register int c = -1;
3943 nigel 23 do {
3944     int d;
3945 nigel 63 const uschar *scode =
3946     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS);
3947 nigel 23 register int op = *scode;
3948 nigel 3
3949 nigel 23 if (op >= OP_BRA) op = OP_BRA;
3950 nigel 3
3951 nigel 23 switch(op)
3952     {
3953     default:
3954     return -1;
3955 nigel 3
3956 nigel 23 case OP_BRA:
3957     case OP_ASSERT:
3958     case OP_ONCE:
3959     case OP_COND:
3960 nigel 63 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3961     return -1;
3962 nigel 23 if (c < 0) c = d; else if (c != d) return -1;
3963     break;
3964 nigel 3
3965 nigel 23 case OP_EXACT: /* Fall through */
3966     scode++;
3967 nigel 3
3968 nigel 23 case OP_CHARS: /* Fall through */
3969     scode++;
3970    
3971     case OP_PLUS:
3972     case OP_MINPLUS:
3973 nigel 63 if (!inassert) return -1;
3974     if (c < 0)
3975     {
3976     c = scode[1];
3977     if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
3978     }
3979     else if (c != scode[1]) return -1;
3980 nigel 23 break;
3981     }
3982    
3983 nigel 63 code += GET(code, 1);
3984 nigel 23 }
3985 nigel 3 while (*code == OP_ALT);
3986     return c;
3987     }
3988    
3989    
3990    
3991 nigel 23
3992 nigel 71 #ifdef SUPPORT_UTF8
3993 nigel 3 /*************************************************
3994 nigel 71 * Validate a UTF-8 string *
3995     *************************************************/
3996    
3997     /* This function is called (optionally) at the start of compile or match, to
3998     validate that a supposed UTF-8 string is actually valid. The early check means
3999     that subsequent code can assume it is dealing with a valid string. The check
4000     can be turned off for maximum performance, but the consequences of supplying
4001     an invalid string are then undefined.
4002    
4003     Arguments:
4004     string points to the string
4005     length length of string, or -1 if the string is zero-terminated
4006    
4007     Returns: < 0 if the string is a valid UTF-8 string
4008     >= 0 otherwise; the value is the offset of the bad byte
4009     */
4010    
4011     static int
4012     valid_utf8(const uschar *string, int length)
4013     {
4014     register const uschar *p;
4015    
4016