/[pcre]/code/branches/pcre16/pcre_compile.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 757 - (hide annotations) (download)
Mon Nov 21 11:44:55 2011 UTC (2 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 253363 byte(s)
More 16-bit patches

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 475 /* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57     also used by pcretest. PCRE_DEBUG is not defined when building a production
58     library. */
59 nigel 85
60 ph10 475 #ifdef PCRE_DEBUG
61 nigel 85 #include "pcre_printint.src"
62     #endif
63    
64    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a (eight bits per byte, lowest bit first). Both arguments and
the whole expansion are parenthesized, so that expression arguments such as
SETBIT(map, c + 8) or SETBIT(base + 2, c) address the intended bit. Note that
b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68    
69 ph10 202 /* Maximum length value to check against when making sure that the integer that
70     holds the compiled pattern length does not overflow. We make it a bit less than
71     INT_MAX to allow for adding in group terminating bytes, so that we don't have
72     to check them every time. */
73 ph10 178
74 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
75    
76    
77 nigel 77 /*************************************************
78     * Code parameters and static tables *
79     *************************************************/
80    
81 nigel 93 /* This value specifies the size of stack workspace that is used during the
82     first pre-compile phase that determines how much memory is required. The regex
83     is partly compiled into this space, but the compiled parts are discarded as
84     soon as they can be, so that hopefully there will never be an overrun. The code
85     does, however, check for an overrun. The largest amount I've seen used is 218,
86     so this number is very generous.
87 nigel 77
88 nigel 93 The same workspace is used during the second, actual compile phase for
89     remembering forward references to groups so that they can be filled in at the
90     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91     is 4 there is plenty of room. */
92 nigel 77
93 nigel 93 #define COMPILE_WORK_SIZE (4096)
94 nigel 77
95 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
96 ph10 505 overrun before it actually does run off the end of the data block. */
97 nigel 93
98 ph10 505 #define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99    
100    
101 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
102     are simple data values; negative values are for special things like \d and so
103     on. Zero means further processing is needed (for things like \x), or the escape
104     is invalid. */
105    
106 ph10 391 #ifndef EBCDIC
107    
108     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
109 ph10 392 in UTF-8 mode. */
110 ph10 391
111 ph10 392 static const short int escapes[] = {
112 ph10 391 0, 0,
113     0, 0,
114 ph10 392 0, 0,
115     0, 0,
116     0, 0,
117 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
118 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
119 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
120 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
121     -ESC_B, -ESC_C,
122     -ESC_D, -ESC_E,
123     0, -ESC_G,
124     -ESC_H, 0,
125     0, -ESC_K,
126 ph10 391 0, 0,
127 ph10 514 -ESC_N, 0,
128 ph10 391 -ESC_P, -ESC_Q,
129     -ESC_R, -ESC_S,
130 ph10 392 0, 0,
131     -ESC_V, -ESC_W,
132     -ESC_X, 0,
133     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
134 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
135 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
136 ph10 391 CHAR_GRAVE_ACCENT, 7,
137 ph10 392 -ESC_b, 0,
138     -ESC_d, ESC_e,
139 ph10 391 ESC_f, 0,
140     -ESC_h, 0,
141 ph10 392 0, -ESC_k,
142 ph10 391 0, 0,
143     ESC_n, 0,
144 ph10 392 -ESC_p, 0,
145     ESC_r, -ESC_s,
146 ph10 391 ESC_tee, 0,
147 ph10 392 -ESC_v, -ESC_w,
148     0, 0,
149 ph10 391 -ESC_z
150 nigel 77 };
151    
152 ph10 392 #else
153 ph10 391
154     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155    
156 nigel 77 static const short int escapes[] = {
157     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
158     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
159     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
160     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
161     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
162     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
163     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
164     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
165 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
166 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
167 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
168 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
169 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
170     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
171     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
172     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
173 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
174 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
175 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
176 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
177 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
178     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
179     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
180     };
181     #endif
182    
183    
184 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185     searched linearly. Put all the names into a single string, in order to reduce
186 ph10 392 the number of relocations when a shared library is dynamically linked. The
187     string is built from string macros so that it works in UTF-8 mode on EBCDIC
188 ph10 391 platforms. */
189 ph10 210
190     typedef struct verbitem {
191 ph10 510 int len; /* Length of verb name */
192     int op; /* Op when no arg, or -1 if arg mandatory */
193     int op_arg; /* Op when arg present, or -1 if not allowed */
194 ph10 211 } verbitem;
195 ph10 210
196 ph10 240 static const char verbnames[] =
197 ph10 510 "\0" /* Empty name is a shorthand for MARK */
198 ph10 512 STRING_MARK0
199 ph10 391 STRING_ACCEPT0
200     STRING_COMMIT0
201     STRING_F0
202     STRING_FAIL0
203     STRING_PRUNE0
204     STRING_SKIP0
205     STRING_THEN;
206 ph10 240
207 ph10 327 static const verbitem verbs[] = {
208 ph10 510 { 0, -1, OP_MARK },
209 ph10 512 { 4, -1, OP_MARK },
210 ph10 510 { 6, OP_ACCEPT, -1 },
211     { 6, OP_COMMIT, -1 },
212     { 1, OP_FAIL, -1 },
213     { 4, OP_FAIL, -1 },
214     { 5, OP_PRUNE, OP_PRUNE_ARG },
215     { 4, OP_SKIP, OP_SKIP_ARG },
216     { 4, OP_THEN, OP_THEN_ARG }
217 ph10 210 };
218    
219 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220 ph10 210
221    
222 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
223     now all in a single string, to reduce the number of relocations when a shared
224 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
225     length entry. The first three must be alpha, lower, upper, as this is assumed
226     for handling case independence. */
227 nigel 77
228 ph10 240 static const char posix_names[] =
229 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232     STRING_word0 STRING_xdigit;
233 nigel 77
234 ph10 756 static const pcre_uint8 posix_name_lengths[] = {
235 nigel 77 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236    
237 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
238     base map, with an optional addition or removal of another map. Then, for some
239     classes, there is some additional tweaking: for [:blank:] the vertical space
240     characters are removed, and for [:alpha:] and [:alnum:] the underscore
241     character is removed. The triples in the table consist of the base map offset,
242     second map offset or -1 if no second map, and a non-negative value for map
243     addition or a negative value for map subtraction (if there are two maps). The
244     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245     remove vertical space characters, 2 => remove underscore. */
246 nigel 77
247     static const int posix_class_maps[] = {
248 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
249     cbit_lower, -1, 0, /* lower */
250     cbit_upper, -1, 0, /* upper */
251     cbit_word, -1, 2, /* alnum - word without underscore */
252     cbit_print, cbit_cntrl, 0, /* ascii */
253     cbit_space, -1, 1, /* blank - a GNU extension */
254     cbit_cntrl, -1, 0, /* cntrl */
255     cbit_digit, -1, 0, /* digit */
256     cbit_graph, -1, 0, /* graph */
257     cbit_print, -1, 0, /* print */
258     cbit_punct, -1, 0, /* punct */
259     cbit_space, -1, 0, /* space */
260     cbit_word, -1, 0, /* word - a Perl extension */
261     cbit_xdigit,-1, 0 /* xdigit */
262 nigel 77 };
263    
264 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265     substitutes must be in the order of the names, defined above, and there are
266 ph10 518 both positive and negative cases. NULL means no substitute. */
267 nigel 77
268 ph10 518 #ifdef SUPPORT_UCP
269 ph10 756 static const pcre_uchar literal_PNd[] = { '\\', 'P', '{', 'N', 'd', '}', '\0' };
270     static const pcre_uchar literal_pNd[] = { '\\', 'p', '{', 'N', 'd', '}', '\0' };
271     static const pcre_uchar literal_PXsp[] = { '\\', 'P', '{', 'X', 's', 'p', '}', '\0' };
272     static const pcre_uchar literal_pXsp[] = { '\\', 'p', '{', 'X', 's', 'p', '}', '\0' };
273     static const pcre_uchar literal_PXwd[] = { '\\', 'P', '{', 'X', 'w', 'd', '}', '\0' };
274     static const pcre_uchar literal_pXwd[] = { '\\', 'p', '{', 'X', 'w', 'd', '}', '\0' };
275    
276     static const pcre_uchar *substitutes[] = {
277     literal_PNd, /* \D */
278     literal_pNd, /* \d */
279     literal_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
280     literal_pXsp, /* \s */
281     literal_PXwd, /* \W */
282     literal_pXwd /* \w */
283 ph10 518 };
284 ph10 535
285 ph10 756 static const pcre_uchar literal_pL[] = { '\\', 'p', '{', 'L', '}', '\0' };
286     static const pcre_uchar literal_pLl[] = { '\\', 'p', '{', 'L', 'l', '}', '\0' };
287     static const pcre_uchar literal_pLu[] = { '\\', 'p', '{', 'L', 'u', '}', '\0' };
288     static const pcre_uchar literal_pXan[] = { '\\', 'p', '{', 'X', 'a', 'n', '}', '\0' };
289     static const pcre_uchar literal_h[] = { '\\', 'h', '\0' };
290     static const pcre_uchar literal_pXps[] = { '\\', 'p', '{', 'X', 'p', 's', '}', '\0' };
291     static const pcre_uchar literal_PL[] = { '\\', 'P', '{', 'L', '}', '\0' };
292     static const pcre_uchar literal_PLl[] = { '\\', 'P', '{', 'L', 'l', '}', '\0' };
293     static const pcre_uchar literal_PLu[] = { '\\', 'P', '{', 'L', 'u', '}', '\0' };
294     static const pcre_uchar literal_PXan[] = { '\\', 'P', '{', 'X', 'a', 'n', '}', '\0' };
295     static const pcre_uchar literal_H[] = { '\\', 'H', '\0' };
296     static const pcre_uchar literal_PXps[] = { '\\', 'P', '{', 'X', 'p', 's', '}', '\0' };
297    
298     static const pcre_uchar *posix_substitutes[] = {
299     literal_pL, /* alpha */
300     literal_pLl, /* lower */
301     literal_pLu, /* upper */
302     literal_pXan, /* alnum */
303     NULL, /* ascii */
304     literal_h, /* blank */
305     NULL, /* cntrl */
306     literal_pNd, /* digit */
307     NULL, /* graph */
308     NULL, /* print */
309     NULL, /* punct */
310     literal_pXps, /* space */ /* NOTE: Xps is POSIX space */
311     literal_pXwd, /* word */
312     NULL, /* xdigit */
313 ph10 518 /* Negated cases */
314 ph10 756 literal_PL, /* ^alpha */
315     literal_PLl, /* ^lower */
316     literal_PLu, /* ^upper */
317     literal_PXan, /* ^alnum */
318     NULL, /* ^ascii */
319     literal_H, /* ^blank */
320     NULL, /* ^cntrl */
321     literal_PNd, /* ^digit */
322     NULL, /* ^graph */
323     NULL, /* ^print */
324     NULL, /* ^punct */
325     literal_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
326     literal_PXwd, /* ^word */
327     NULL /* ^xdigit */
328 ph10 518 };
329 ph10 756 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
330 ph10 535 #endif
331 ph10 518
/* Two-step stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets the argument be macro-expanded first, so
that XSTRING(SOME_MACRO) yields the text of the macro's value. */

#define STRING(text) #text
#define XSTRING(text) STRING(text)
334    
335 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
336 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
337     they are documented. Always add a new error instead. Messages marked DEAD below
338 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
339     the number of relocations needed when a shared library is loaded dynamically,
340     it is now one long string. We cannot use a table of offsets, because the
341     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
342     simply count through to the one we want - this isn't a performance issue
343 ph10 507 because these strings are used only when there is a compilation error.
344 nigel 77
345 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
346     substring, so that the whole string ends with \0\0, which can be detected when
347 ph10 499 counting through. */
348    
349 ph10 240 static const char error_texts[] =
350     "no error\0"
351     "\\ at end of pattern\0"
352     "\\c at end of pattern\0"
353     "unrecognized character follows \\\0"
354     "numbers out of order in {} quantifier\0"
355 nigel 77 /* 5 */
356 ph10 240 "number too big in {} quantifier\0"
357     "missing terminating ] for character class\0"
358     "invalid escape sequence in character class\0"
359     "range out of order in character class\0"
360     "nothing to repeat\0"
361 nigel 77 /* 10 */
362 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
363     "internal error: unexpected repeat\0"
364 ph10 269 "unrecognized character after (? or (?-\0"
365 ph10 240 "POSIX named classes are supported only within a class\0"
366     "missing )\0"
367 nigel 77 /* 15 */
368 ph10 240 "reference to non-existent subpattern\0"
369     "erroffset passed as NULL\0"
370     "unknown option bit(s) set\0"
371     "missing ) after comment\0"
372     "parentheses nested too deeply\0" /** DEAD **/
373 nigel 77 /* 20 */
374 ph10 240 "regular expression is too large\0"
375     "failed to get memory\0"
376     "unmatched parentheses\0"
377     "internal error: code overflow\0"
378     "unrecognized character after (?<\0"
379 nigel 77 /* 25 */
380 ph10 240 "lookbehind assertion is not fixed length\0"
381     "malformed number or name after (?(\0"
382     "conditional group contains more than two branches\0"
383     "assertion expected after (?(\0"
384     "(?R or (?[+-]digits must be followed by )\0"
385 nigel 77 /* 30 */
386 ph10 240 "unknown POSIX class name\0"
387     "POSIX collating elements are not supported\0"
388     "this version of PCRE is not compiled with PCRE_UTF8 support\0"
389     "spare error\0" /** DEAD **/
390     "character value in \\x{...} sequence is too large\0"
391 nigel 77 /* 35 */
392 ph10 240 "invalid condition (?(0)\0"
393     "\\C not allowed in lookbehind assertion\0"
394 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
395 ph10 240 "number after (?C is > 255\0"
396     "closing ) for (?C expected\0"
397 nigel 77 /* 40 */
398 ph10 240 "recursive call could loop indefinitely\0"
399     "unrecognized character after (?P\0"
400     "syntax error in subpattern name (missing terminator)\0"
401     "two named subpatterns have the same name\0"
402     "invalid UTF-8 string\0"
403 nigel 77 /* 45 */
404 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
405     "malformed \\P or \\p sequence\0"
406     "unknown property name after \\P or \\p\0"
407     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
408     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
409 nigel 91 /* 50 */
410 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
411     "octal value is greater than \\377 (not in UTF-8 mode)\0"
412     "internal error: overran compiling workspace\0"
413     "internal error: previously-checked referenced subpattern not found\0"
414     "DEFINE group contains more than one branch\0"
415 nigel 93 /* 55 */
416 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
417 ph10 240 "inconsistent NEWLINE options\0"
418 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
419     "a numbered reference must not be zero\0"
420 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
421 ph10 211 /* 60 */
422 ph10 240 "(*VERB) not recognized\0"
423 ph10 268 "number is too big\0"
424 ph10 272 "subpattern name expected\0"
425 ph10 336 "digit expected after (?+\0"
426 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
427     /* 65 */
428 ph10 510 "different names for subpatterns of the same number are not allowed\0"
429 ph10 512 "(*MARK) must have an argument\0"
430 ph10 535 "this version of PCRE is not compiled with PCRE_UCP support\0"
431 ph10 579 "\\c must be followed by an ASCII character\0"
432 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
433 ph10 747 /* 70 */
434     "internal error: unknown opcode in find_fixedlength()\0"
435 ph10 510 ;
436 nigel 77
437     /* Table to identify digits and hex digits. This is used when compiling
438     patterns. Note that the tables in chartables are dependent on the locale, and
439     may mark arbitrary characters as digits - but the PCRE compiling code expects
440     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
441     a private table here. It costs 256 bytes, but it is a lot faster than doing
442     character value tests (at least in some simple cases I timed), and in some
443     applications one wants PCRE to compile efficiently as well as match
444     efficiently.
445    
446     For convenience, we use the same bit definitions as in chartables:
447    
448     0x04 decimal digit
449     0x08 hexadecimal digit
450    
451     Then we can use ctype_digit and ctype_xdigit in the code. */
452    
453 ph10 392 #ifndef EBCDIC
454 ph10 391
455 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
456 ph10 391 UTF-8 mode. */
457    
458 nigel 77 static const unsigned char digitab[] =
459     {
460     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
461     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
462     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
463     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
464     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
465     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
466     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
467     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
468     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
469     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
470     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
471     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
472     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
473     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
474     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
475     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
476     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
477     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
478     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
479     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
480     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
481     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
482     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
483     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
484     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
485     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
486     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
487     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
488     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
489     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
490     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
491     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
492    
493 ph10 392 #else
494 ph10 391
495     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
496    
497 nigel 77 static const unsigned char digitab[] =
498     {
499     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
500     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
501     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
502     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
503     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
504     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
505     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
506     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
507     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
508     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
509     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
510 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
511 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
512     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
513     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
514     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
515     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
516     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
517     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
518     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
519     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
520     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
521     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
522     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
523     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
526     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
527     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
528     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
529     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
530     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
531    
532     static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
533     0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
534     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
535     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
536     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
537     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
538     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
539     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
540     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
541     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
542     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
543     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
544 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
545 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
546     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
547     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
548     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
549     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
550     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
551     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
552     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
553     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
554     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
555     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
556     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
557     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
558     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
559     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
560     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
561     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
562     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
563     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
564     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
565     #endif
566    
567    
568     /* Definition to allow mutual recursion */
569    
570     static BOOL
571 ph10 756 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
572 ph10 642 int *, int *, branch_chain *, compile_data *, int *);
573 nigel 77
574    
575    
576     /*************************************************
577 ph10 240 * Find an error text *
578     *************************************************/
579    
580 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
581     some of the text is of unknown length, we can't use a table of offsets.
582     Instead, just count through the strings. This is not a performance issue
583 ph10 240 because it happens only when there has been a compilation error.
584    
585     Argument: the error number
586     Returns: pointer to the error string
587     */
588    
589     static const char *
590     find_error_text(int n)
591     {
592     const char *s = error_texts;
593 ph10 507 for (; n > 0; n--)
594 ph10 499 {
595     while (*s++ != 0) {};
596     if (*s == 0) return "Error text not found (please report)";
597 ph10 507 }
598 ph10 240 return s;
599     }
600    
601    
602     /*************************************************
603 ph10 640 * Check for counted repeat *
604     *************************************************/
605    
606     /* This function is called when a '{' is encountered in a place where it might
607     start a quantifier. It looks ahead to see if it really is a quantifier or not.
608     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
609     where the ddds are digits.
610    
611     Arguments:
612     p pointer to the first char after '{'
613    
614     Returns: TRUE or FALSE
615     */
616    
617     static BOOL
618 ph10 756 is_counted_repeat(const pcre_uchar *p)
619 ph10 640 {
620     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
621     while ((digitab[*p] & ctype_digit) != 0) p++;
622     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
623    
624     if (*p++ != CHAR_COMMA) return FALSE;
625     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
626    
627     if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
628     while ((digitab[*p] & ctype_digit) != 0) p++;
629    
630     return (*p == CHAR_RIGHT_CURLY_BRACKET);
631     }
632    
633    
634    
635     /*************************************************
636 nigel 77 * Handle escapes *
637     *************************************************/
638    
639     /* This function is called when a \ has been encountered. It either returns a
640     positive value for a simple escape such as \n, or a negative value which
641 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
642     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
643     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
644     ptr is pointing at the \. On exit, it is on the final character of the escape
645     sequence.
646 nigel 77
647     Arguments:
648     ptrptr points to the pattern position pointer
649     errorcodeptr points to the errorcode variable
650     bracount number of previous extracting brackets
651     options the options bits
652     isclass TRUE if inside a character class
653    
654     Returns: zero or positive => a data character
655     negative => a special escape sequence
656 ph10 213 on error, errorcodeptr is set
657 nigel 77 */
658    
659     static int
660 ph10 756 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
661 nigel 77 int options, BOOL isclass)
662     {
663 nigel 87 BOOL utf8 = (options & PCRE_UTF8) != 0;
664 ph10 756 const pcre_uchar *ptr = *ptrptr + 1;
665 nigel 77 int c, i;
666    
667 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
668     ptr--; /* Set pointer back to the last byte */
669    
670 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
671    
672     if (c == 0) *errorcodeptr = ERR1;
673    
674 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
675     in a table. A non-zero result is something that can be returned immediately.
676 nigel 77 Otherwise further processing may be required. */
677    
678 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
679     else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */
680     else if ((i = escapes[c - CHAR_0]) != 0) c = i;
681 nigel 77
682 ph10 97 #else /* EBCDIC coding */
683 ph10 274 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
684 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
685     #endif
686    
687     /* Escapes that need further processing, or are illegal. */
688    
689     else
690     {
691 ph10 756 const pcre_uchar *oldptr;
692 nigel 93 BOOL braced, negated;
693    
694 nigel 77 switch (c)
695     {
696     /* A number of Perl escapes are not handled by PCRE. We give an explicit
697     error. */
698    
699 ph10 391 case CHAR_l:
700     case CHAR_L:
701 zherczeg 744 *errorcodeptr = ERR37;
702     break;
703    
704 ph10 391 case CHAR_u:
705 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
706     {
707     /* In JavaScript, \u must be followed by four hexadecimal numbers.
708     Otherwise it is a lowercase u letter. */
709     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0
710     && (digitab[ptr[3]] & ctype_xdigit) != 0 && (digitab[ptr[4]] & ctype_xdigit) != 0)
711     {
712     c = 0;
713     for (i = 0; i < 4; ++i)
714     {
715     register int cc = *(++ptr);
716     #ifndef EBCDIC /* ASCII/UTF-8 coding */
717     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
718     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
719     #else /* EBCDIC coding */
720     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
721     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
722     #endif
723     }
724     }
725     }
726     else
727     *errorcodeptr = ERR37;
728     break;
729    
730 ph10 391 case CHAR_U:
731 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
732     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
733 nigel 77 break;
734    
735 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
736 ph10 640 class, \g must be followed by one of a number of specific things:
737 ph10 345
738 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
739     backreference. If negative, it is a relative backreference. This is a Perl
740     5.10 feature.
741 ph10 345
742 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
743     is part of Perl's movement towards a unified syntax for back references. As
744     this is synonymous with \k{name}, we fudge it up by pretending it really
745     was \k.
746 ph10 345
747     (3) For Oniguruma compatibility we also support \g followed by a name or a
748     number either in angle brackets or in single quotes. However, these are
749     (possibly recursive) subroutine calls, _not_ backreferences. Just return
750 ph10 333 the -ESC_g code (cf \k). */
751 nigel 93
752 ph10 391 case CHAR_g:
753 ph10 640 if (isclass) break;
754 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
755 ph10 333 {
756     c = -ESC_g;
757 ph10 345 break;
758     }
759 ph10 333
760     /* Handle the Perl-compatible cases */
761 ph10 345
762 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
763 nigel 93 {
764 ph10 756 const pcre_uchar *p;
765 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
766     if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
767     if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
768 ph10 171 {
769     c = -ESC_k;
770     break;
771 ph10 172 }
772 nigel 93 braced = TRUE;
773     ptr++;
774     }
775     else braced = FALSE;
776    
777 ph10 391 if (ptr[1] == CHAR_MINUS)
778 nigel 93 {
779     negated = TRUE;
780     ptr++;
781     }
782     else negated = FALSE;
783    
784     c = 0;
785     while ((digitab[ptr[1]] & ctype_digit) != 0)
786 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
787 ph10 220
788 ph10 333 if (c < 0) /* Integer overflow */
789 ph10 213 {
790     *errorcodeptr = ERR61;
791     break;
792 ph10 220 }
793 ph10 345
794 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
795 nigel 93 {
796     *errorcodeptr = ERR57;
797 ph10 213 break;
798 nigel 93 }
799 ph10 345
800 ph10 333 if (c == 0)
801     {
802     *errorcodeptr = ERR58;
803     break;
804 ph10 345 }
805 nigel 93
806     if (negated)
807     {
808     if (c > bracount)
809     {
810     *errorcodeptr = ERR15;
811 ph10 213 break;
812 nigel 93 }
813     c = bracount - (c - 1);
814     }
815    
816     c = -(ESC_REF + c);
817     break;
818    
819 nigel 77 /* The handling of escape sequences consisting of a string of digits
820     starting with one that is not zero is not straightforward. By experiment,
821     the way Perl works seems to be as follows:
822    
823     Outside a character class, the digits are read as a decimal number. If the
824     number is less than 10, or if there are that many previous extracting
825     left brackets, then it is a back reference. Otherwise, up to three octal
826     digits are read to form an escaped byte. Thus \123 is likely to be octal
827     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
828     value is greater than 377, the least significant 8 bits are taken. Inside a
829     character class, \ followed by a digit is always an octal number. */
830    
831 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
832     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
833 nigel 77
834     if (!isclass)
835     {
836     oldptr = ptr;
837 ph10 391 c -= CHAR_0;
838 nigel 77 while ((digitab[ptr[1]] & ctype_digit) != 0)
839 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
840 ph10 333 if (c < 0) /* Integer overflow */
841 ph10 213 {
842     *errorcodeptr = ERR61;
843 ph10 220 break;
844     }
845 nigel 77 if (c < 10 || c <= bracount)
846     {
847     c = -(ESC_REF + c);
848     break;
849     }
850     ptr = oldptr; /* Put the pointer back and fall through */
851     }
852    
853     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
854     generates a binary zero byte and treats the digit as a following literal.
855     Thus we have to pull back the pointer by one. */
856    
857 ph10 391 if ((c = *ptr) >= CHAR_8)
858 nigel 77 {
859     ptr--;
860     c = 0;
861     break;
862     }
863    
864     /* \0 always starts an octal number, but we may drop through to here with a
865 nigel 91 larger first octal digit. The original code used just to take the least
866     significant 8 bits of octal numbers (I think this is what early Perls used
867     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
868     than 3 octal digits. */
869 nigel 77
870 ph10 391 case CHAR_0:
871     c -= CHAR_0;
872     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
873     c = c * 8 + *(++ptr) - CHAR_0;
874 nigel 91 if (!utf8 && c > 255) *errorcodeptr = ERR51;
875 nigel 77 break;
876    
877 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
878     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
879     treated as a data character. */
880 nigel 77
881 ph10 391 case CHAR_x:
882 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
883     {
884     /* In JavaScript, \x must be followed by two hexadecimal numbers.
885     Otherwise it is a lowercase x letter. */
886     if ((digitab[ptr[1]] & ctype_xdigit) != 0 && (digitab[ptr[2]] & ctype_xdigit) != 0)
887     {
888     c = 0;
889     for (i = 0; i < 2; ++i)
890     {
891     register int cc = *(++ptr);
892     #ifndef EBCDIC /* ASCII/UTF-8 coding */
893     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
894     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
895     #else /* EBCDIC coding */
896     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
897     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
898     #endif
899     }
900     }
901     break;
902     }
903    
904 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
905 nigel 77 {
906 ph10 756 const pcre_uchar *pt = ptr + 2;
907 nigel 87 int count = 0;
908    
909 nigel 77 c = 0;
910     while ((digitab[*pt] & ctype_xdigit) != 0)
911     {
912 nigel 87 register int cc = *pt++;
913 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
914 nigel 77 count++;
915 nigel 87
916 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
917     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
918     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
919 ph10 97 #else /* EBCDIC coding */
920 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
921     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
922 nigel 77 #endif
923     }
924 nigel 87
925 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
926 nigel 77 {
927 nigel 87 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
928 nigel 77 ptr = pt;
929     break;
930     }
931 nigel 87
932 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
933     recognize this construct; fall through to the normal \x handling. */
934     }
935    
936 nigel 87 /* Read just a single-byte hex-defined char */
937 nigel 77
938     c = 0;
939     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
940     {
941 ph10 391 int cc; /* Some compilers don't like */
942     cc = *(++ptr); /* ++ in initializers */
943     #ifndef EBCDIC /* ASCII/UTF-8 coding */
944     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
945     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
946 ph10 97 #else /* EBCDIC coding */
947 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
948     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
949 nigel 77 #endif
950     }
951     break;
952    
953 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
954 ph10 574 An error is given if the byte following \c is not an ASCII character. This
955     coding is ASCII-specific, but then the whole concept of \cx is
956 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
957 nigel 77
958 ph10 391 case CHAR_c:
959 nigel 77 c = *(++ptr);
960     if (c == 0)
961     {
962     *errorcodeptr = ERR2;
963 ph10 213 break;
964 nigel 77 }
965 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
966     if (c > 127) /* Excludes all non-ASCII in either mode */
967     {
968     *errorcodeptr = ERR68;
969 ph10 579 break;
970     }
971 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
972 nigel 77 c ^= 0x40;
973 ph10 574 #else /* EBCDIC coding */
974 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
975 nigel 77 c ^= 0xC0;
976     #endif
977     break;
978    
979     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
980 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
981     otherwise, for Perl compatibility, it is a literal. This code looks a bit
982     odd, but there used to be some cases other than the default, and there may
983     be again in future, so I haven't "optimized" it. */
984 nigel 77
985     default:
986     if ((options & PCRE_EXTRA) != 0) switch(c)
987     {
988     default:
989     *errorcodeptr = ERR3;
990     break;
991     }
992     break;
993     }
994     }
995 ph10 518
996     /* Perl supports \N{name} for character names, as well as plain \N for "not
997 ph10 654 newline". PCRE does not support \N{name}. However, it does support
998 ph10 640 quantification such as \N{2,3}. */
999 nigel 77
1000 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1001     !is_counted_repeat(ptr+2))
1002 ph10 518 *errorcodeptr = ERR37;
1003 ph10 514
1004 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
1005    
1006     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1007     c -= (ESC_DU - ESC_D);
1008    
1009     /* Set the pointer to the final character before returning. */
1010    
1011 nigel 77 *ptrptr = ptr;
1012     return c;
1013     }
1014    
1015    
1016    
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Called once \P or \p has been seen, in builds that support Unicode
properties. The property name is either the single character that follows, or
a name enclosed in {}, optionally preceded by ^ to request negation. On entry,
ptrptr points at the P or p; on exit it points at the last character of the
escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const pcre_uchar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

/* The simple case: the name is just the one character after \p or \P. */

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  name[0] = ch;
  name[1] = 0;
  }

/* Otherwise collect the name from inside {}, noting a leading ^. The name
must be terminated by } before the buffer is full. */

else
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    name[len++] = ch;
    }

  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary chop through the table of property names. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);

  if (cmp > 0) lo = mid + 1;
  else if (cmp < 0) hi = mid;
  else
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  }

/* The name is well-formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The escape sequence itself is malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1106    
1107    
1108    
1109    
1110     /*************************************************
1111     * Read repeat counts *
1112     *************************************************/
1113    
1114     /* Read an item of the form {n,m} and return the values. This is called only
1115     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1116     so the syntax is guaranteed to be correct, but we need to check the values.
1117    
1118     Arguments:
1119     p pointer to first char after '{'
1120     minp pointer to int for min
1121     maxp pointer to int for max
1122     returned as -1 if no max
1123     errorcodeptr points to error code variable
1124    
1125     Returns: pointer to '}' on success;
1126     current ptr on error, with errorcodeptr set non-zero
1127     */
1128    
1129 ph10 756 static const pcre_uchar *
1130     read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1131 nigel 77 {
1132     int min = 0;
1133     int max = -1;
1134    
1135 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1136     an integer overflow. */
1137    
1138 ph10 391 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1139 nigel 81 if (min < 0 || min > 65535)
1140     {
1141     *errorcodeptr = ERR5;
1142     return p;
1143     }
1144 nigel 77
1145 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1146     Also, max must not be less than min. */
1147    
1148 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1149 nigel 77 {
1150 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1151 nigel 77 {
1152     max = 0;
1153 ph10 391 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1154 nigel 81 if (max < 0 || max > 65535)
1155     {
1156     *errorcodeptr = ERR5;
1157     return p;
1158     }
1159 nigel 77 if (max < min)
1160     {
1161     *errorcodeptr = ERR4;
1162     return p;
1163     }
1164     }
1165     }
1166    
1167 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1168     '}'. */
1169 nigel 77
1170 nigel 81 *minp = min;
1171     *maxp = max;
1172 nigel 77 return p;
1173     }
1174    
1175    
1176    
1177     /*************************************************
1178 ph10 408 * Subroutine for finding forward reference *
1179 nigel 91 *************************************************/
1180    
1181 ph10 408 /* This recursive function is called only from find_parens() below. The
1182     top-level call starts at the beginning of the pattern. All other calls must
1183     start at a parenthesis. It scans along a pattern's text looking for capturing
1184 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1185     name it is given, it returns its number. Alternatively, if the name is NULL, it
1186 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1187     track of subpatterns that reset the capturing group numbers - the (?| feature.
1188 nigel 91
1189 ph10 578 This function was originally called only from the second pass, in which we know
1190     that if (?< or (?' or (?P< is encountered, the name will be correctly
1191     terminated because that is checked in the first pass. There is now one call to
1192     this function in the first pass, to check for a recursive back reference by
1193     name (so that we can make the whole group atomic). In this case, we need check
1194 ph10 579 only up to the current position in the pattern, and that is still OK because
1195     and previous occurrences will have been checked. To make this work, the test
1196     for "end of pattern" is a check against cd->end_pattern in the main loop,
1197 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1198 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1199     processing items within the loop are OK, because afterwards the main loop will
1200 ph10 578 terminate.)
1201    
1202 nigel 91 Arguments:
1203 ph10 408 ptrptr address of the current character pointer (updated)
1204 ph10 345 cd compile background data
1205 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1206     lorn name length, or subpattern number if name is NULL
1207     xmode TRUE if we are in /x mode
1208 ph10 579 utf8 TRUE if we are in UTF-8 mode
1209 ph10 411 count pointer to the current capturing subpattern number (updated)
1210 nigel 91
1211     Returns: the number of the named subpattern, or -1 if not found
1212     */
1213    
1214     static int
1215 ph10 756 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1216 ph10 556 BOOL xmode, BOOL utf8, int *count)
1217 nigel 91 {
1218 ph10 756 pcre_uchar *ptr = *ptrptr;
1219 ph10 408 int start_count = *count;
1220     int hwm_count = start_count;
1221     BOOL dup_parens = FALSE;
1222 nigel 93
1223 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1224 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1225    
1226     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1227     {
1228 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1229 ph10 545
1230 ph10 544 if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1231 ph10 545
1232 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1233 ph10 408
1234 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1235 ph10 408 {
1236     *count += 1;
1237     if (name == NULL && *count == lorn) return *count;
1238 ph10 411 ptr++;
1239 ph10 408 }
1240    
1241 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1242     where the parenthesis numbers are duplicated. */
1243    
1244     else if (ptr[2] == CHAR_VERTICAL_LINE)
1245     {
1246     ptr += 3;
1247     dup_parens = TRUE;
1248     }
1249 ph10 545
1250 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1251    
1252     else if (ptr[2] == CHAR_NUMBER_SIGN)
1253     {
1254     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1255     goto FAIL_EXIT;
1256 ph10 545 }
1257 ph10 544
1258 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1259     is processed as normal. If not, skip to the closing parenthesis of the
1260 ph10 544 condition (there can't be any nested parens). */
1261 ph10 411
1262 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1263     {
1264 ph10 411 ptr += 2;
1265 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1266     {
1267     while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1268 ph10 411 if (*ptr != 0) ptr++;
1269 ph10 408 }
1270 ph10 411 }
1271    
1272 ph10 544 /* Start with (? but not a condition. */
1273 ph10 408
1274     else
1275 ph10 411 {
1276 ph10 408 ptr += 2;
1277     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1278    
1279     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1280 ph10 411
1281 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1282     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1283     {
1284     int term;
1285 ph10 756 const pcre_uchar *thisname;
1286 ph10 408 *count += 1;
1287     if (name == NULL && *count == lorn) return *count;
1288     term = *ptr++;
1289     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1290     thisname = ptr;
1291     while (*ptr != term) ptr++;
1292     if (name != NULL && lorn == ptr - thisname &&
1293     strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1294     return *count;
1295 ph10 461 term++;
1296 ph10 411 }
1297 ph10 408 }
1298 ph10 411 }
1299 ph10 408
1300 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1301 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1302     first-pass call when this value is temporarily adjusted to stop at the current
1303 ph10 578 position. So DO NOT change this to a test for binary zero. */
1304 ph10 408
1305 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1306 nigel 91 {
1307 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1308    
1309 ph10 391 if (*ptr == CHAR_BACKSLASH)
1310 nigel 93 {
1311 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1312 ph10 391 if (*ptr == CHAR_Q) for (;;)
1313 nigel 93 {
1314 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1315 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1316 ph10 391 if (*(++ptr) == CHAR_E) break;
1317 nigel 93 }
1318     continue;
1319     }
1320    
1321 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1322     are handled for real. If the first character is '^', skip it. Also, if the
1323     first few characters (either before or after ^) are \Q\E or \E we skip them
1324 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1325 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1326 nigel 93
1327 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1328 nigel 93 {
1329 ph10 340 BOOL negate_class = FALSE;
1330     for (;;)
1331     {
1332 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1333 ph10 340 {
1334 ph10 438 if (ptr[2] == CHAR_E)
1335     ptr+= 2;
1336     else if (strncmp((const char *)ptr+2,
1337 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1338 ph10 438 ptr += 4;
1339 ph10 392 else
1340 ph10 391 break;
1341 ph10 340 }
1342 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1343 ph10 461 {
1344 ph10 340 negate_class = TRUE;
1345 ph10 438 ptr++;
1346 ph10 461 }
1347 ph10 340 else break;
1348     }
1349    
1350     /* If the next character is ']', it is a data character that must be
1351 ph10 341 skipped, except in JavaScript compatibility mode. */
1352 ph10 345
1353 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1354 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1355 ph10 345 ptr++;
1356    
1357 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1358 nigel 93 {
1359 ph10 220 if (*ptr == 0) return -1;
1360 ph10 391 if (*ptr == CHAR_BACKSLASH)
1361 nigel 93 {
1362 ph10 408 if (*(++ptr) == 0) goto FAIL_EXIT;
1363 ph10 391 if (*ptr == CHAR_Q) for (;;)
1364 nigel 93 {
1365 ph10 391 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1366 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1367 ph10 391 if (*(++ptr) == CHAR_E) break;
1368 nigel 93 }
1369     continue;
1370     }
1371     }
1372     continue;
1373     }
1374    
1375     /* Skip comments in /x mode */
1376    
1377 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1378 nigel 93 {
1379 ph10 579 ptr++;
1380 ph10 556 while (*ptr != 0)
1381     {
1382     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1383     ptr++;
1384 ph10 579 #ifdef SUPPORT_UTF8
1385 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1386     #endif
1387     }
1388 ph10 408 if (*ptr == 0) goto FAIL_EXIT;
1389 nigel 93 continue;
1390     }
1391    
1392 ph10 408 /* Check for the special metacharacters */
1393 ph10 411
1394 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1395 nigel 93 {
1396 ph10 556 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1397 ph10 408 if (rc > 0) return rc;
1398     if (*ptr == 0) goto FAIL_EXIT;
1399 nigel 93 }
1400 ph10 411
1401 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1402     {
1403     if (dup_parens && *count < hwm_count) *count = hwm_count;
1404 ph10 545 goto FAIL_EXIT;
1405 ph10 408 }
1406 ph10 411
1407     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1408 ph10 408 {
1409     if (*count > hwm_count) hwm_count = *count;
1410     *count = start_count;
1411 ph10 411 }
1412 ph10 408 }
1413 nigel 93
1414 ph10 408 FAIL_EXIT:
1415     *ptrptr = ptr;
1416     return -1;
1417     }
1418 nigel 93
1419    
1420    
1421    
1422 ph10 408 /*************************************************
1423     * Find forward referenced subpattern *
1424     *************************************************/
1425 nigel 93
1426 ph10 408 /* This function scans along a pattern's text looking for capturing
1427     subpatterns, and counting them. If it finds a named pattern that matches the
1428     name it is given, it returns its number. Alternatively, if the name is NULL, it
1429     returns when it reaches a given numbered subpattern. This is used for forward
1430     references to subpatterns. We used to be able to start this scan from the
1431     current compiling point, using the current count value from cd->bracount, and
1432     do it all in a single loop, but the addition of the possibility of duplicate
1433     subpattern numbers means that we have to scan from the very start, in order to
1434     take account of such duplicates, and to use a recursive function to keep track
1435     of the different types of group.
1436    
1437     Arguments:
1438     cd compile background data
1439     name name to seek, or NULL if seeking a numbered subpattern
1440     lorn name length, or subpattern number if name is NULL
1441     xmode TRUE if we are in /x mode
1442 ph10 579 utf8 TRUE if we are in UTF-8 mode
1443 ph10 408
1444     Returns: the number of the found subpattern, or -1 if not found
1445     */
1446    
1447     static int
1448 ph10 756 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1449 ph10 556 BOOL utf8)
1450 ph10 408 {
1451 ph10 756 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1452 ph10 408 int count = 0;
1453     int rc;
1454    
1455     /* If the pattern does not start with an opening parenthesis, the first call
1456     to find_parens_sub() will scan right to the end (if necessary). However, if it
1457     does start with a parenthesis, find_parens_sub() will return when it hits the
1458     matching closing parens. That is why we have to have a loop. */
1459    
1460 ph10 411 for (;;)
1461     {
1462 ph10 556 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1463 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1464     }
1465    
1466 ph10 408 return rc;
1467 nigel 91 }
1468    
1469    
1470    
1471 ph10 408
1472 nigel 91 /*************************************************
1473 nigel 77 * Find first significant op code *
1474     *************************************************/
1475    
1476     /* This is called by several functions that scan a compiled expression looking
1477     for a fixed first character, or an anchoring op code etc. It skips over things
1478 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1479     forward and all backward assertions, and also the \b assertion; for others it
1480     does not.
1481 nigel 77
1482     Arguments:
1483     code pointer to the start of the group
1484     skipassert TRUE if certain assertions are to be skipped
1485    
1486     Returns: pointer to the first significant opcode
1487     */
1488    
1489 ph10 756 static const pcre_uchar*
1490     first_significant_code(const pcre_uchar *code, BOOL skipassert)
1491 nigel 77 {
1492     for (;;)
1493     {
1494     switch ((int)*code)
1495     {
1496     case OP_ASSERT_NOT:
1497     case OP_ASSERTBACK:
1498     case OP_ASSERTBACK_NOT:
1499     if (!skipassert) return code;
1500     do code += GET(code, 1); while (*code == OP_ALT);
1501     code += _pcre_OP_lengths[*code];
1502     break;
1503    
1504     case OP_WORD_BOUNDARY:
1505     case OP_NOT_WORD_BOUNDARY:
1506     if (!skipassert) return code;
1507     /* Fall through */
1508    
1509     case OP_CALLOUT:
1510     case OP_CREF:
1511 ph10 459 case OP_NCREF:
1512 nigel 93 case OP_RREF:
1513 ph10 459 case OP_NRREF:
1514 nigel 93 case OP_DEF:
1515 nigel 77 code += _pcre_OP_lengths[*code];
1516     break;
1517    
1518     default:
1519     return code;
1520     }
1521     }
1522     /* Control never reaches here */
1523     }
1524    
1525    
1526    
1527    
1528     /*************************************************
1529 ph10 454 * Find the fixed length of a branch *
1530 nigel 77 *************************************************/
1531    
1532 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1533 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1534 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1535 ph10 454 temporarily terminated with OP_END when this function is called.
1536 nigel 77
1537 ph10 461 This function is called when a backward assertion is encountered, so that if it
1538     fails, the error message can point to the correct place in the pattern.
1539 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1540 ph10 461 because they can be forward references. We solve this by remembering this case
1541 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1542    
1543 nigel 77 Arguments:
1544     code points to the start of the pattern (the bracket)
1545 ph10 604 utf8 TRUE in UTF-8 mode
1546 ph10 461 atend TRUE if called when the pattern is complete
1547     cd the "compile data" structure
1548 nigel 77
1549 ph10 461 Returns: the fixed length,
1550 ph10 454 or -1 if there is no fixed length,
1551 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1552 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1553 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1554 nigel 77 */
1555    
1556     static int
1557 ph10 756 find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd)
1558 nigel 77 {
1559     int length = -1;
1560    
1561     register int branchlength = 0;
1562 ph10 756 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1563 nigel 77
1564     /* Scan along the opcodes for this branch. If we get to the end of the
1565     branch, check the length against that of the other branches. */
1566    
1567     for (;;)
1568     {
1569     int d;
1570 ph10 756 pcre_uchar *ce, *cs;
1571 nigel 77 register int op = *cc;
1572     switch (op)
1573     {
1574 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1575     OP_BRA (normal non-capturing bracket) because the other variants of these
1576     opcodes are all concerned with unlimited repeated groups, which of course
1577 ph10 747 are not of fixed length. */
1578 ph10 604
1579 nigel 93 case OP_CBRA:
1580 nigel 77 case OP_BRA:
1581     case OP_ONCE:
1582 ph10 733 case OP_ONCE_NC:
1583 nigel 77 case OP_COND:
1584 ph10 604 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), utf8, atend, cd);
1585 nigel 77 if (d < 0) return d;
1586     branchlength += d;
1587     do cc += GET(cc, 1); while (*cc == OP_ALT);
1588     cc += 1 + LINK_SIZE;
1589     break;
1590    
1591 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1592     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1593     an ALT. If it is END it's the end of the outer call. All can be handled by
1594     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1595     because they all imply an unlimited repeat. */
1596 nigel 77
1597     case OP_ALT:
1598     case OP_KET:
1599     case OP_END:
1600 ph10 747 case OP_ACCEPT:
1601     case OP_ASSERT_ACCEPT:
1602 nigel 77 if (length < 0) length = branchlength;
1603     else if (length != branchlength) return -1;
1604     if (*cc != OP_ALT) return length;
1605     cc += 1 + LINK_SIZE;
1606     branchlength = 0;
1607     break;
1608 ph10 461
1609 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1610     be OK. If the subroutine is a forward reference, we can't deal with
1611     it until the end of the pattern, so return -3. */
1612 ph10 461
1613 ph10 454 case OP_RECURSE:
1614     if (!atend) return -3;
1615 ph10 756 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1616     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1617     if (cc > cs && cc < ce) return -1; /* Recursion */
1618 ph10 604 d = find_fixedlength(cs + 2, utf8, atend, cd);
1619 ph10 461 if (d < 0) return d;
1620 ph10 454 branchlength += d;
1621     cc += 1 + LINK_SIZE;
1622 ph10 461 break;
1623 nigel 77
1624     /* Skip over assertive subpatterns */
1625    
1626     case OP_ASSERT:
1627     case OP_ASSERT_NOT:
1628     case OP_ASSERTBACK:
1629     case OP_ASSERTBACK_NOT:
1630     do cc += GET(cc, 1); while (*cc == OP_ALT);
1631     /* Fall through */
1632    
1633     /* Skip over things that don't match chars */
1634    
1635 ph10 747 case OP_MARK:
1636     case OP_PRUNE_ARG:
1637     case OP_SKIP_ARG:
1638     case OP_THEN_ARG:
1639     cc += cc[1] + _pcre_OP_lengths[*cc];
1640     break;
1641    
1642 nigel 77 case OP_CALLOUT:
1643     case OP_CIRC:
1644 ph10 602 case OP_CIRCM:
1645 ph10 747 case OP_CLOSE:
1646     case OP_COMMIT:
1647     case OP_CREF:
1648     case OP_DEF:
1649 nigel 77 case OP_DOLL:
1650 ph10 602 case OP_DOLLM:
1651 ph10 747 case OP_EOD:
1652     case OP_EODN:
1653     case OP_FAIL:
1654     case OP_NCREF:
1655     case OP_NRREF:
1656 nigel 77 case OP_NOT_WORD_BOUNDARY:
1657 ph10 747 case OP_PRUNE:
1658     case OP_REVERSE:
1659     case OP_RREF:
1660     case OP_SET_SOM:
1661     case OP_SKIP:
1662     case OP_SOD:
1663     case OP_SOM:
1664     case OP_THEN:
1665 nigel 77 case OP_WORD_BOUNDARY:
1666     cc += _pcre_OP_lengths[*cc];
1667     break;
1668    
1669     /* Handle literal characters */
1670    
1671     case OP_CHAR:
1672 ph10 602 case OP_CHARI:
1673 nigel 91 case OP_NOT:
1674 ph10 604 case OP_NOTI:
1675 nigel 77 branchlength++;
1676     cc += 2;
1677     #ifdef SUPPORT_UTF8
1678 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1679 nigel 77 #endif
1680     break;
1681    
1682     /* Handle exact repetitions. The count is already in characters, but we
1683     need to skip over a multibyte character in UTF8 mode. */
1684    
1685     case OP_EXACT:
1686 ph10 747 case OP_EXACTI:
1687     case OP_NOTEXACT:
1688     case OP_NOTEXACTI:
1689 nigel 77 branchlength += GET2(cc,1);
1690     cc += 4;
1691     #ifdef SUPPORT_UTF8
1692 ph10 604 if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1693 nigel 77 #endif
1694     break;
1695    
1696     case OP_TYPEEXACT:
1697     branchlength += GET2(cc,1);
1698 ph10 220 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1699 nigel 77 cc += 4;
1700     break;
1701    
1702     /* Handle single-char matchers */
1703    
1704     case OP_PROP:
1705     case OP_NOTPROP:
1706 nigel 87 cc += 2;
1707 nigel 77 /* Fall through */
1708    
1709 ph10 747 case OP_HSPACE:
1710     case OP_VSPACE:
1711     case OP_NOT_HSPACE:
1712     case OP_NOT_VSPACE:
1713 nigel 77 case OP_NOT_DIGIT:
1714     case OP_DIGIT:
1715     case OP_NOT_WHITESPACE:
1716     case OP_WHITESPACE:
1717     case OP_NOT_WORDCHAR:
1718     case OP_WORDCHAR:
1719     case OP_ANY:
1720 ph10 342 case OP_ALLANY:
1721 nigel 77 branchlength++;
1722     cc++;
1723     break;
1724    
1725 ph10 754 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1726     otherwise \C is coded as OP_ALLANY. */
1727 nigel 77
1728     case OP_ANYBYTE:
1729     return -2;
1730    
1731     /* Check a class for variable quantification */
1732    
1733     #ifdef SUPPORT_UTF8
1734     case OP_XCLASS:
1735     cc += GET(cc, 1) - 33;
1736     /* Fall through */
1737     #endif
1738    
1739     case OP_CLASS:
1740     case OP_NCLASS:
1741     cc += 33;
1742    
1743     switch (*cc)
1744     {
1745 ph10 747 case OP_CRPLUS:
1746     case OP_CRMINPLUS:
1747 nigel 77 case OP_CRSTAR:
1748     case OP_CRMINSTAR:
1749     case OP_CRQUERY:
1750     case OP_CRMINQUERY:
1751     return -1;
1752    
1753     case OP_CRRANGE:
1754     case OP_CRMINRANGE:
1755     if (GET2(cc,1) != GET2(cc,3)) return -1;
1756     branchlength += GET2(cc,1);
1757     cc += 5;
1758     break;
1759    
1760     default:
1761     branchlength++;
1762     }
1763     break;
1764    
1765     /* Anything else is variable length */
1766    
1767 ph10 747 case OP_ANYNL:
1768     case OP_BRAMINZERO:
1769     case OP_BRAPOS:
1770     case OP_BRAPOSZERO:
1771     case OP_BRAZERO:
1772     case OP_CBRAPOS:
1773     case OP_EXTUNI:
1774     case OP_KETRMAX:
1775     case OP_KETRMIN:
1776     case OP_KETRPOS:
1777     case OP_MINPLUS:
1778     case OP_MINPLUSI:
1779     case OP_MINQUERY:
1780     case OP_MINQUERYI:
1781     case OP_MINSTAR:
1782     case OP_MINSTARI:
1783     case OP_MINUPTO:
1784     case OP_MINUPTOI:
1785     case OP_NOTMINPLUS:
1786     case OP_NOTMINPLUSI:
1787     case OP_NOTMINQUERY:
1788     case OP_NOTMINQUERYI:
1789     case OP_NOTMINSTAR:
1790     case OP_NOTMINSTARI:
1791     case OP_NOTMINUPTO:
1792     case OP_NOTMINUPTOI:
1793     case OP_NOTPLUS:
1794     case OP_NOTPLUSI:
1795     case OP_NOTPOSPLUS:
1796     case OP_NOTPOSPLUSI:
1797     case OP_NOTPOSQUERY:
1798     case OP_NOTPOSQUERYI:
1799     case OP_NOTPOSSTAR:
1800     case OP_NOTPOSSTARI:
1801     case OP_NOTPOSUPTO:
1802     case OP_NOTPOSUPTOI:
1803     case OP_NOTQUERY:
1804     case OP_NOTQUERYI:
1805     case OP_NOTSTAR:
1806     case OP_NOTSTARI:
1807     case OP_NOTUPTO:
1808     case OP_NOTUPTOI:
1809     case OP_PLUS:
1810     case OP_PLUSI:
1811     case OP_POSPLUS:
1812     case OP_POSPLUSI:
1813     case OP_POSQUERY:
1814     case OP_POSQUERYI:
1815     case OP_POSSTAR:
1816     case OP_POSSTARI:
1817     case OP_POSUPTO:
1818     case OP_POSUPTOI:
1819     case OP_QUERY:
1820     case OP_QUERYI:
1821     case OP_REF:
1822     case OP_REFI:
1823     case OP_SBRA:
1824     case OP_SBRAPOS:
1825     case OP_SCBRA:
1826     case OP_SCBRAPOS:
1827     case OP_SCOND:
1828     case OP_SKIPZERO:
1829     case OP_STAR:
1830     case OP_STARI:
1831     case OP_TYPEMINPLUS:
1832     case OP_TYPEMINQUERY:
1833     case OP_TYPEMINSTAR:
1834     case OP_TYPEMINUPTO:
1835     case OP_TYPEPLUS:
1836     case OP_TYPEPOSPLUS:
1837     case OP_TYPEPOSQUERY:
1838     case OP_TYPEPOSSTAR:
1839     case OP_TYPEPOSUPTO:
1840     case OP_TYPEQUERY:
1841     case OP_TYPESTAR:
1842     case OP_TYPEUPTO:
1843     case OP_UPTO:
1844     case OP_UPTOI:
1845     return -1;
1846    
1847     /* Catch unrecognized opcodes so that when new ones are added they
1848     are not forgotten, as has happened in the past. */
1849    
1850 nigel 77 default:
1851 ph10 747 return -4;
1852 nigel 77 }
1853     }
1854     /* Control never gets here */
1855     }
1856    
1857    
1858    
1859    
1860     /*************************************************
1861 ph10 454 * Scan compiled regex for specific bracket *
1862 nigel 77 *************************************************/
1863    
1864     /* This little function scans through a compiled pattern until it finds a
1865 ph10 454 capturing bracket with the given number, or, if the number is negative, an
1866 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1867     so that it can be called from pcre_study() when finding the minimum matching
1868 ph10 455 length.
1869 nigel 77
1870     Arguments:
1871     code points to start of expression
1872     utf8 TRUE in UTF-8 mode
1873 ph10 454 number the required bracket number or negative to find a lookbehind
1874 nigel 77
1875     Returns: pointer to the opcode for the bracket, or NULL if not found
1876     */
1877    
1878 ph10 756 const pcre_uchar *
1879     _pcre_find_bracket(const pcre_uchar *code, BOOL utf8, int number)
1880 nigel 77 {
1881     for (;;)
1882     {
1883     register int c = *code;
1884 ph10 618
1885 nigel 77 if (c == OP_END) return NULL;
1886 nigel 91
1887     /* XCLASS is used for classes that cannot be represented just by a bit
1888     map. This includes negated single high-valued characters. The length in
1889     the table is zero; the actual length is stored in the compiled code. */
1890    
1891     if (c == OP_XCLASS) code += GET(code, 1);
1892 ph10 461
1893 ph10 454 /* Handle recursion */
1894 ph10 461
1895 ph10 454 else if (c == OP_REVERSE)
1896     {
1897 ph10 756 if (number < 0) return (pcre_uchar *)code;
1898 ph10 454 code += _pcre_OP_lengths[c];
1899     }
1900 nigel 91
1901 nigel 93 /* Handle capturing bracket */
1902 nigel 91
1903 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
1904     c == OP_CBRAPOS || c == OP_SCBRAPOS)
1905 nigel 77 {
1906 nigel 93 int n = GET2(code, 1+LINK_SIZE);
1907 ph10 756 if (n == number) return (pcre_uchar *)code;
1908 nigel 93 code += _pcre_OP_lengths[c];
1909 nigel 77 }
1910 nigel 91
1911 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
1912     repeated character types, we have to test for \p and \P, which have an extra
1913 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1914 ph10 510 must add in its length. */
1915 nigel 91
1916 nigel 77 else
1917     {
1918 ph10 218 switch(c)
1919     {
1920     case OP_TYPESTAR:
1921     case OP_TYPEMINSTAR:
1922     case OP_TYPEPLUS:
1923     case OP_TYPEMINPLUS:
1924     case OP_TYPEQUERY:
1925     case OP_TYPEMINQUERY:
1926     case OP_TYPEPOSSTAR:
1927     case OP_TYPEPOSPLUS:
1928     case OP_TYPEPOSQUERY:
1929     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1930 ph10 220 break;
1931 ph10 221
1932     case OP_TYPEUPTO:
1933     case OP_TYPEMINUPTO:
1934     case OP_TYPEEXACT:
1935     case OP_TYPEPOSUPTO:
1936     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1937     break;
1938 ph10 512
1939 ph10 510 case OP_MARK:
1940     case OP_PRUNE_ARG:
1941     case OP_SKIP_ARG:
1942     code += code[1];
1943 ph10 512 break;
1944 ph10 550
1945     case OP_THEN_ARG:
1946 ph10 716 code += code[1];
1947 ph10 550 break;
1948 ph10 220 }
1949    
1950 ph10 218 /* Add in the fixed length from the table */
1951 ph10 220
1952 nigel 77 code += _pcre_OP_lengths[c];
1953 ph10 220
1954 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1955     a multi-byte character. The length in the table is a minimum, so we have to
1956     arrange to skip the extra bytes. */
1957 ph10 220
1958 ph10 107 #ifdef SUPPORT_UTF8
1959 nigel 77 if (utf8) switch(c)
1960     {
1961     case OP_CHAR:
1962 ph10 602 case OP_CHARI:
1963 nigel 77 case OP_EXACT:
1964 ph10 602 case OP_EXACTI:
1965 nigel 77 case OP_UPTO:
1966 ph10 602 case OP_UPTOI:
1967 nigel 77 case OP_MINUPTO:
1968 ph10 602 case OP_MINUPTOI:
1969 nigel 93 case OP_POSUPTO:
1970 ph10 602 case OP_POSUPTOI:
1971 nigel 77 case OP_STAR:
1972 ph10 602 case OP_STARI:
1973 nigel 77 case OP_MINSTAR:
1974 ph10 602 case OP_MINSTARI:
1975 nigel 93 case OP_POSSTAR:
1976 ph10 602 case OP_POSSTARI:
1977 nigel 77 case OP_PLUS:
1978 ph10 602 case OP_PLUSI:
1979 nigel 77 case OP_MINPLUS:
1980 ph10 602 case OP_MINPLUSI:
1981 nigel 93 case OP_POSPLUS:
1982 ph10 602 case OP_POSPLUSI:
1983 nigel 77 case OP_QUERY:
1984 ph10 602 case OP_QUERYI:
1985 nigel 77 case OP_MINQUERY:
1986 ph10 602 case OP_MINQUERYI:
1987 nigel 93 case OP_POSQUERY:
1988 ph10 602 case OP_POSQUERYI:
1989 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1990 nigel 77 break;
1991     }
1992 ph10 369 #else
1993     (void)(utf8); /* Keep compiler happy by referencing function argument */
1994 ph10 111 #endif
1995 nigel 77 }
1996     }
1997     }
1998    
1999    
2000    
2001     /*************************************************
2002     * Scan compiled regex for recursion reference *
2003     *************************************************/
2004    
2005     /* This little function scans through a compiled pattern until it finds an
2006     instance of OP_RECURSE.
2007    
2008     Arguments:
2009     code points to start of expression
2010     utf8 TRUE in UTF-8 mode
2011    
2012     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2013     */
2014    
2015 ph10 756 static const pcre_uchar *
2016     find_recurse(const pcre_uchar *code, BOOL utf8)
2017 nigel 77 {
2018     for (;;)
2019     {
2020     register int c = *code;
2021     if (c == OP_END) return NULL;
2022 nigel 91 if (c == OP_RECURSE) return code;
2023 ph10 220
2024 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2025     map. This includes negated single high-valued characters. The length in
2026     the table is zero; the actual length is stored in the compiled code. */
2027    
2028     if (c == OP_XCLASS) code += GET(code, 1);
2029    
2030 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2031     repeated character types, we have to test for \p and \P, which have an extra
2032 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2033 ph10 510 must add in its length. */
2034 nigel 91
2035 nigel 77 else
2036     {
2037 ph10 218 switch(c)
2038     {
2039     case OP_TYPESTAR:
2040     case OP_TYPEMINSTAR:
2041     case OP_TYPEPLUS:
2042     case OP_TYPEMINPLUS:
2043     case OP_TYPEQUERY:
2044     case OP_TYPEMINQUERY:
2045     case OP_TYPEPOSSTAR:
2046     case OP_TYPEPOSPLUS:
2047     case OP_TYPEPOSQUERY:
2048     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2049 ph10 220 break;
2050 ph10 221
2051     case OP_TYPEPOSUPTO:
2052     case OP_TYPEUPTO:
2053     case OP_TYPEMINUPTO:
2054     case OP_TYPEEXACT:
2055     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2056     break;
2057 ph10 512
2058 ph10 510 case OP_MARK:
2059     case OP_PRUNE_ARG:
2060     case OP_SKIP_ARG:
2061     code += code[1];
2062 ph10 512 break;
2063 ph10 550
2064     case OP_THEN_ARG:
2065 ph10 716 code += code[1];
2066 ph10 550 break;
2067 ph10 220 }
2068    
2069 ph10 218 /* Add in the fixed length from the table */
2070    
2071 nigel 77 code += _pcre_OP_lengths[c];
2072 ph10 220
2073 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2074     by a multi-byte character. The length in the table is a minimum, so we have
2075     to arrange to skip the extra bytes. */
2076 ph10 220
2077 ph10 107 #ifdef SUPPORT_UTF8
2078 nigel 77 if (utf8) switch(c)
2079     {
2080     case OP_CHAR:
2081 ph10 602 case OP_CHARI:
2082 nigel 77 case OP_EXACT:
2083 ph10 602 case OP_EXACTI:
2084 nigel 77 case OP_UPTO:
2085 ph10 602 case OP_UPTOI:
2086 nigel 77 case OP_MINUPTO:
2087 ph10 602 case OP_MINUPTOI:
2088 nigel 93 case OP_POSUPTO:
2089 ph10 602 case OP_POSUPTOI:
2090 nigel 77 case OP_STAR:
2091 ph10 602 case OP_STARI:
2092 nigel 77 case OP_MINSTAR:
2093 ph10 602 case OP_MINSTARI:
2094 nigel 93 case OP_POSSTAR:
2095 ph10 602 case OP_POSSTARI:
2096 nigel 77 case OP_PLUS:
2097 ph10 602 case OP_PLUSI:
2098 nigel 77 case OP_MINPLUS:
2099 ph10 602 case OP_MINPLUSI:
2100 nigel 93 case OP_POSPLUS:
2101 ph10 602 case OP_POSPLUSI:
2102 nigel 77 case OP_QUERY:
2103 ph10 602 case OP_QUERYI:
2104 nigel 77 case OP_MINQUERY:
2105 ph10 602 case OP_MINQUERYI:
2106 nigel 93 case OP_POSQUERY:
2107 ph10 602 case OP_POSQUERYI:
2108 nigel 93 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
2109 nigel 77 break;
2110     }
2111 ph10 369 #else
2112     (void)(utf8); /* Keep compiler happy by referencing function argument */
2113 ph10 111 #endif
2114 nigel 77 }
2115     }
2116     }
2117    
2118    
2119    
2120     /*************************************************
2121     * Scan compiled branch for non-emptiness *
2122     *************************************************/
2123    
2124     /* This function scans through a branch of a compiled pattern to see whether it
2125 nigel 93 can match the empty string or not. It is called from could_be_empty()
2126     below and from compile_branch() when checking for an unlimited repeat of a
2127     group that can match nothing. Note that first_significant_code() skips over
2128 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2129     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2130     bracket whose current branch will already have been scanned.
2131 nigel 77
2132     Arguments:
2133     code points to start of search
2134     endcode points to where to stop
2135     utf8 TRUE if in UTF8 mode
2136 ph10 503 cd contains pointers to tables etc.
2137 nigel 77
2138     Returns: TRUE if what is matched could be empty
2139     */
2140    
2141     static BOOL
2142 ph10 756 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2143     BOOL utf8, compile_data *cd)
2144 nigel 77 {
2145     register int c;
2146 ph10 604 for (code = first_significant_code(code + _pcre_OP_lengths[*code], TRUE);
2147 nigel 77 code < endcode;
2148 ph10 604 code = first_significant_code(code + _pcre_OP_lengths[c], TRUE))
2149 nigel 77 {
2150 ph10 756 const pcre_uchar *ccode;
2151 nigel 77
2152     c = *code;
2153 ph10 507
2154 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2155 ph10 282 first_significant_code() with a TRUE final argument. */
2156 ph10 286
2157 ph10 282 if (c == OP_ASSERT)
2158 ph10 286 {
2159 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2160     c = *code;
2161     continue;
2162 ph10 286 }
2163 ph10 172
2164 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2165 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2166     forward reference subroutine call, we can't. To detect forward reference
2167 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2168     called only when doing the real compile, not during the pre-compile that
2169 ph10 624 measures the size of the compiled pattern. */
2170 ph10 507
2171 ph10 503 if (c == OP_RECURSE)
2172     {
2173 ph10 756 const pcre_uchar *scode;
2174 ph10 624 BOOL empty_branch;
2175 ph10 654
2176 ph10 624 /* Test for forward reference */
2177 ph10 654
2178 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2179 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2180 ph10 624
2181     /* Not a forward reference, test for completed backward reference */
2182 ph10 654
2183 ph10 624 empty_branch = FALSE;
2184     scode = cd->start_code + GET(code, 1);
2185 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2186 ph10 654
2187 ph10 624 /* Completed backwards reference */
2188 ph10 654
2189 ph10 503 do
2190     {
2191 ph10 504 if (could_be_empty_branch(scode, endcode, utf8, cd))
2192     {
2193     empty_branch = TRUE;
2194 ph10 507 break;
2195     }
2196 ph10 503 scode += GET(scode, 1);
2197     }
2198     while (*scode == OP_ALT);
2199 ph10 654
2200 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2201 ph10 503 continue;
2202 ph10 507 }
2203 ph10 170
2204 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2205    
2206     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2207     c == OP_BRAPOSZERO)
2208     {
2209     code += _pcre_OP_lengths[c];
2210     do code += GET(code, 1); while (*code == OP_ALT);
2211     c = *code;
2212     continue;
2213     }
2214    
2215     /* A nested group that is already marked as "could be empty" can just be
2216     skipped. */
2217    
2218     if (c == OP_SBRA || c == OP_SBRAPOS ||
2219     c == OP_SCBRA || c == OP_SCBRAPOS)
2220     {
2221     do code += GET(code, 1); while (*code == OP_ALT);
2222     c = *code;
2223     continue;
2224     }
2225    
2226 ph10 170 /* For other groups, scan the branches. */
2227 ph10 172
2228 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2229     c == OP_CBRA || c == OP_CBRAPOS ||
2230 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2231     c == OP_COND)
2232 nigel 77 {
2233     BOOL empty_branch;
2234     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2235 ph10 406
2236     /* If a conditional group has only one branch, there is a second, implied,
2237 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2238     Otherwise, scan the individual branches of the group. */
2239 ph10 406
2240 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2241 nigel 77 code += GET(code, 1);
2242 ph10 395 else
2243 ph10 406 {
2244 ph10 395 empty_branch = FALSE;
2245     do
2246     {
2247 ph10 503 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2248 ph10 395 empty_branch = TRUE;
2249     code += GET(code, 1);
2250     }
2251     while (*code == OP_ALT);
2252     if (!empty_branch) return FALSE; /* All branches are non-empty */
2253 nigel 77 }
2254 ph10 406
2255 ph10 172 c = *code;
2256 nigel 93 continue;
2257 nigel 77 }
2258    
2259 nigel 93 /* Handle the other opcodes */
2260    
2261     switch (c)
2262 nigel 77 {
2263 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2264     cannot be represented just by a bit map. This includes negated single
2265     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2266 ph10 220 actual length is stored in the compiled code, so we must update "code"
2267 ph10 216 here. */
2268 nigel 77
2269     #ifdef SUPPORT_UTF8
2270     case OP_XCLASS:
2271 ph10 216 ccode = code += GET(code, 1);
2272 nigel 77 goto CHECK_CLASS_REPEAT;
2273     #endif
2274    
2275     case OP_CLASS:
2276     case OP_NCLASS:
2277     ccode = code + 33;
2278    
2279     #ifdef SUPPORT_UTF8
2280     CHECK_CLASS_REPEAT:
2281     #endif
2282    
2283     switch (*ccode)
2284     {
2285     case OP_CRSTAR: /* These could be empty; continue */
2286     case OP_CRMINSTAR:
2287     case OP_CRQUERY:
2288     case OP_CRMINQUERY:
2289     break;
2290    
2291     default: /* Non-repeat => class must match */
2292     case OP_CRPLUS: /* These repeats aren't empty */
2293     case OP_CRMINPLUS:
2294     return FALSE;
2295    
2296     case OP_CRRANGE:
2297     case OP_CRMINRANGE:
2298     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2299     break;
2300     }
2301     break;
2302    
2303     /* Opcodes that must match a character */
2304    
2305     case OP_PROP:
2306     case OP_NOTPROP:
2307     case OP_EXTUNI:
2308     case OP_NOT_DIGIT:
2309     case OP_DIGIT:
2310     case OP_NOT_WHITESPACE:
2311     case OP_WHITESPACE:
2312     case OP_NOT_WORDCHAR:
2313     case OP_WORDCHAR:
2314     case OP_ANY:
2315 ph10 345 case OP_ALLANY:
2316 nigel 77 case OP_ANYBYTE:
2317     case OP_CHAR:
2318 ph10 602 case OP_CHARI:
2319 nigel 77 case OP_NOT:
2320 ph10 602 case OP_NOTI:
2321 nigel 77 case OP_PLUS:
2322     case OP_MINPLUS:
2323 nigel 93 case OP_POSPLUS:
2324 nigel 77 case OP_EXACT:
2325     case OP_NOTPLUS:
2326     case OP_NOTMINPLUS:
2327 nigel 93 case OP_NOTPOSPLUS:
2328 nigel 77 case OP_NOTEXACT:
2329     case OP_TYPEPLUS:
2330     case OP_TYPEMINPLUS:
2331 nigel 93 case OP_TYPEPOSPLUS:
2332 nigel 77 case OP_TYPEEXACT:
2333     return FALSE;
2334 ph10 227
2335     /* These are going to continue, as they may be empty, but we have to
2336     fudge the length for the \p and \P cases. */
2337    
2338 ph10 224 case OP_TYPESTAR:
2339     case OP_TYPEMINSTAR:
2340     case OP_TYPEPOSSTAR:
2341     case OP_TYPEQUERY:
2342     case OP_TYPEMINQUERY:
2343     case OP_TYPEPOSQUERY:
2344     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2345 ph10 227 break;
2346    
2347 ph10 224 /* Same for these */
2348 ph10 227
2349 ph10 224 case OP_TYPEUPTO:
2350     case OP_TYPEMINUPTO:
2351     case OP_TYPEPOSUPTO:
2352     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2353     break;
2354 nigel 77
2355     /* End of branch */
2356    
2357     case OP_KET:
2358     case OP_KETRMAX:
2359     case OP_KETRMIN:
2360 ph10 604 case OP_KETRPOS:
2361 nigel 77 case OP_ALT:
2362     return TRUE;
2363    
2364 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2365     MINUPTO, and POSUPTO may be followed by a multibyte character */
2366 nigel 77
2367     #ifdef SUPPORT_UTF8
2368     case OP_STAR:
2369 ph10 602 case OP_STARI:
2370 nigel 77 case OP_MINSTAR:
2371 ph10 602 case OP_MINSTARI:
2372 nigel 93 case OP_POSSTAR:
2373 ph10 602 case OP_POSSTARI:
2374 nigel 77 case OP_QUERY:
2375 ph10 602 case OP_QUERYI:
2376 nigel 77 case OP_MINQUERY:
2377 ph10 602 case OP_MINQUERYI:
2378 nigel 93 case OP_POSQUERY:
2379 ph10 602 case OP_POSQUERYI:
2380 ph10 426 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2381     break;
2382 ph10 461
2383 nigel 77 case OP_UPTO:
2384 ph10 602 case OP_UPTOI:
2385 nigel 77 case OP_MINUPTO:
2386 ph10 602 case OP_MINUPTOI:
2387 nigel 93 case OP_POSUPTO:
2388 ph10 602 case OP_POSUPTOI:
2389 ph10 426 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2390 nigel 77 break;
2391     #endif
2392 ph10 503
2393 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2394     string. */
2395    
2396     case OP_MARK:
2397     case OP_PRUNE_ARG:
2398     case OP_SKIP_ARG:
2399     code += code[1];
2400 ph10 512 break;
2401 ph10 510
2402 ph10 550 case OP_THEN_ARG:
2403 ph10 716 code += code[1];
2404 ph10 550 break;
2405    
2406 ph10 503 /* None of the remaining opcodes are required to match a character. */
2407 ph10 507
2408 ph10 503 default:
2409 ph10 507 break;
2410 nigel 77 }
2411     }
2412    
2413     return TRUE;
2414     }
2415    
2416    
2417    
2418     /*************************************************
2419     * Scan compiled regex for non-emptiness *
2420     *************************************************/
2421    
2422     /* This function is called to check for left recursive calls. We want to check
2423     the current branch of the current pattern to see if it could match the empty
2424     string. If it could, we must look outwards for branches at other levels,
2425     stopping when we pass beyond the bracket which is the subject of the recursion.
2426 ph10 654 This function is called only during the real compile, not during the
2427 ph10 624 pre-compile.
2428 nigel 77
2429     Arguments:
2430     code points to start of the recursion
2431     endcode points to where to stop (current RECURSE item)
2432     bcptr points to the chain of current (unclosed) branch starts
2433     utf8 TRUE if in UTF-8 mode
2434 ph10 507 cd pointers to tables etc
2435 nigel 77
2436     Returns: TRUE if what is matched could be empty
2437     */
2438    
2439     static BOOL
2440 ph10 756 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2441     branch_chain *bcptr, BOOL utf8, compile_data *cd)
2442 nigel 77 {
2443 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2444 nigel 77 {
2445 ph10 503 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2446 ph10 475 return FALSE;
2447 nigel 77 bcptr = bcptr->outer;
2448     }
2449     return TRUE;
2450     }
2451    
2452    
2453    
2454     /*************************************************
2455     * Check for POSIX class syntax *
2456     *************************************************/
2457    
2458     /* This function is called when the sequence "[:" or "[." or "[=" is
2459 ph10 295 encountered in a character class. It checks whether this is followed by a
2460 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2461 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2462 nigel 77
2463 ph10 298 Originally, this function only recognized a sequence of letters between the
2464     terminators, but it seems that Perl recognizes any sequence of characters,
2465     though of course unknown POSIX names are subsequently rejected. Perl gives an
2466     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2467     didn't consider this to be a POSIX class. Likewise for [:1234:].
2468 ph10 295
2469 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2470     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2471     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2472     below handles the special case of \], but does not try to do any other escape
2473     processing. This makes it different from Perl for cases such as [:l\ower:]
2474 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2475 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2476 ph10 295 I think.
2477    
2478 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2479     It seems that the appearance of a nested POSIX class supersedes an apparent
2480     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2481 ph10 691 a digit.
2482 ph10 640
2483 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2484     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2485     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2486 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2487 ph10 661 names.
2488    
2489 ph10 295 Arguments:
2490 nigel 77 ptr pointer to the initial [
2491     endptr where to return the end pointer
2492    
2493     Returns: TRUE or FALSE
2494     */
2495    
2496     static BOOL
2497 ph10 756 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2498 nigel 77 {
2499     int terminator; /* Don't combine these lines; the Solaris cc */
2500     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2501 ph10 295 for (++ptr; *ptr != 0; ptr++)
2502 nigel 77 {
2503 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2504     ptr++;
2505 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2506 ph10 640 else
2507 ph10 298 {
2508 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2509 ph10 295 {
2510     *endptr = ptr;
2511     return TRUE;
2512 ph10 298 }
2513 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2514     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2515     ptr[1] == CHAR_EQUALS_SIGN) &&
2516     check_posix_syntax(ptr, endptr))
2517 ph10 654 return FALSE;
2518 ph10 298 }
2519     }
2520 nigel 77 return FALSE;
2521     }
2522    
2523    
2524    
2525    
2526     /*************************************************
2527     * Check POSIX class name *
2528     *************************************************/
2529    
2530     /* This function is called to check the name given in a POSIX-style class entry
2531     such as [:alnum:].
2532    
2533     Arguments:
2534     ptr points to the first letter
2535     len the length of the name
2536    
2537     Returns: a value representing the name, or -1 if unknown
2538     */
2539    
2540     static int
2541 ph10 756 check_posix_name(const pcre_uchar *ptr, int len)
2542 nigel 77 {
2543 ph10 240 const char *pn = posix_names;
2544 nigel 77 register int yield = 0;
2545     while (posix_name_lengths[yield] != 0)
2546     {
2547     if (len == posix_name_lengths[yield] &&
2548 ph10 240 strncmp((const char *)ptr, pn, len) == 0) return yield;
2549 ph10 243 pn += posix_name_lengths[yield] + 1;
2550 nigel 77 yield++;
2551     }
2552     return -1;
2553     }
2554    
2555    
2556     /*************************************************
2557     * Adjust OP_RECURSE items in repeated group *
2558     *************************************************/
2559    
2560     /* OP_RECURSE items contain an offset from the start of the regex to the group
2561     that is referenced. This means that groups can be replicated for fixed
2562     repetition simply by copying (because the recursion is allowed to refer to
2563     earlier groups that are outside the current group). However, when a group is
2564 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2565     inserted before it, after it has been compiled. This means that any OP_RECURSE
2566     items within it that refer to the group itself or any contained groups have to
2567     have their offsets adjusted. That one of the jobs of this function. Before it
2568     is called, the partially compiled regex must be temporarily terminated with
2569     OP_END.
2570 nigel 77
2571 nigel 93 This function has been extended with the possibility of forward references for
2572     recursions and subroutine calls. It must also check the list of such references
2573     for the group we are dealing with. If it finds that one of the recursions in
2574     the current group is on this list, it adjusts the offset in the list, not the
2575     value in the reference (which is a group number).
2576    
2577 nigel 77 Arguments:
2578     group points to the start of the group
2579     adjust the amount by which the group is to be moved
2580     utf8 TRUE in UTF-8 mode
2581     cd contains pointers to tables etc.
2582 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2583 nigel 77
2584     Returns: nothing
2585     */
2586    
2587     static void
2588 ph10 756 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd,
2589     pcre_uchar *save_hwm)
2590 nigel 77 {
2591 ph10 756 pcre_uchar *ptr = group;
2592 ph10 224
2593 ph10 756 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL)
2594 nigel 77 {
2595 nigel 93 int offset;
2596 ph10 756 pcre_uchar *hc;
2597 nigel 93
2598     /* See if this recursion is on the forward reference list. If so, adjust the
2599     reference. */
2600 ph10 345
2601 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2602     {
2603     offset = GET(hc, 0);
2604     if (cd->start_code + offset == ptr + 1)
2605     {
2606     PUT(hc, 0, offset + adjust);
2607     break;
2608     }
2609     }
2610    
2611     /* Otherwise, adjust the recursion offset if it's after the start of this
2612     group. */
2613    
2614     if (hc >= cd->hwm)
2615     {
2616     offset = GET(ptr, 1);
2617     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2618     }
2619    
2620 nigel 77 ptr += 1 + LINK_SIZE;
2621     }
2622     }
2623    
2624    
2625    
2626     /*************************************************
2627     * Insert an automatic callout point *
2628     *************************************************/
2629    
2630     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2631     callout points before each pattern item.
2632    
2633     Arguments:
2634     code current code pointer
2635     ptr current pattern pointer
2636     cd pointers to tables etc
2637    
2638     Returns: new code pointer
2639     */
2640    
2641 ph10 756 static pcre_uchar *
2642     auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2643 nigel 77 {
2644     *code++ = OP_CALLOUT;
2645     *code++ = 255;
2646 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2647     PUT(code, LINK_SIZE, 0); /* Default length */
2648 ph10 756 return code + 2 * LINK_SIZE;
2649 nigel 77 }
2650    
2651    
2652    
2653     /*************************************************
2654     * Complete a callout item *
2655     *************************************************/
2656    
2657     /* A callout item contains the length of the next item in the pattern, which
2658     we can't fill in till after we have reached the relevant point. This is used
2659     for both automatic and manual callouts.
2660    
2661     Arguments:
2662     previous_callout points to previous callout item
2663     ptr current pattern pointer
2664     cd pointers to tables etc
2665    
2666     Returns: nothing
2667     */
2668    
2669     static void
2670 ph10 756 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2671 nigel 77 {
2672 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2673 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2674     }
2675    
2676    
2677    
2678     #ifdef SUPPORT_UCP
2679     /*************************************************
2680     * Get othercase range *
2681     *************************************************/
2682    
2683     /* This function is passed the start and end of a class range, in UTF-8 mode
2684     with UCP support. It searches up the characters, looking for internal ranges of
2685     characters in the "other" case. Each call returns the next one, updating the
2686     start address.
2687    
2688     Arguments:
2689     cptr points to starting character value; updated
2690     d end value
2691     ocptr where to put start of othercase range
2692     odptr where to put end of othercase range
2693    
2694     Yield: TRUE when range returned; FALSE when no more
2695     */
2696    
2697     static BOOL
2698 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2699     unsigned int *odptr)
2700 nigel 77 {
2701 nigel 93 unsigned int c, othercase, next;
2702 nigel 77
2703     for (c = *cptr; c <= d; c++)
2704 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2705 nigel 77
2706     if (c > d) return FALSE;
2707    
2708     *ocptr = othercase;
2709     next = othercase + 1;
2710    
2711     for (++c; c <= d; c++)
2712     {
2713 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2714 nigel 77 next++;
2715     }
2716    
2717     *odptr = next - 1;
2718     *cptr = c;
2719    
2720     return TRUE;
2721     }
2722 ph10 532
2723    
2724    
2725     /*************************************************
2726     * Check a character and a property *
2727     *************************************************/
2728    
2729     /* This function is called by check_auto_possessive() when a property item
2730     is adjacent to a fixed character.
2731    
2732     Arguments:
2733     c the character
2734     ptype the property type
2735     pdata the data for the type
2736     negated TRUE if it's a negated property (\P or \p{^)
2737 ph10 535
2738 ph10 532 Returns: TRUE if auto-possessifying is OK
2739 ph10 535 */
2740 ph10 532
2741     static BOOL
2742     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2743     {
2744     const ucd_record *prop = GET_UCD(c);
2745     switch(ptype)
2746     {
2747     case PT_LAMP:
2748     return (prop->chartype == ucp_Lu ||
2749     prop->chartype == ucp_Ll ||
2750     prop->chartype == ucp_Lt) == negated;
2751    
2752     case PT_GC:
2753     return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2754    
2755     case PT_PC:
2756     return (pdata == prop->chartype) == negated;
2757    
2758     case PT_SC:
2759     return (pdata == prop->script) == negated;
2760    
2761     /* These are specials */
2762    
2763     case PT_ALNUM:
2764     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2765     _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2766    
2767     case PT_SPACE: /* Perl space */
2768     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2769     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2770     == negated;
2771    
2772     case PT_PXSPACE: /* POSIX space */
2773     return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2774     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2775     c == CHAR_FF || c == CHAR_CR)
2776     == negated;
2777    
2778     case PT_WORD:
2779     return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2780     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2781     c == CHAR_UNDERSCORE) == negated;
2782     }
2783 ph10 535 return FALSE;
2784 ph10 532 }
2785 nigel 77 #endif /* SUPPORT_UCP */
2786    
2787    
2788 nigel 93
2789 nigel 77 /*************************************************
2790 nigel 93 * Check if auto-possessifying is possible *
2791     *************************************************/
2792    
2793     /* This function is called for unlimited repeats of certain items, to see
2794     whether the next thing could possibly match the repeated item. If not, it makes
2795     sense to automatically possessify the repeated item.
2796    
2797     Arguments:
2798 ph10 532 previous pointer to the repeated opcode
2799 nigel 93 utf8 TRUE in UTF-8 mode
2800     ptr next character in pattern
2801     options options bits
2802     cd contains pointers to tables etc.
2803    
2804     Returns: TRUE if possessifying is wanted
2805     */
2806    
2807     static BOOL
2808 ph10 756 check_auto_possessive(const pcre_uchar *previous, BOOL utf8,
2809     const pcre_uchar *ptr, int options, compile_data *cd)
2810 nigel 93 {
2811 ph10 532 int c, next;
2812     int op_code = *previous++;
2813 nigel 93
2814     /* Skip whitespace and comments in extended mode */
2815    
2816     if ((options & PCRE_EXTENDED) != 0)
2817     {
2818     for (;;)
2819     {
2820     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2821 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2822 nigel 93 {
2823 ph10 579 ptr++;
2824 ph10 556 while (*ptr != 0)
2825     {
2826 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2827 ph10 556 ptr++;
2828 ph10 579 #ifdef SUPPORT_UTF8
2829 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2830     #endif
2831     }
2832 nigel 93 }
2833     else break;
2834     }
2835     }
2836    
2837     /* If the next item is one that we can handle, get its value. A non-negative
2838     value is a character, a negative value is an escape value. */
2839    
2840 ph10 391 if (*ptr == CHAR_BACKSLASH)
2841 nigel 93 {
2842     int temperrorcode = 0;
2843     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2844     if (temperrorcode != 0) return FALSE;
2845     ptr++; /* Point after the escape sequence */
2846     }
2847    
2848     else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2849     {
2850     #ifdef SUPPORT_UTF8
2851     if (utf8) { GETCHARINC(next, ptr); } else
2852     #endif
2853     next = *ptr++;
2854     }
2855    
2856     else return FALSE;
2857    
2858     /* Skip whitespace and comments in extended mode */
2859    
2860     if ((options & PCRE_EXTENDED) != 0)
2861     {
2862     for (;;)
2863     {
2864     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2865 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2866 nigel 93 {
2867 ph10 579 ptr++;
2868 ph10 556 while (*ptr != 0)
2869     {
2870 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2871 ph10 556 ptr++;
2872 ph10 579 #ifdef SUPPORT_UTF8
2873 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2874     #endif
2875     }
2876 nigel 93 }
2877     else break;
2878     }
2879     }
2880    
2881     /* If the next thing is itself optional, we have to give up. */
2882    
2883 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2884 ph10 391 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2885     return FALSE;
2886 nigel 93
2887 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
2888     the next item is a character. */
2889 nigel 93
2890     if (next >= 0) switch(op_code)
2891     {
2892     case OP_CHAR:
2893 ph10 535 #ifdef SUPPORT_UTF8
2894 ph10 532 GETCHARTEST(c, previous);
2895 ph10 369 #else
2896 ph10 532 c = *previous;
2897 ph10 535 #endif
2898     return c != next;
2899 nigel 93
2900 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
2901 nigel 93 Unicode property support, we can use it to test the other case of
2902     high-valued characters. */
2903    
2904 ph10 602 case OP_CHARI:
2905 ph10 535 #ifdef SUPPORT_UTF8
2906 ph10 532 GETCHARTEST(c, previous);
2907     #else
2908     c = *previous;
2909 ph10 535 #endif
2910 ph10 532 if (c == next) return FALSE;
2911 nigel 93 #ifdef SUPPORT_UTF8
2912     if (utf8)
2913     {
2914     unsigned int othercase;
2915     if (next < 128) othercase = cd->fcc[next]; else
2916     #ifdef SUPPORT_UCP
2917 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
2918 nigel 93 #else
2919     othercase = NOTACHAR;
2920     #endif
2921 ph10 532 return (unsigned int)c != othercase;
2922 nigel 93 }
2923     else
2924     #endif /* SUPPORT_UTF8 */
2925 ph10 532 return (c != cd->fcc[next]); /* Non-UTF-8 mode */
2926 nigel 93
2927 ph10 602 /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
2928 ph10 604 opcodes are not used for multi-byte characters, because they are coded using
2929 ph10 602 an XCLASS instead. */
2930 nigel 93
2931     case OP_NOT:
2932 ph10 602 return (c = *previous) == next;
2933 ph10 604
2934     case OP_NOTI:
2935 ph10 532 if ((c = *previous) == next) return TRUE;
2936 nigel 93 #ifdef SUPPORT_UTF8
2937     if (utf8)
2938     {
2939     unsigned int othercase;
2940     if (next < 128) othercase = cd->fcc[next]; else
2941     #ifdef SUPPORT_UCP
2942 ph10 349 othercase = UCD_OTHERCASE(next);
2943 nigel 93 #else
2944     othercase = NOTACHAR;
2945     #endif
2946 ph10 532 return (unsigned int)c == othercase;
2947 nigel 93 }
2948     else
2949     #endif /* SUPPORT_UTF8 */
2950 ph10 532 return (c == cd->fcc[next]); /* Non-UTF-8 mode */
2951 nigel 93
2952 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2953     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2954    
2955 nigel 93 case OP_DIGIT:
2956     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2957    
2958     case OP_NOT_DIGIT:
2959     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2960    
2961     case OP_WHITESPACE:
2962     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2963    
2964     case OP_NOT_WHITESPACE:
2965     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2966    
2967     case OP_WORDCHAR:
2968     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2969    
2970     case OP_NOT_WORDCHAR:
2971     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2972    
2973 ph10 180 case OP_HSPACE:
2974     case OP_NOT_HSPACE:
2975     switch(next)
2976     {
2977     case 0x09:
2978     case 0x20:
2979     case 0xa0:
2980     case 0x1680:
2981     case 0x180e:
2982     case 0x2000:
2983     case 0x2001:
2984     case 0x2002:
2985     case 0x2003:
2986     case 0x2004:
2987     case 0x2005:
2988     case 0x2006:
2989     case 0x2007:
2990     case 0x2008:
2991     case 0x2009:
2992     case 0x200A:
2993     case 0x202f:
2994     case 0x205f:
2995     case 0x3000:
2996 ph10 528 return op_code == OP_NOT_HSPACE;
2997 ph10 180 default:
2998 ph10 528 return op_code != OP_NOT_HSPACE;
2999 ph10 180 }
3000    
3001 ph10 528 case OP_ANYNL:
3002 ph10 180 case OP_VSPACE:
3003     case OP_NOT_VSPACE:
3004     switch(next)
3005     {
3006     case 0x0a:
3007     case 0x0b:
3008     case 0x0c:
3009     case 0x0d:
3010     case 0x85:
3011     case 0x2028:
3012     case 0x2029:
3013 ph10 528 return op_code == OP_NOT_VSPACE;
3014 ph10 180 default:
3015 ph10 528 return op_code != OP_NOT_VSPACE;
3016 ph10 180 }
3017    
3018 ph10 532 #ifdef SUPPORT_UCP
3019     case OP_PROP:
3020     return check_char_prop(next, previous[0], previous[1], FALSE);
3021 ph10 535
3022 ph10 532 case OP_NOTPROP:
3023     return check_char_prop(next, previous[0], previous[1], TRUE);
3024     #endif
3025    
3026 nigel 93 default:
3027     return FALSE;
3028     }
3029    
3030    
3031 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3032     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3033     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3034     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3035 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3036 nigel 93
3037     switch(op_code)
3038     {
3039     case OP_CHAR:
3040 ph10 602 case OP_CHARI:
3041 ph10 535 #ifdef SUPPORT_UTF8
3042 ph10 532 GETCHARTEST(c, previous);
3043     #else
3044     c = *previous;
3045 ph10 535 #endif
3046 nigel 93 switch(-next)
3047     {
3048     case ESC_d:
3049 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3050 nigel 93
3051     case ESC_D:
3052 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3053 nigel 93
3054     case ESC_s:
3055 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3056 nigel 93
3057     case ESC_S:
3058 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3059 nigel 93
3060     case ESC_w:
3061 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3062 nigel 93
3063     case ESC_W:
3064 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3065 ph10 182
3066 ph10 180 case ESC_h:
3067     case ESC_H:
3068 ph10 532 switch(c)
3069 ph10 180 {
3070     case 0x09:
3071     case 0x20:
3072     case 0xa0:
3073     case 0x1680:
3074     case 0x180e:
3075     case 0x2000:
3076     case 0x2001:
3077     case 0x2002:
3078     case 0x2003:
3079     case 0x2004:
3080     case 0x2005:
3081     case 0x2006:
3082     case 0x2007:
3083     case 0x2008:
3084     case 0x2009:
3085     case 0x200A:
3086     case 0x202f:
3087     case 0x205f:
3088     case 0x3000:
3089     return -next != ESC_h;
3090     default:
3091     return -next == ESC_h;
3092 ph10 182 }
3093    
3094 ph10 180 case ESC_v:
3095     case ESC_V:
3096 ph10 532 switch(c)
3097 ph10 180 {
3098     case 0x0a:
3099     case 0x0b:
3100     case 0x0c:
3101     case 0x0d:
3102     case 0x85:
3103     case 0x2028:
3104     case 0x2029:
3105     return -next != ESC_v;
3106     default:
3107     return -next == ESC_v;
3108 ph10 182 }
3109 ph10 535
3110     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3111     their substitutions and process them. The result will always be either
3112 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3113 ph10 535
3114 ph10 532 #ifdef SUPPORT_UCP
3115     case ESC_du:
3116     case ESC_DU:
3117     case ESC_wu:
3118     case ESC_WU:
3119     case ESC_su:
3120     case ESC_SU:
3121     {
3122     int temperrorcode = 0;
3123     ptr = substitutes[-next - ESC_DU];
3124     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3125     if (temperrorcode != 0) return FALSE;
3126     ptr++; /* For compatibility */
3127     }
3128 ph10 535 /* Fall through */
3129 nigel 93
3130 ph10 532 case ESC_p:
3131     case ESC_P:
3132     {
3133     int ptype, pdata, errorcodeptr;
3134 ph10 535 BOOL negated;
3135    
3136 ph10 532 ptr--; /* Make ptr point at the p or P */
3137     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3138     if (ptype < 0) return FALSE;
3139     ptr++; /* Point past the final curly ket */
3140 ph10 535
3141 ph10 532 /* If the property item is optional, we have to give up. (When generated
3142     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3143     to the original \d etc. At this point, ptr will point to a zero byte. */
3144 ph10 535
3145 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3146     strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3147     return FALSE;
3148 ph10 535
3149 ph10 532 /* Do the property check. */
3150 ph10 535
3151 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3152 ph10 535 }
3153 ph10 532 #endif
3154    
3155 nigel 93 default:
3156     return FALSE;
3157     }
3158    
3159 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3160     well. It means re-organizing the above code so as to get hold of the property
3161     values before switching on the op-code. However, I wonder how many patterns
3162     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3163     these op-codes are never generated.) */
3164    
3165 nigel 93 case OP_DIGIT:
3166 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3167 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3168 nigel 93
3169     case OP_NOT_DIGIT:
3170     return next == -ESC_d;
3171    
3172     case OP_WHITESPACE:
3173 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3174 nigel 93
3175     case OP_NOT_WHITESPACE:
3176 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3177 nigel 93
3178 ph10 180 case OP_HSPACE:
3179 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3180 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3181 ph10 180
3182     case OP_NOT_HSPACE:
3183     return next == -ESC_h;
3184 ph10 182
3185 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3186 ph10 535 case OP_ANYNL:
3187 ph10 182 case OP_VSPACE:
3188 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3189    
3190     case OP_NOT_VSPACE:
3191 ph10 528 return next == -ESC_v || next == -ESC_R;
3192 ph10 180
3193 nigel 93 case OP_WORDCHAR:
3194 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3195 ph10 528 next == -ESC_v || next == -ESC_R;
3196 nigel 93
3197     case OP_NOT_WORDCHAR:
3198     return next == -ESC_w || next == -ESC_d;
3199 ph10 182
3200 nigel 93 default:
3201     return FALSE;
3202     }
3203    
3204     /* Control does not reach here */
3205     }
3206    
3207    
3208    
/*************************************************
*           Compile one branch                   *
*************************************************/

/* Scan the pattern, compiling it into a vector. If the options are
changed during the branch, the pointer is used to change the external options
bits. This function is used during the pre-compile phase when we are trying
to find out the amount of memory needed, as well as during the real compile
phase. The value of lengthptr distinguishes the two phases.

Arguments:
  optionsptr     pointer to the option bits
  codeptr        points to the pointer to the current code point
  ptrptr         points to the current pattern pointer
  errorcodeptr   points to error code variable
  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
  reqbyteptr     set to the last literal character required, else < 0
  bcptr          points to current branch chain
  cond_depth     conditional nesting depth
  cd             contains pointers to tables etc.
  lengthptr      NULL during the real compile phase
                 points to length accumulator during pre-compile phase

Returns:         TRUE on success
                 FALSE, with *errorcodeptr set non-zero on error
*/
3235    
3236     static BOOL
3237 ph10 756 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3238     const pcre_uchar **ptrptr, int *errorcodeptr, int *firstbyteptr,
3239     int *reqbyteptr, branch_chain *bcptr, int cond_depth, compile_data *cd,
3240     int *lengthptr)
3241 nigel 77 {
3242     int repeat_type, op_type;
3243     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3244     int bravalue = 0;
3245     int greedy_default, greedy_non_default;
3246     int firstbyte, reqbyte;
3247     int zeroreqbyte, zerofirstbyte;
3248     int req_caseopt, reqvary, tempreqvary;
3249 ph10 635 int options = *optionsptr; /* May change dynamically */
3250 nigel 77 int after_manual_callout = 0;
3251 nigel 93 int length_prevgroup = 0;
3252 nigel 77 register int c;
3253 ph10 756 register pcre_uchar *code = *codeptr;
3254     pcre_uchar *last_code = code;
3255     pcre_uchar *orig_code = code;
3256     pcre_uchar *tempcode;
3257 nigel 77 BOOL inescq = FALSE;
3258     BOOL groupsetfirstbyte = FALSE;
3259 ph10 756 const pcre_uchar *ptr = *ptrptr;
3260     const pcre_uchar *tempptr;
3261     const pcre_uchar *nestptr = NULL;
3262     pcre_uchar *previous = NULL;
3263     pcre_uchar *previous_callout = NULL;
3264     pcre_uchar *save_hwm = NULL;
3265     pcre_uchar classbits[32];
3266 nigel 77
3267 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3268 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3269 ph10 635 dynamically as we process the pattern. */
3270    
3271 nigel 77 #ifdef SUPPORT_UTF8
3272     BOOL class_utf8;
3273     BOOL utf8 = (options & PCRE_UTF8) != 0;
3274 ph10 756 pcre_uint8 *class_utf8data;
3275     pcre_uint8 *class_utf8data_base;
3276     pcre_uint8 utf8_char[6];
3277 nigel 77 #else
3278     BOOL utf8 = FALSE;
3279     #endif
3280    
3281 ph10 475 #ifdef PCRE_DEBUG
3282 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3283     #endif
3284    
3285 nigel 77 /* Set up the default and non-default settings for greediness */
3286    
3287     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3288     greedy_non_default = greedy_default ^ 1;
3289    
3290     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3291     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3292     matches a non-fixed char first char; reqbyte just remains unset if we never
3293     find one.
3294    
3295     When we hit a repeat whose minimum is zero, we may have to adjust these values
3296     to take the zero repeat into account. This is implemented by setting them to
3297     zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3298     item types that can be repeated set these backoff variables appropriately. */
3299    
3300     firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3301    
3302     /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3303     according to the current setting of the caseless flag. REQ_CASELESS is a bit
3304     value > 255. It is added into the firstbyte or reqbyte variables to record the
3305     case status of the value. This is used only for ASCII characters. */
3306    
3307     req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3308    
3309     /* Switch on next character until the end of the branch */
3310    
3311     for (;; ptr++)
3312     {
3313     BOOL negate_class;
3314 ph10 286 BOOL should_flip_negation;
3315 nigel 77 BOOL possessive_quantifier;
3316     BOOL is_quantifier;
3317 nigel 93 BOOL is_recurse;
3318 ph10 180 BOOL reset_bracount;
3319 nigel 77 int class_charcount;
3320     int class_lastchar;
3321     int newoptions;
3322     int recno;
3323 ph10 172 int refsign;
3324 nigel 77 int skipbytes;
3325     int subreqbyte;
3326     int subfirstbyte;
3327 nigel 93 int terminator;
3328 nigel 77 int mclength;
3329 ph10 733 int tempbracount;
3330 ph10 756 pcre_uchar mcbuffer[8];
3331 nigel 77
3332 nigel 93 /* Get next byte in the pattern */
3333 nigel 77
3334     c = *ptr;
3335 ph10 345
3336 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3337 ph10 518 string. Nesting only happens one level deep. */
3338    
3339     if (c == 0 && nestptr != NULL)
3340     {
3341     ptr = nestptr;
3342     nestptr = NULL;
3343     c = *ptr;
3344     }
3345    
3346 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3347     previous cycle of this loop. */
3348    
3349     if (lengthptr != NULL)
3350     {
3351 ph10 475 #ifdef PCRE_DEBUG
3352 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3353     #endif
3354 ph10 505 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */
3355 nigel 93 {
3356     *errorcodeptr = ERR52;
3357     goto FAILED;
3358     }
3359    
3360     /* There is at least one situation where code goes backwards: this is the
3361     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3362     the class is simply eliminated. However, it is created first, so we have to
3363     allow memory for it. Therefore, don't ever reduce the length at this point.
3364     */
3365    
3366     if (code < last_code) code = last_code;
3367 ph10 202
3368     /* Paranoid check for integer overflow */
3369    
3370     if (OFLOW_MAX - *lengthptr < code - last_code)
3371     {
3372     *errorcodeptr = ERR20;
3373     goto FAILED;
3374     }
3375    
3376 ph10 530 *lengthptr += (int)(code - last_code);
3377 ph10 751 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code),
3378     c));
3379 nigel 93
3380     /* If "previous" is set and it is not at the start of the work space, move
3381     it back to there, in order to avoid filling up the work space. Otherwise,
3382     if "previous" is NULL, reset the current code pointer to the start. */
3383    
3384     if (previous != NULL)
3385     {
3386     if (previous > orig_code)
3387     {
3388     memmove(orig_code, previous, code - previous);
3389     code -= previous - orig_code;
3390     previous = orig_code;
3391     }
3392     }
3393     else code = orig_code;
3394    
3395     /* Remember where this code item starts so we can pick up the length
3396     next time round. */
3397    
3398     last_code = code;
3399     }
3400    
3401     /* In the real compile phase, just check the workspace used by the forward
3402     reference list. */
3403    
3404 ph10 505 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3405 nigel 93 {
3406     *errorcodeptr = ERR52;
3407     goto FAILED;
3408     }
3409    
3410 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3411    
3412     if (inescq && c != 0)
3413     {
3414 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3415 nigel 77 {
3416     inescq = FALSE;
3417     ptr++;
3418     continue;
3419     }
3420     else
3421     {
3422     if (previous_callout != NULL)
3423     {
3424 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3425     complete_callout(previous_callout, ptr, cd);
3426 nigel 77 previous_callout = NULL;
3427     }
3428     if ((options & PCRE_AUTO_CALLOUT) != 0)
3429     {
3430     previous_callout = code;
3431     code = auto_callout(code, ptr, cd);
3432     }
3433     goto NORMAL_CHAR;
3434     }
3435     }
3436    
3437     /* Fill in length of a previous callout, except when the next thing is
3438     a quantifier. */
3439    
3440 ph10 392 is_quantifier =
3441 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3442     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3443 nigel 77
3444     if (!is_quantifier && previous_callout != NULL &&
3445     after_manual_callout-- <= 0)
3446     {
3447 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3448     complete_callout(previous_callout, ptr, cd);
3449 nigel 77 previous_callout = NULL;
3450     }
3451    
3452 ph10 635 /* In extended mode, skip white space and comments. */
3453 nigel 77
3454     if ((options & PCRE_EXTENDED) != 0)
3455     {
3456     if ((cd->ctypes[c] & ctype_space) != 0) continue;
3457 ph10 391 if (c == CHAR_NUMBER_SIGN)
3458 nigel 77 {
3459 ph10 579 ptr++;
3460 ph10 556 while (*ptr != 0)
3461 nigel 91 {
3462 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3463 ph10 556 ptr++;
3464 ph10 579 #ifdef SUPPORT_UTF8
3465 ph10 556 if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3466     #endif
3467 nigel 91 }
3468 nigel 93 if (*ptr != 0) continue;
3469    
3470 nigel 91 /* Else fall through to handle end of string */
3471     c = 0;
3472 nigel 77 }
3473     }
3474    
3475     /* No auto callout for quantifiers. */
3476    
3477     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3478     {
3479     previous_callout = code;
3480     code = auto_callout(code, ptr, cd);
3481     }
3482    
3483     switch(c)
3484     {
3485 nigel 93 /* ===================================================================*/
3486     case 0: /* The branch terminates at string end */
3487 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3488     case CHAR_RIGHT_PARENTHESIS:
3489 nigel 77 *firstbyteptr = firstbyte;
3490     *reqbyteptr = reqbyte;
3491     *codeptr = code;
3492     *ptrptr = ptr;
3493 nigel 93 if (lengthptr != NULL)
3494     {
3495 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3496     {
3497     *errorcodeptr = ERR20;
3498     goto FAILED;
3499     }
3500 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3501 nigel 93 DPRINTF((">> end branch\n"));
3502     }
3503 nigel 77 return TRUE;
3504    
3505 nigel 93
3506     /* ===================================================================*/
3507 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3508     the setting of any following char as a first character. */
3509    
3510 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3511 ph10 602 previous = NULL;
3512 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3513     {
3514     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3515 ph10 602 *code++ = OP_CIRCM;
3516 nigel 77 }
3517 ph10 602 else *code++ = OP_CIRC;
3518 nigel 77 break;
3519    
3520 ph10 391 case CHAR_DOLLAR_SIGN:
3521 nigel 77 previous = NULL;
3522 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3523 nigel 77 break;
3524    
3525     /* There can never be a first char if '.' is first, whatever happens about
3526     repeats. The value of reqbyte doesn't change either. */
3527    
3528 ph10 391 case CHAR_DOT:
3529 nigel 77 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3530     zerofirstbyte = firstbyte;
3531     zeroreqbyte = reqbyte;
3532     previous = code;
3533 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3534 nigel 77 break;
3535    
3536 nigel 93
3537     /* ===================================================================*/
3538 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3539     32-byte bitmap of the permitted characters, except in the special case
3540     where there is only one such character. For negated classes, we build the
3541     map as usual, then invert it at the end. However, we use a different opcode
3542     so that data characters > 255 can be handled correctly.
3543 nigel 77
3544     If the class contains characters outside the 0-255 range, a different
3545     opcode is compiled. It may optionally have a bit map for characters < 256,
3546     but those above are explicitly listed afterwards. A flag byte tells
3547     whether the bitmap is present, and whether this is a negated class or not.
3548 ph10 345
3549 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3550     default (Perl) mode, it is treated as a data character. */
3551 ph10 345
3552 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3553 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3554     {
3555     *errorcodeptr = ERR64;
3556 ph10 345 goto FAILED;
3557 ph10 336 }
3558 ph10 345 goto NORMAL_CHAR;
3559 nigel 77
3560 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3561 nigel 77 previous = code;
3562    
3563     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3564     they are encountered at the top level, so we'll do that too. */
3565    
3566 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3567 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3568 ph10 295 check_posix_syntax(ptr, &tempptr))
3569 nigel 77 {
3570 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3571 nigel 77 goto FAILED;
3572     }
3573    
3574 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3575 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3576 ph10 205 skip them too. This makes for compatibility with Perl. */
3577 ph10 208
3578 ph10 205 negate_class = FALSE;
3579     for (;;)
3580 nigel 77 {
3581     c = *(++ptr);
3582 ph10 391 if (c == CHAR_BACKSLASH)
3583 ph10 205 {
3584 ph10 392 if (ptr[1] == CHAR_E)
3585 ph10 391 ptr++;
3586 ph10 392 else if (strncmp((const char *)ptr+1,
3587     STR_Q STR_BACKSLASH STR_E, 3) == 0)
3588 ph10 391 ptr += 3;
3589 ph10 392 else
3590 ph10 391 break;
3591 ph10 205 }
3592 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3593 ph10 205 negate_class = TRUE;
3594     else break;
3595 ph10 208 }
3596 ph10 345
3597     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3598     an initial ']' is taken as a data character -- the code below handles
3599 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3600     [^] must match any character, so generate OP_ALLANY. */
3601 ph10 345
3602 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3603 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3604 ph10 341 {
3605     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3606     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3607     zerofirstbyte = firstbyte;
3608     break;
3609 ph10 345 }
3610 nigel 77
3611 ph10 286 /* If a class contains a negative special such as \S, we need to flip the
3612     negation flag at the end, so that support for characters > 255 works
3613 ph10 264 correctly (they are all included in the class). */
3614    
3615     should_flip_negation = FALSE;
3616    
3617 nigel 77 /* Keep a count of chars with values < 256 so that we can optimize the case
3618 nigel 93 of just a single character (as long as it's < 256). However, for higher
3619     valued UTF-8 characters, we don't yet do any optimization. */
3620 nigel 77
3621     class_charcount = 0;
3622     class_lastchar = -1;
3623    
3624 nigel 93 /* Initialize the 32-char bit map to all zeros. We build the map in a
3625     temporary bit of memory, in case the class contains only 1 character (less
3626     than 256), because in that case the compiled code doesn't use the bit map.
3627     */
3628    
3629 ph10 756 memset(classbits, 0, 32 * sizeof(pcre_uint8));
3630 nigel 93
3631 nigel 77 #ifdef SUPPORT_UTF8
3632     class_utf8 = FALSE; /* No chars >= 256 */
3633 nigel 93 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
3634 ph10 309 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
3635 nigel 77 #endif
3636    
3637     /* Process characters until ] is reached. By writing this as a "do" it
3638 nigel 93 means that an initial ] is taken as a data character. At the start of the
3639     loop, c contains the first byte of the character. */
3640 nigel 77
3641 nigel 93 if (c != 0) do
3642 nigel 77 {
3643 ph10 756 const pcre_uchar *oldptr;
3644 nigel 93
3645 nigel 77 #ifdef SUPPORT_UTF8
3646     if (utf8 && c > 127)
3647     { /* Braces are required because the */
3648     GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
3649     }
3650 ph10 535
3651 ph10 300 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3652 ph10 309 data and reset the pointer. This is so that very large classes that
3653 ph10 300 contain a zillion UTF-8 characters no longer overwrite the work space
3654 ph10 309 (which is on the stack). */
3655    
3656 ph10 300 if (lengthptr != NULL)
3657     {
3658     *lengthptr += class_utf8data - class_utf8data_base;
3659 ph10 309 class_utf8data = class_utf8data_base;
3660     }
3661    
3662 nigel 77 #endif
3663    
3664     /* Inside \Q...\E everything is literal except \E */
3665    
3666     if (inescq)
3667     {
3668 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */
3669 nigel 77 {
3670 nigel 93 inescq = FALSE; /* Reset literal state */
3671     ptr++; /* Skip the 'E' */
3672     continue; /* Carry on with next */
3673 nigel 77 }
3674 nigel 93 goto CHECK_RANGE; /* Could be range if \E follows */
3675 nigel 77 }
3676    
3677     /* Handle POSIX class names. Perl allows a negation extension of the
3678     form [:^name:]. A square bracket that doesn't match the syntax is
3679     treated as a literal. We also recognize the POSIX constructions
3680     [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3681     5.6 and 5.8 do. */
3682    
3683 ph10 391 if (c == CHAR_LEFT_SQUARE_BRACKET &&
3684 ph10 392 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3685 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3686 nigel 77 {
3687     BOOL local_negate = FALSE;
3688 nigel 87 int posix_class, taboffset, tabopt;
3689 ph10 756 register const pcre_uint8 *cbits = cd->cbits;
3690     pcre_uint8 pbits[32];
3691 nigel 77
3692 ph10 391 if (ptr[1] != CHAR_COLON)
3693 nigel 77 {
3694     *errorcodeptr = ERR31;
3695     goto FAILED;
3696     }
3697    
3698     ptr += 2;
3699 ph10 391 if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3700 nigel 77 {
3701     local_negate = TRUE;
3702 ph10 286 should_flip_negation = TRUE; /* Note negative special */
3703 nigel 77 ptr++;
3704     }
3705    
3706 ph10 530 posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3707 nigel 77 if (posix_class < 0)
3708     {
3709     *errorcodeptr = ERR30;
3710     goto FAILED;
3711     }
3712    
3713     /* If matching is caseless, upper and lower are converted to
3714     alpha. This relies on the fact that the class table starts with
3715     alpha, lower, upper as the first 3 entries. */
3716    
3717     if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3718     posix_class = 0;
3719 ph10 535
3720     /* When PCRE_UCP is set, some of the POSIX classes are converted to
3721 ph10 518 different escape sequences that use Unicode properties. */
3722 ph10 535
3723 ph10 518 #ifdef SUPPORT_UCP
3724     if ((options & PCRE_UCP) != 0)
3725     {
3726     int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3727     if (posix_substitutes[pc] != NULL)
3728     {
3729 ph10 535 nestptr = tempptr + 1;
3730 ph10 518 ptr = posix_substitutes[pc] - 1;
3731 ph10 535 continue;
3732     }
3733     }
3734     #endif
3735 ph10 518 /* In the non-UCP case, we build the bit map for the POSIX class in a
3736     chunk of local store because we may be adding and subtracting from it,
3737     and we don't want to subtract bits that may be in the main map already.
3738     At the end we or the result into the bit map that is being built. */
3739 nigel 77
3740     posix_class *= 3;
3741 nigel 87
3742     /* Copy in the first table (always present) */
3743    
3744     memcpy(pbits, cbits + posix_class_maps[posix_class],
3745 ph10 756 32 * sizeof(pcre_uint8));
3746 nigel 87
3747     /* If there is a second table, add or remove it as required. */
3748    
3749     taboffset = posix_class_maps[posix_class + 1];
3750     tabopt = posix_class_maps[posix_class + 2];
3751    
3752     if (taboffset >= 0)
3753 nigel 77 {
3754 nigel 87 if (tabopt >= 0)
3755     for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3756 nigel 77 else
3757 nigel 87 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3758 nigel 77 }
3759    
3760 nigel 87 /* Now see if we need to remove any special characters. An option
3761     value of 1 removes vertical space and 2 removes underscore. */
3762    
3763     if (tabopt < 0) tabopt = -tabopt;
3764     if (tabopt == 1) pbits[1] &= ~0x3c;
3765     else if (tabopt == 2) pbits[11] &= 0x7f;
3766    
3767     /* Add the POSIX table or its complement into the main table that is
3768     being built and we are done. */
3769    
3770     if (local_negate)
3771     for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3772     else
3773     for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3774    
3775 nigel 77 ptr = tempptr + 1;
3776     class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3777     continue; /* End of POSIX syntax handling */
3778     }
3779    
3780     /* Backslash may introduce a single character, or it may introduce one
3781 nigel 93 of the specials, which just set a flag. The sequence \b is a special
3782 ph10 513 case. Inside a class (and only there) it is treated as backspace. We
3783     assume that other escapes have more than one character in them, so set
3784     class_charcount bigger than one. Unrecognized escapes fall through and
3785     are either treated as literal characters (by default), or are faulted if
3786     PCRE_EXTRA is set. */
3787 nigel 77
3788 ph10 391 if (c == CHAR_BACKSLASH)
3789 nigel 77 {
3790 nigel 93 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3791     if (*errorcodeptr != 0) goto FAILED;
3792 nigel 77
3793 ph10 513 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
3794 nigel 77 else if (-c == ESC_Q) /* Handle start of quoted string */
3795     {
3796 ph10 391 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3797 nigel 77 {
3798     ptr += 2; /* avoid empty string */
3799     }
3800     else inescq = TRUE;
3801     continue;
3802     }
3803 ph10 220 else if (-c == ESC_E) continue; /* Ignore orphan \E */