/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 932 - (hide annotations) (download)
Fri Feb 24 18:54:43 2012 UTC (14 months, 4 weeks ago) by ph10
File MIME type: text/plain
File size: 269244 byte(s)
Add support for PCRE_INFO_MAXLOOKBEHIND.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 ph10 836 /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57     is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 ph10 842 library. We do not need to select pcre16_printint.c specially, because the
59 ph10 836 COMPILE_PCREx macro will already be appropriately set. */
60 nigel 85
61 ph10 475 #ifdef PCRE_DEBUG
62 ph10 836 /* pcre_printint.c should not include any headers */
63     #define PCRE_INCLUDED
64     #include "pcre_printint.c"
65     #undef PCRE_INCLUDED
66 nigel 85 #endif
67    
68    
/* Macro for setting individual bits in class bitmaps. The first argument is
the bitmap (an array or pointer to bytes) and the second is the bit number.
Both arguments, and the expansion as a whole, are parenthesized so that
expression arguments such as SETBIT(map, c + 1) index and shift by the
intended value. Note that the bit number is evaluated twice, so it must not
have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
72    
73 ph10 202 /* Maximum length value to check against when making sure that the integer that
74     holds the compiled pattern length does not overflow. We make it a bit less than
75     INT_MAX to allow for adding in group terminating bytes, so that we don't have
76     to check them every time. */
77 ph10 178
78 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
79    
80    
81 nigel 77 /*************************************************
82     * Code parameters and static tables *
83     *************************************************/
84    
85 nigel 93 /* This value specifies the size of stack workspace that is used during the
86     first pre-compile phase that determines how much memory is required. The regex
87     is partly compiled into this space, but the compiled parts are discarded as
88     soon as they can be, so that hopefully there will never be an overrun. The code
89     does, however, check for an overrun. The largest amount I've seen used is 218,
90     so this number is very generous.
91 nigel 77
92 nigel 93 The same workspace is used during the second, actual compile phase for
93     remembering forward references to groups so that they can be filled in at the
94     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95 ph10 836 is 4 there is plenty of room for most patterns. However, the memory can get
96     filled up by repetitions of forward references, for example patterns like
97     /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98     that the workspace is expanded using malloc() in this situation. The value
99     below is therefore a minimum, and we put a maximum on it for safety. The
100     minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101     kicks in at the same number of forward references in all cases. */
102 nigel 77
103 ph10 836 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
104     #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
105 nigel 77
106 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
107 ph10 505 overrun before it actually does run off the end of the data block. */
108 nigel 93
109 ph10 836 #define WORK_SIZE_SAFETY_MARGIN (100)
110 ph10 505
/* Private flags added to firstchar and reqchar. The long suffix is written as
an upper case L because a lower case l is easily misread as the digit 1; the
values and their type are unchanged. */

#define REQ_CASELESS 0x10000000L /* Indicates caselessness */
#define REQ_VARY 0x20000000L /* Reqchar followed non-literal item */

/* Repeated character flags. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
119    
120 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
121     are simple data values; negative values are for special things like \d and so
122     on. Zero means further processing is needed (for things like \x), or the escape
123     is invalid. */
124    
125 ph10 391 #ifndef EBCDIC
126    
127     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
128 ph10 392 in UTF-8 mode. */
129 ph10 391
130 ph10 392 static const short int escapes[] = {
131 ph10 391 0, 0,
132     0, 0,
133 ph10 392 0, 0,
134     0, 0,
135     0, 0,
136 ph10 391 CHAR_COLON, CHAR_SEMICOLON,
137 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
138 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
139 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A,
140     -ESC_B, -ESC_C,
141     -ESC_D, -ESC_E,
142     0, -ESC_G,
143     -ESC_H, 0,
144     0, -ESC_K,
145 ph10 391 0, 0,
146 ph10 514 -ESC_N, 0,
147 ph10 391 -ESC_P, -ESC_Q,
148     -ESC_R, -ESC_S,
149 ph10 392 0, 0,
150     -ESC_V, -ESC_W,
151     -ESC_X, 0,
152     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
153 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
154 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
155 ph10 391 CHAR_GRAVE_ACCENT, 7,
156 ph10 392 -ESC_b, 0,
157     -ESC_d, ESC_e,
158 ph10 391 ESC_f, 0,
159     -ESC_h, 0,
160 ph10 392 0, -ESC_k,
161 ph10 391 0, 0,
162     ESC_n, 0,
163 ph10 392 -ESC_p, 0,
164     ESC_r, -ESC_s,
165 ph10 391 ESC_tee, 0,
166 ph10 392 -ESC_v, -ESC_w,
167     0, 0,
168 ph10 391 -ESC_z
169 nigel 77 };
170    
171 ph10 392 #else
172 ph10 391
173     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
174    
175 nigel 77 static const short int escapes[] = {
176     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
177     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
178     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
179     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
180     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
181     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
182     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
183     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
184 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
185 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
186 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
187 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
188 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
189     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
190     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
191     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
192 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
193 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
194 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
195 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
196 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
197     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
198     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
199     };
200     #endif
201    
202    
203 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
204     searched linearly. Put all the names into a single string, in order to reduce
205 ph10 392 the number of relocations when a shared library is dynamically linked. The
206     string is built from string macros so that it works in UTF-8 mode on EBCDIC
207 ph10 391 platforms. */
208 ph10 210
209     typedef struct verbitem {
210 ph10 510 int len; /* Length of verb name */
211     int op; /* Op when no arg, or -1 if arg mandatory */
212     int op_arg; /* Op when arg present, or -1 if not allowed */
213 ph10 211 } verbitem;
214 ph10 210
215 ph10 240 static const char verbnames[] =
216 ph10 510 "\0" /* Empty name is a shorthand for MARK */
217 ph10 512 STRING_MARK0
218 ph10 391 STRING_ACCEPT0
219     STRING_COMMIT0
220     STRING_F0
221     STRING_FAIL0
222     STRING_PRUNE0
223     STRING_SKIP0
224     STRING_THEN;
225 ph10 240
226 ph10 327 static const verbitem verbs[] = {
227 ph10 510 { 0, -1, OP_MARK },
228 ph10 512 { 4, -1, OP_MARK },
229 ph10 510 { 6, OP_ACCEPT, -1 },
230     { 6, OP_COMMIT, -1 },
231     { 1, OP_FAIL, -1 },
232     { 4, OP_FAIL, -1 },
233     { 5, OP_PRUNE, OP_PRUNE_ARG },
234     { 4, OP_SKIP, OP_SKIP_ARG },
235     { 4, OP_THEN, OP_THEN_ARG }
236 ph10 210 };
237    
238 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
239 ph10 210
240    
241 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
242     now all in a single string, to reduce the number of relocations when a shared
243 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
244     length entry. The first three must be alpha, lower, upper, as this is assumed
245     for handling case independence. */
246 nigel 77
247 ph10 240 static const char posix_names[] =
248 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
249     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
250 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
251     STRING_word0 STRING_xdigit;
252 nigel 77
253 ph10 836 static const pcre_uint8 posix_name_lengths[] = {
254 nigel 77 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255    
256 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
257     base map, with an optional addition or removal of another map. Then, for some
258     classes, there is some additional tweaking: for [:blank:] the vertical space
259     characters are removed, and for [:alpha:] and [:alnum:] the underscore
260     character is removed. The triples in the table consist of the base map offset,
261     second map offset or -1 if no second map, and a non-negative value for map
262     addition or a negative value for map subtraction (if there are two maps). The
263     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
264     remove vertical space characters, 2 => remove underscore. */
265 nigel 77
266     static const int posix_class_maps[] = {
267 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
268     cbit_lower, -1, 0, /* lower */
269     cbit_upper, -1, 0, /* upper */
270     cbit_word, -1, 2, /* alnum - word without underscore */
271     cbit_print, cbit_cntrl, 0, /* ascii */
272     cbit_space, -1, 1, /* blank - a GNU extension */
273     cbit_cntrl, -1, 0, /* cntrl */
274     cbit_digit, -1, 0, /* digit */
275     cbit_graph, -1, 0, /* graph */
276     cbit_print, -1, 0, /* print */
277     cbit_punct, -1, 0, /* punct */
278     cbit_space, -1, 0, /* space */
279     cbit_word, -1, 0, /* word - a Perl extension */
280     cbit_xdigit,-1, 0 /* xdigit */
281 nigel 77 };
282    
283 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
284     substitutes must be in the order of the names, defined above, and there are
285 ph10 518 both positive and negative cases. NULL means no substitute. */
286 nigel 77
287 ph10 518 #ifdef SUPPORT_UCP
288 ph10 836 static const pcre_uchar string_PNd[] = {
289     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
290     CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
291     static const pcre_uchar string_pNd[] = {
292     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
293     CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294     static const pcre_uchar string_PXsp[] = {
295     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
296     CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297     static const pcre_uchar string_pXsp[] = {
298     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
299     CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300     static const pcre_uchar string_PXwd[] = {
301     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
302     CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303     static const pcre_uchar string_pXwd[] = {
304     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
305     CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306    
307     static const pcre_uchar *substitutes[] = {
308     string_PNd, /* \D */
309     string_pNd, /* \d */
310     string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
311     string_pXsp, /* \s */
312     string_PXwd, /* \W */
313     string_pXwd /* \w */
314 ph10 518 };
315 ph10 535
316 ph10 836 static const pcre_uchar string_pL[] = {
317     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
318     CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
319     static const pcre_uchar string_pLl[] = {
320     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321     CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322     static const pcre_uchar string_pLu[] = {
323     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324     CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325     static const pcre_uchar string_pXan[] = {
326     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327     CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328     static const pcre_uchar string_h[] = {
329     CHAR_BACKSLASH, CHAR_h, '\0' };
330     static const pcre_uchar string_pXps[] = {
331     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
332     CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
333     static const pcre_uchar string_PL[] = {
334     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
335     CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336     static const pcre_uchar string_PLl[] = {
337     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338     CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339     static const pcre_uchar string_PLu[] = {
340     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341     CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342     static const pcre_uchar string_PXan[] = {
343     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344     CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345     static const pcre_uchar string_H[] = {
346     CHAR_BACKSLASH, CHAR_H, '\0' };
347     static const pcre_uchar string_PXps[] = {
348     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349     CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350    
351     static const pcre_uchar *posix_substitutes[] = {
352     string_pL, /* alpha */
353     string_pLl, /* lower */
354     string_pLu, /* upper */
355     string_pXan, /* alnum */
356     NULL, /* ascii */
357     string_h, /* blank */
358     NULL, /* cntrl */
359     string_pNd, /* digit */
360     NULL, /* graph */
361     NULL, /* print */
362     NULL, /* punct */
363     string_pXps, /* space */ /* NOTE: Xps is POSIX space */
364     string_pXwd, /* word */
365     NULL, /* xdigit */
366 ph10 518 /* Negated cases */
367 ph10 836 string_PL, /* ^alpha */
368     string_PLl, /* ^lower */
369     string_PLu, /* ^upper */
370     string_PXan, /* ^alnum */
371     NULL, /* ^ascii */
372     string_H, /* ^blank */
373     NULL, /* ^cntrl */
374     string_PNd, /* ^digit */
375     NULL, /* ^graph */
376     NULL, /* ^print */
377     NULL, /* ^punct */
378     string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
379     string_PXwd, /* ^word */
380     NULL /* ^xdigit */
381 ph10 518 };
382 ph10 836 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
383 ph10 535 #endif
384 ph10 518
385 nigel 93 #define STRING(a) # a
386     #define XSTRING(s) STRING(s)
387    
388 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
389 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
390     they are documented. Always add a new error instead. Messages marked DEAD below
391 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
392     the number of relocations needed when a shared library is loaded dynamically,
393     it is now one long string. We cannot use a table of offsets, because the
394     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
395     simply count through to the one we want - this isn't a performance issue
396 ph10 507 because these strings are used only when there is a compilation error.
397 nigel 77
398 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
399     substring, so that the whole string ends with \0\0, which can be detected when
400 ph10 499 counting through. */
401    
402 ph10 240 static const char error_texts[] =
403     "no error\0"
404     "\\ at end of pattern\0"
405     "\\c at end of pattern\0"
406     "unrecognized character follows \\\0"
407     "numbers out of order in {} quantifier\0"
408 nigel 77 /* 5 */
409 ph10 240 "number too big in {} quantifier\0"
410     "missing terminating ] for character class\0"
411     "invalid escape sequence in character class\0"
412     "range out of order in character class\0"
413     "nothing to repeat\0"
414 nigel 77 /* 10 */
415 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
416     "internal error: unexpected repeat\0"
417 ph10 269 "unrecognized character after (? or (?-\0"
418 ph10 240 "POSIX named classes are supported only within a class\0"
419     "missing )\0"
420 nigel 77 /* 15 */
421 ph10 240 "reference to non-existent subpattern\0"
422     "erroffset passed as NULL\0"
423     "unknown option bit(s) set\0"
424     "missing ) after comment\0"
425     "parentheses nested too deeply\0" /** DEAD **/
426 nigel 77 /* 20 */
427 ph10 240 "regular expression is too large\0"
428     "failed to get memory\0"
429     "unmatched parentheses\0"
430     "internal error: code overflow\0"
431     "unrecognized character after (?<\0"
432 nigel 77 /* 25 */
433 ph10 240 "lookbehind assertion is not fixed length\0"
434     "malformed number or name after (?(\0"
435     "conditional group contains more than two branches\0"
436     "assertion expected after (?(\0"
437     "(?R or (?[+-]digits must be followed by )\0"
438 nigel 77 /* 30 */
439 ph10 240 "unknown POSIX class name\0"
440     "POSIX collating elements are not supported\0"
441 ph10 848 "this version of PCRE is compiled without UTF support\0"
442 ph10 240 "spare error\0" /** DEAD **/
443     "character value in \\x{...} sequence is too large\0"
444 nigel 77 /* 35 */
445 ph10 240 "invalid condition (?(0)\0"
446     "\\C not allowed in lookbehind assertion\0"
447 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
448 ph10 240 "number after (?C is > 255\0"
449     "closing ) for (?C expected\0"
450 nigel 77 /* 40 */
451 ph10 240 "recursive call could loop indefinitely\0"
452     "unrecognized character after (?P\0"
453     "syntax error in subpattern name (missing terminator)\0"
454     "two named subpatterns have the same name\0"
455     "invalid UTF-8 string\0"
456 nigel 77 /* 45 */
457 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
458     "malformed \\P or \\p sequence\0"
459     "unknown property name after \\P or \\p\0"
460     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
461     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
462 nigel 91 /* 50 */
463 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
464 ph10 848 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
465 ph10 240 "internal error: overran compiling workspace\0"
466     "internal error: previously-checked referenced subpattern not found\0"
467     "DEFINE group contains more than one branch\0"
468 nigel 93 /* 55 */
469 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
470 ph10 240 "inconsistent NEWLINE options\0"
471 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
472     "a numbered reference must not be zero\0"
473 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
474 ph10 211 /* 60 */
475 ph10 240 "(*VERB) not recognized\0"
476 ph10 268 "number is too big\0"
477 ph10 272 "subpattern name expected\0"
478 ph10 336 "digit expected after (?+\0"
479 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
480     /* 65 */
481 ph10 510 "different names for subpatterns of the same number are not allowed\0"
482 ph10 512 "(*MARK) must have an argument\0"
483 ph10 848 "this version of PCRE is not compiled with Unicode property support\0"
484 ph10 579 "\\c must be followed by an ASCII character\0"
485 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
486 ph10 747 /* 70 */
487     "internal error: unknown opcode in find_fixedlength()\0"
488 ph10 836 "\\N is not supported in a class\0"
489     "too many forward references\0"
490 ph10 848 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
491 ph10 903 "invalid UTF-16 string\0"
492 ph10 510 ;
493 nigel 77
494     /* Table to identify digits and hex digits. This is used when compiling
495     patterns. Note that the tables in chartables are dependent on the locale, and
496     may mark arbitrary characters as digits - but the PCRE compiling code expects
497     to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
498     a private table here. It costs 256 bytes, but it is a lot faster than doing
499     character value tests (at least in some simple cases I timed), and in some
500     applications one wants PCRE to compile efficiently as well as match
501     efficiently.
502    
503     For convenience, we use the same bit definitions as in chartables:
504    
505     0x04 decimal digit
506     0x08 hexadecimal digit
507    
508     Then we can use ctype_digit and ctype_xdigit in the code. */
509    
510 ph10 836 /* Using a simple comparison for decimal numbers rather than a memory read
511     is much faster, and the resulting code is simpler (the compiler turns it
512     into a subtraction and unsigned comparison). */
513    
514     #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
515    
516 ph10 392 #ifndef EBCDIC
517 ph10 391
518 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
519 ph10 391 UTF-8 mode. */
520    
521 ph10 836 static const pcre_uint8 digitab[] =
522 nigel 77 {
523     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
524     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
525     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
526     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
527     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
528     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
529     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
530     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
531     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
532     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
533     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
534     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
535     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
536     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
537     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
538     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
539     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
540     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
541     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
542     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
543     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
544     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
545     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
546     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
547     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
548     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
549     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
550     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
551     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
552     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
553     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
554     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
555    
556 ph10 392 #else
557 ph10 391
558     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
559    
560 ph10 836 static const pcre_uint8 digitab[] =
561 nigel 77 {
562     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
563     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
564     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
565     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
566     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
567     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
568     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
569     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
570     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
571     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
572     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
573 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
574 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
575     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
576     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
577     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
578     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
579     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
580     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
581     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
582     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
583     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
584     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
585     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
586     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
587     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
588     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
589     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
590     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
591     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
592     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
593     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
594    
595 ph10 836 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
596 nigel 77 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
597     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
598     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
599     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
600     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
601     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
602     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
603     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
604     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
605     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
606     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
607 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
608 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
609     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
610     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
611     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
612     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
613     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
614     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
615     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
616     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
617     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
618     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
619     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
620     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
621     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
622     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
623     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
624     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
625     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
626     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
627     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
628     #endif
629    
630    
631     /* Forward declaration of compile_regex() to allow mutual recursion */
632    
633     static BOOL
634 ph10 836 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
635 ph10 642 int *, int *, branch_chain *, compile_data *, int *);
636 nigel 77
637    
638    
639     /*************************************************
640 ph10 240 * Find an error text *
641     *************************************************/
642    
643 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
644     some of the text is of unknown length, we can't use a table of offsets.
645     Instead, just count through the strings. This is not a performance issue
646 ph10 240 because it happens only when there has been a compilation error.
647    
648     Argument: the error number
649     Returns: pointer to the error string
650     */
651    
652     static const char *
653     find_error_text(int n)
654     {
655     const char *s = error_texts;
656 ph10 507 for (; n > 0; n--)
657 ph10 499 {
658     while (*s++ != 0) {};
659     if (*s == 0) return "Error text not found (please report)";
660 ph10 507 }
661 ph10 240 return s;
662     }
663    
664    
665     /*************************************************
666 ph10 836 * Expand the workspace *
667     *************************************************/
668    
669     /* This function is called during the second compiling phase, if the number of
670     forward references fills the existing workspace, which is originally a block on
671     the stack. A larger block is obtained from malloc() unless the ultimate limit
672     has been reached or the increase will be rather small.
673    
674     Argument: pointer to the compile data block
675     Returns: 0 if all went well, else an error number
676     */
677    
678     static int
679     expand_workspace(compile_data *cd)
680     {
681     pcre_uchar *newspace;
682     int newsize = cd->workspace_size * 2;
683    
684     if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
685     if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
686     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
687     return ERR72;
688    
689     newspace = (PUBL(malloc))(IN_UCHARS(newsize));
690     if (newspace == NULL) return ERR21;
691     memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
692     cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
693     if (cd->workspace_size > COMPILE_WORK_SIZE)
694     (PUBL(free))((void *)cd->start_workspace);
695     cd->start_workspace = newspace;
696     cd->workspace_size = newsize;
697     return 0;
698     }
699    
700    
701    
702     /*************************************************
703 ph10 640 * Check for counted repeat *
704     *************************************************/
705    
706     /* This function is called when a '{' is encountered in a place where it might
707     start a quantifier. It looks ahead to see if it really is a quantifier or not.
708     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
709     where the ddds are digits.
710    
711     Arguments:
712     p pointer to the first char after '{'
713    
714     Returns: TRUE or FALSE
715     */
716    
717     static BOOL
718 ph10 836 is_counted_repeat(const pcre_uchar *p)
719 ph10 640 {
720 ph10 836 if (!IS_DIGIT(*p)) return FALSE;
721     p++;
722     while (IS_DIGIT(*p)) p++;
723 ph10 640 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
724    
725     if (*p++ != CHAR_COMMA) return FALSE;
726     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
727    
728 ph10 836 if (!IS_DIGIT(*p)) return FALSE;
729     p++;
730     while (IS_DIGIT(*p)) p++;
731 ph10 640
732     return (*p == CHAR_RIGHT_CURLY_BRACKET);
733     }
734    
735    
736    
737     /*************************************************
738 nigel 77 * Handle escapes *
739     *************************************************/
740    
741     /* This function is called when a \ has been encountered. It either returns a
742     positive value for a simple escape such as \n, or a negative value which
743 nigel 93 encodes one of the more complicated things such as \d. A backreference to group
744     n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
745     UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
746     ptr is pointing at the \. On exit, it is on the final character of the escape
747     sequence.
748 nigel 77
749     Arguments:
750     ptrptr points to the pattern position pointer
751     errorcodeptr points to the errorcode variable
752     bracount number of previous extracting brackets
753     options the options bits
754     isclass TRUE if inside a character class
755    
756     Returns: zero or positive => a data character
757     negative => a special escape sequence
758 ph10 213 on error, errorcodeptr is set
759 nigel 77 */
760    
761     static int
762 ph10 836 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
763 nigel 77 int options, BOOL isclass)
764     {
765 ph10 836 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
766     BOOL utf = (options & PCRE_UTF8) != 0;
767     const pcre_uchar *ptr = *ptrptr + 1;
768     pcre_int32 c;
769     int i;
770 nigel 77
771 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
772     ptr--; /* Set pointer back to the last byte */
773    
774 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
775    
776     if (c == 0) *errorcodeptr = ERR1;
777    
778 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
779     in a table. A non-zero result is something that can be returned immediately.
780 nigel 77 Otherwise further processing may be required. */
781    
782 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
783 ph10 836 /* Not alphanumeric */
784     else if (c < CHAR_0 || c > CHAR_z) {}
785 ph10 391 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
786 nigel 77
787 ph10 97 #else /* EBCDIC coding */
788 ph10 836 /* Not alphanumeric */
789     else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
790 nigel 77 else if ((i = escapes[c - 0x48]) != 0) c = i;
791     #endif
792    
793     /* Escapes that need further processing, or are illegal. */
794    
795     else
796     {
797 ph10 836 const pcre_uchar *oldptr;
798 nigel 93 BOOL braced, negated;
799    
800 nigel 77 switch (c)
801     {
802     /* A number of Perl escapes are not handled by PCRE. We give an explicit
803     error. */
804    
805 ph10 391 case CHAR_l:
806     case CHAR_L:
807 zherczeg 744 *errorcodeptr = ERR37;
808     break;
809    
810 ph10 391 case CHAR_u:
811 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
812     {
813     /* In JavaScript, \u must be followed by four hexadecimal numbers.
814     Otherwise it is a lowercase u letter. */
815 ph10 836 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
816     && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
817     && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
818     && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
819 zherczeg 744 {
820     c = 0;
821     for (i = 0; i < 4; ++i)
822     {
823     register int cc = *(++ptr);
824     #ifndef EBCDIC /* ASCII/UTF-8 coding */
825     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
826     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
827     #else /* EBCDIC coding */
828     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
829     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
830     #endif
831     }
832     }
833     }
834     else
835     *errorcodeptr = ERR37;
836     break;
837    
838 ph10 391 case CHAR_U:
839 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
840     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
841 nigel 77 break;
842    
843 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
844 ph10 640 class, \g must be followed by one of a number of specific things:
845 ph10 345
846 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
847     backreference. If negative, it is a relative backreference. This is a Perl
848     5.10 feature.
849 ph10 345
850 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
851     is part of Perl's movement towards a unified syntax for back references. As
852     this is synonymous with \k{name}, we fudge it up by pretending it really
853     was \k.
854 ph10 345
855     (3) For Oniguruma compatibility we also support \g followed by a name or a
856     number either in angle brackets or in single quotes. However, these are
857     (possibly recursive) subroutine calls, _not_ backreferences. Just return
858 ph10 333 the -ESC_g code (cf \k). */
859 nigel 93
860 ph10 391 case CHAR_g:
861 ph10 640 if (isclass) break;
862 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
863 ph10 333 {
864     c = -ESC_g;
865 ph10 345 break;
866     }
867 ph10 333
868     /* Handle the Perl-compatible cases */
869 ph10 345
870 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
871 nigel 93 {
872 ph10 836 const pcre_uchar *p;
873 ph10 391 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
874 ph10 836 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
875 ph10 391 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
876 ph10 171 {
877     c = -ESC_k;
878     break;
879 ph10 172 }
880 nigel 93 braced = TRUE;
881     ptr++;
882     }
883     else braced = FALSE;
884    
885 ph10 391 if (ptr[1] == CHAR_MINUS)
886 nigel 93 {
887     negated = TRUE;
888     ptr++;
889     }
890     else negated = FALSE;
891    
892 ph10 836 /* The integer range is limited by the machine's int representation. */
893 nigel 93 c = 0;
894 ph10 836 while (IS_DIGIT(ptr[1]))
895     {
896     if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
897     {
898     c = -1;
899     break;
900     }
901 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
902 ph10 836 }
903     if (((unsigned int)c) > INT_MAX) /* Integer overflow */
904 ph10 213 {
905 ph10 836 while (IS_DIGIT(ptr[1]))
906     ptr++;
907 ph10 213 *errorcodeptr = ERR61;
908     break;
909 ph10 220 }
910 ph10 345
911 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
912 nigel 93 {
913     *errorcodeptr = ERR57;
914 ph10 213 break;
915 nigel 93 }
916 ph10 345
917 ph10 333 if (c == 0)
918     {
919     *errorcodeptr = ERR58;
920     break;
921 ph10 345 }
922 nigel 93
923     if (negated)
924     {
925     if (c > bracount)
926     {
927     *errorcodeptr = ERR15;
928 ph10 213 break;
929 nigel 93 }
930     c = bracount - (c - 1);
931     }
932    
933     c = -(ESC_REF + c);
934     break;
935    
936 nigel 77 /* The handling of escape sequences consisting of a string of digits
937     starting with one that is not zero is not straightforward. By experiment,
938     the way Perl works seems to be as follows:
939    
940     Outside a character class, the digits are read as a decimal number. If the
941     number is less than 10, or if there are that many previous extracting
942     left brackets, then it is a back reference. Otherwise, up to three octal
943     digits are read to form an escaped byte. Thus \123 is likely to be octal
944     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
945     value is greater than 377, the least significant 8 bits are taken. Inside a
946     character class, \ followed by a digit is always an octal number. */
947    
948 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
949     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
950 nigel 77
951     if (!isclass)
952     {
953     oldptr = ptr;
954 ph10 836 /* The integer range is limited by the machine's int representation. */
955 ph10 391 c -= CHAR_0;
956 ph10 836 while (IS_DIGIT(ptr[1]))
957     {
958     if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
959     {
960     c = -1;
961     break;
962     }
963 ph10 391 c = c * 10 + *(++ptr) - CHAR_0;
964 ph10 836 }
965     if (((unsigned int)c) > INT_MAX) /* Integer overflow */
966 ph10 213 {
967 ph10 836 while (IS_DIGIT(ptr[1]))
968     ptr++;
969 ph10 213 *errorcodeptr = ERR61;
970 ph10 220 break;
971     }
972 nigel 77 if (c < 10 || c <= bracount)
973     {
974     c = -(ESC_REF + c);
975     break;
976     }
977     ptr = oldptr; /* Put the pointer back and fall through */
978     }
979    
980     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
981     generates a binary zero byte and treats the digit as a following literal.
982     Thus we have to pull back the pointer by one. */
983    
984 ph10 391 if ((c = *ptr) >= CHAR_8)
985 nigel 77 {
986     ptr--;
987     c = 0;
988     break;
989     }
990    
991     /* \0 always starts an octal number, but we may drop through to here with a
992 nigel 91 larger first octal digit. The original code used just to take the least
993     significant 8 bits of octal numbers (I think this is what early Perls used
994 ph10 849 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
995     but no more than 3 octal digits. */
996 nigel 77
997 ph10 391 case CHAR_0:
998     c -= CHAR_0;
999     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1000     c = c * 8 + *(++ptr) - CHAR_0;
1001 ph10 903 #ifdef COMPILE_PCRE8
1002 ph10 836 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1003 ph10 903 #endif
1004 nigel 77 break;
1005    
1006 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
1007 ph10 836 than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1008     If not, { is treated as a data character. */
1009 nigel 77
1010 ph10 391 case CHAR_x:
1011 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1012     {
1013     /* In JavaScript, \x must be followed by two hexadecimal numbers.
1014     Otherwise it is a lowercase x letter. */
1015 ph10 836 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1016     && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1017 zherczeg 744 {
1018     c = 0;
1019     for (i = 0; i < 2; ++i)
1020     {
1021     register int cc = *(++ptr);
1022     #ifndef EBCDIC /* ASCII/UTF-8 coding */
1023     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1024     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1025     #else /* EBCDIC coding */
1026     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1027     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1028     #endif
1029     }
1030     }
1031     break;
1032     }
1033    
1034 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1035 nigel 77 {
1036 ph10 836 const pcre_uchar *pt = ptr + 2;
1037 nigel 87
1038 nigel 77 c = 0;
1039 ph10 836 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1040 nigel 77 {
1041 nigel 87 register int cc = *pt++;
1042 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1043 nigel 87
1044 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1045     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1046     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1047 ph10 97 #else /* EBCDIC coding */
1048 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1049     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1050 nigel 77 #endif
1051 ph10 836
1052     #ifdef COMPILE_PCRE8
1053     if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1054     #else
1055     #ifdef COMPILE_PCRE16
1056     if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1057     #endif
1058     #endif
1059 nigel 77 }
1060 nigel 87
1061 ph10 836 if (c < 0)
1062     {
1063     while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1064     *errorcodeptr = ERR34;
1065     }
1066    
1067 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1068 nigel 77 {
1069 ph10 836 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1070 nigel 77 ptr = pt;
1071     break;
1072     }
1073 nigel 87
1074 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
1075     recognize this construct; fall through to the normal \x handling. */
1076     }
1077    
1078 nigel 87 /* Read just a single-byte hex-defined char */
1079 nigel 77
1080     c = 0;
1081 ph10 836 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1082 nigel 77 {
1083 ph10 391 int cc; /* Some compilers don't like */
1084     cc = *(++ptr); /* ++ in initializers */
1085     #ifndef EBCDIC /* ASCII/UTF-8 coding */
1086     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1087     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1088 ph10 97 #else /* EBCDIC coding */
1089 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1090     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1091 nigel 77 #endif
1092     }
1093     break;
1094    
1095 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1096 ph10 574 An error is given if the byte following \c is not an ASCII character. This
1097     coding is ASCII-specific, but then the whole concept of \cx is
1098 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1099 nigel 77
1100 ph10 391 case CHAR_c:
1101 nigel 77 c = *(++ptr);
1102     if (c == 0)
1103     {
1104     *errorcodeptr = ERR2;
1105 ph10 213 break;
1106 nigel 77 }
1107 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1108     if (c > 127) /* Excludes all non-ASCII in either mode */
1109     {
1110     *errorcodeptr = ERR68;
1111 ph10 579 break;
1112     }
1113 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1114 nigel 77 c ^= 0x40;
1115 ph10 574 #else /* EBCDIC coding */
1116 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1117 nigel 77 c ^= 0xC0;
1118     #endif
1119     break;
1120    
1121     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1122 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1123     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1124     odd, but there used to be some cases other than the default, and there may
1125     be again in future, so I haven't "optimized" it. */
1126 nigel 77
1127     default:
1128     if ((options & PCRE_EXTRA) != 0) switch(c)
1129     {
1130     default:
1131     *errorcodeptr = ERR3;
1132     break;
1133     }
1134     break;
1135     }
1136     }
1137 ph10 518
1138     /* Perl supports \N{name} for character names, as well as plain \N for "not
1139 ph10 654 newline". PCRE does not support \N{name}. However, it does support
1140 ph10 640 quantification such as \N{2,3}. */
1141 nigel 77
1142 ph10 640 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1143     !is_counted_repeat(ptr+2))
1144 ph10 518 *errorcodeptr = ERR37;
1145 ph10 514
1146 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
1147    
1148     if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1149     c -= (ESC_DU - ESC_D);
1150    
1151     /* Set the pointer to the final character before returning. */
1152    
1153 nigel 77 *ptrptr = ptr;
1154     return c;
1155     }
1156    
1157    
1158    
1159     #ifdef SUPPORT_UCP
1160     /*************************************************
1161     * Handle \P and \p *
1162     *************************************************/
1163    
1164     /* This function is called after \P or \p has been encountered, provided that
1165     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1166     pointing at the P or p. On exit, it is pointing at the final character of the
1167     escape sequence.
1168    
1169     Argument:
1170     ptrptr points to the pattern position pointer
1171     negptr points to a boolean that is set TRUE for negation else FALSE
1172 nigel 87 dptr points to an int that is set to the detailed property value
1173 nigel 77 errorcodeptr points to the error code variable
1174    
1175 nigel 87 Returns: type value from ucp_type_table, or -1 for an invalid type
1176 nigel 77 */
1177    
1178     static int
1179 ph10 836 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1180 nigel 77 {
1181     int c, i, bot, top;
1182 ph10 836 const pcre_uchar *ptr = *ptrptr;
1183     pcre_uchar name[32];
1184 nigel 77
1185     c = *(++ptr);
1186     if (c == 0) goto ERROR_RETURN;
1187    
1188     *negptr = FALSE;
1189    
1190 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1191     negation. */
1192 nigel 77
1193 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
1194 nigel 77 {
1195 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1196 nigel 77 {
1197     *negptr = TRUE;
1198     ptr++;
1199     }
1200 ph10 836 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1201 nigel 77 {
1202     c = *(++ptr);
1203     if (c == 0) goto ERROR_RETURN;
1204 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1205 nigel 77 name[i] = c;
1206     }
1207 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1208 nigel 77 name[i] = 0;
1209     }
1210    
1211     /* Otherwise there is just one following character */
1212    
1213     else
1214     {
1215     name[0] = c;
1216     name[1] = 0;
1217     }
1218    
1219     *ptrptr = ptr;
1220    
1221     /* Search for a recognized property name using binary chop */
1222    
1223     bot = 0;
1224 ph10 836 top = PRIV(utt_size);
1225 nigel 77
1226     while (bot < top)
1227     {
1228 nigel 87 i = (bot + top) >> 1;
1229 ph10 836 c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1230 nigel 87 if (c == 0)
1231     {
1232 ph10 836 *dptr = PRIV(utt)[i].value;
1233     return PRIV(utt)[i].type;
1234 nigel 87 }
1235 nigel 77 if (c > 0) bot = i + 1; else top = i;
1236     }
1237    
1238     *errorcodeptr = ERR47;
1239     *ptrptr = ptr;
1240     return -1;
1241    
1242     ERROR_RETURN:
1243     *errorcodeptr = ERR46;
1244     *ptrptr = ptr;
1245     return -1;
1246     }
1247     #endif
1248    
1249    
1250    
1251    
1252     /*************************************************
1253     * Read repeat counts *
1254     *************************************************/
1255    
1256     /* Read an item of the form {n,m} and return the values. This is called only
1257     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1258     so the syntax is guaranteed to be correct, but we need to check the values.
1259    
1260     Arguments:
1261     p pointer to first char after '{'
1262     minp pointer to int for min
1263     maxp pointer to int for max
1264     returned as -1 if no max
1265     errorcodeptr points to error code variable
1266    
1267     Returns: pointer to '}' on success;
1268     current ptr on error, with errorcodeptr set non-zero
1269     */
1270    
1271 ph10 836 static const pcre_uchar *
1272     read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1273 nigel 77 {
1274     int min = 0;
1275     int max = -1;
1276    
1277 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1278     an integer overflow. */
1279    
1280 ph10 836 while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1281 nigel 81 if (min < 0 || min > 65535)
1282     {
1283     *errorcodeptr = ERR5;
1284     return p;
1285     }
1286 nigel 77
1287 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1288     Also, max must not be less than min. */
1289    
1290 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1291 nigel 77 {
1292 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1293 nigel 77 {
1294     max = 0;
1295 ph10 836 while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1296 nigel 81 if (max < 0 || max > 65535)
1297     {
1298     *errorcodeptr = ERR5;
1299     return p;
1300     }
1301 nigel 77 if (max < min)
1302     {
1303     *errorcodeptr = ERR4;
1304     return p;
1305     }
1306     }
1307     }
1308    
1309 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1310     '}'. */
1311 nigel 77
1312 nigel 81 *minp = min;
1313     *maxp = max;
1314 nigel 77 return p;
1315     }
1316    
1317    
1318    
1319     /*************************************************
1320 ph10 408 * Subroutine for finding forward reference *
1321 nigel 91 *************************************************/
1322    
1323 ph10 408 /* This recursive function is called only from find_parens() below. The
1324     top-level call starts at the beginning of the pattern. All other calls must
1325     start at a parenthesis. It scans along a pattern's text looking for capturing
1326 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1327     name it is given, it returns its number. Alternatively, if the name is NULL, it
1328 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1329     track of subpatterns that reset the capturing group numbers - the (?| feature.
1330 nigel 91
1331 ph10 578 This function was originally called only from the second pass, in which we know
1332     that if (?< or (?' or (?P< is encountered, the name will be correctly
1333     terminated because that is checked in the first pass. There is now one call to
1334     this function in the first pass, to check for a recursive back reference by
1335     name (so that we can make the whole group atomic). In this case, we need check
1336 ph10 579 only up to the current position in the pattern, and that is still OK because
1337     any previous occurrences will have been checked. To make this work, the test
1338     for "end of pattern" is a check against cd->end_pattern in the main loop,
1339 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1340 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1341     processing items within the loop are OK, because afterwards the main loop will
1342 ph10 578 terminate.)
1343    
1344 nigel 91 Arguments:
1345 ph10 408 ptrptr address of the current character pointer (updated)
1346 ph10 345 cd compile background data
1347 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1348     lorn name length, or subpattern number if name is NULL
1349     xmode TRUE if we are in /x mode
1350 ph10 836 utf TRUE if we are in UTF-8 / UTF-16 mode
1351 ph10 411 count pointer to the current capturing subpattern number (updated)
1352 nigel 91
1353     Returns: the number of the named subpattern, or -1 if not found
1354     */
1355    
static int
find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
  BOOL xmode, BOOL utf, int *count)
{
/* Scan one nesting level of the pattern, starting at *ptrptr, counting
capturing groups in *count and recursing for each nested group. A positive
return is the number of the group that was being sought. Otherwise -1 is
returned and, on the paths that go through FAIL_EXIT, *ptrptr is updated to the
point where scanning stopped. */

pcre_uchar *ptr = *ptrptr;

/* These two are used only for (?| groups: start_count is the count on entry,
to which *count is reset at each alternative, and hwm_count is the highest
count reached by any alternative so far. */

int start_count = *count;
int hwm_count = start_count;
BOOL dup_parens = FALSE;

/* If the first character is a parenthesis, check on the type of group we are
dealing with. The very first call may not start with a parenthesis. */

if (ptr[0] == CHAR_LEFT_PARENTHESIS)
  {
  /* Handle specials such as (*SKIP) or (*UTF8) etc. */

  if (ptr[1] == CHAR_ASTERISK) ptr += 2;

  /* Handle a normal, unnamed capturing parenthesis. */

  else if (ptr[1] != CHAR_QUESTION_MARK)
    {
    *count += 1;
    if (name == NULL && *count == lorn) return *count;
    ptr++;
    }

  /* All cases now have (? at the start. Remember when we are in a group
  where the parenthesis numbers are duplicated. */

  else if (ptr[2] == CHAR_VERTICAL_LINE)
    {
    ptr += 3;
    dup_parens = TRUE;
    }

  /* Handle comments; all characters are allowed until a ket is reached. The
  exit is taken with ptr on the closing parenthesis, or on the terminating
  zero if there is none. */

  else if (ptr[2] == CHAR_NUMBER_SIGN)
    {
    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
    goto FAIL_EXIT;
    }

  /* Handle a condition. If it is an assertion, just carry on so that it
  is processed as normal. If not, skip to the closing parenthesis of the
  condition (there can't be any nested parens). */

  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
    {
    ptr += 2;
    if (ptr[1] != CHAR_QUESTION_MARK)
      {
      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
      if (*ptr != 0) ptr++;
      }
    }

  /* Start with (? but not a condition. */

  else
    {
    ptr += 2;
    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */

    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */

    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
      {
      int term;
      const pcre_uchar *thisname;
      *count += 1;
      if (name == NULL && *count == lorn) return *count;

      /* The name runs up to the terminator that matches its opening
      delimiter: '>' for '<', and otherwise the same character. This loop
      does not test for a binary zero; see the function's header comment for
      why the name is expected to be terminated. */

      term = *ptr++;
      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
      thisname = ptr;
      while (*ptr != term) ptr++;
      if (name != NULL && lorn == ptr - thisname &&
          STRNCMP_UC_UC(name, thisname, lorn) == 0)
        return *count;

      /* term is not read again after this increment, so it has no effect on
      the result. */

      term++;
      }
    }
  }

/* Past any initial parenthesis handling, scan for parentheses or vertical
bars. Stop if we get to cd->end_pattern. Note that this is important for the
first-pass call when this value is temporarily adjusted to stop at the current
position. So DO NOT change this to a test for binary zero. */

for (; ptr < cd->end_pattern; ptr++)
  {
  /* Skip over backslashed characters and also entire \Q...\E */

  if (*ptr == CHAR_BACKSLASH)
    {
    if (*(++ptr) == 0) goto FAIL_EXIT;
    if (*ptr == CHAR_Q) for (;;)
      {
      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
      if (*ptr == 0) goto FAIL_EXIT;
      if (*(++ptr) == CHAR_E) break;
      }
    continue;
    }

  /* Skip over character classes; this logic must be similar to the way they
  are handled for real. If the first character is '^', skip it. Also, if the
  first few characters (either before or after ^) are \Q\E or \E we skip them
  too. This makes for compatibility with Perl. Note the use of STR macros to
  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */

  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
    {
    BOOL negate_class = FALSE;
    for (;;)
      {
      if (ptr[1] == CHAR_BACKSLASH)
        {
        if (ptr[2] == CHAR_E)
          ptr+= 2;
        else if (STRNCMP_UC_C8(ptr + 2,
                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
          ptr += 4;
        else
          break;
        }
      else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
        {
        negate_class = TRUE;
        ptr++;
        }
      else break;
      }

    /* If the next character is ']', it is a data character that must be
    skipped, except in JavaScript compatibility mode. */

    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
      ptr++;

    /* Scan to the closing ']', skipping backslashed characters and \Q...\E
    sequences in the same way as outside a class. NOTE(review): if the pattern
    ends inside the class, the first test below returns -1 directly, so unlike
    the other failure paths it does not update *ptrptr. */

    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
      {
      if (*ptr == 0) return -1;
      if (*ptr == CHAR_BACKSLASH)
        {
        if (*(++ptr) == 0) goto FAIL_EXIT;
        if (*ptr == CHAR_Q) for (;;)
          {
          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
          if (*ptr == 0) goto FAIL_EXIT;
          if (*(++ptr) == CHAR_E) break;
          }
        continue;
        }
      }
    continue;
    }

  /* Skip comments in /x mode. When a newline is found, ptr is advanced by
  cd->nllen - 1 (presumably leaving it on the last character of the newline
  sequence) and the main loop's increment then moves past it. */

  if (xmode && *ptr == CHAR_NUMBER_SIGN)
    {
    ptr++;
    while (*ptr != 0)
      {
      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
      ptr++;
#ifdef SUPPORT_UTF
      if (utf) FORWARDCHAR(ptr);
#endif
      }
    if (*ptr == 0) goto FAIL_EXIT;
    continue;
    }

  /* Check for the special metacharacters */

  /* A nested group is scanned by a recursive call, which updates ptr to the
  point where it stopped. A positive result means that the group being sought
  was found. Otherwise, give up if the end of the pattern was reached; if not,
  the main loop's increment moves on from where the nested scan stopped. */

  if (*ptr == CHAR_LEFT_PARENTHESIS)
    {
    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
    if (rc > 0) return rc;
    if (*ptr == 0) goto FAIL_EXIT;
    }

  /* A closing parenthesis ends this level. For a (?| group, the count passed
  back is the highest one reached by any of its alternatives. */

  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
    {
    if (dup_parens && *count < hwm_count) *count = hwm_count;
    goto FAIL_EXIT;
    }

  /* At each alternative of a (?| group, remember the highest count so far and
  restart the numbering from the value it had on entry. */

  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
    {
    if (*count > hwm_count) hwm_count = *count;
    *count = start_count;
    }
  }

/* Not found at this level: pass back the position that was reached. */

FAIL_EXIT:
*ptrptr = ptr;
return -1;
}
1560 nigel 93
1561    
1562    
1563    
1564 ph10 408 /*************************************************
1565     * Find forward referenced subpattern *
1566     *************************************************/
1567 nigel 93
1568 ph10 408 /* This function scans along a pattern's text looking for capturing
1569     subpatterns, and counting them. If it finds a named pattern that matches the
1570     name it is given, it returns its number. Alternatively, if the name is NULL, it
1571     returns when it reaches a given numbered subpattern. This is used for forward
1572     references to subpatterns. We used to be able to start this scan from the
1573     current compiling point, using the current count value from cd->bracount, and
1574     do it all in a single loop, but the addition of the possibility of duplicate
1575     subpattern numbers means that we have to scan from the very start, in order to
1576     take account of such duplicates, and to use a recursive function to keep track
1577     of the different types of group.
1578    
1579     Arguments:
1580     cd compile background data
1581     name name to seek, or NULL if seeking a numbered subpattern
1582     lorn name length, or subpattern number if name is NULL
1583     xmode TRUE if we are in /x mode
1584 ph10 836 utf TRUE if we are in UTF-8 / UTF-16 mode
1585 ph10 408
1586     Returns: the number of the found subpattern, or -1 if not found
1587     */
1588    
1589     static int
1590 ph10 836 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1591     BOOL utf)
1592 ph10 408 {
1593 ph10 836 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1594 ph10 408 int count = 0;
1595     int rc;
1596    
1597     /* If the pattern does not start with an opening parenthesis, the first call
1598     to find_parens_sub() will scan right to the end (if necessary). However, if it
1599     does start with a parenthesis, find_parens_sub() will return when it hits the
1600     matching closing parens. That is why we have to have a loop. */
1601    
1602 ph10 411 for (;;)
1603     {
1604 ph10 836 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1605 ph10 411 if (rc > 0 || *ptr++ == 0) break;
1606     }
1607    
1608 ph10 408 return rc;
1609 nigel 91 }
1610    
1611    
1612    
1613 ph10 408
1614 nigel 91 /*************************************************
1615 nigel 77 * Find first significant op code *
1616     *************************************************/
1617    
1618     /* This is called by several functions that scan a compiled expression looking
1619     for a fixed first character, or an anchoring op code etc. It skips over things
1620 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1621     forward and all backward assertions, and also the \b assertion; for others it
1622     does not.
1623 nigel 77
1624     Arguments:
1625     code pointer to the start of the group
1626     skipassert TRUE if certain assertions are to be skipped
1627    
1628     Returns: pointer to the first significant opcode
1629     */
1630    
1631 ph10 836 static const pcre_uchar*
1632     first_significant_code(const pcre_uchar *code, BOOL skipassert)
1633 nigel 77 {
1634     for (;;)
1635     {
1636     switch ((int)*code)
1637     {
1638     case OP_ASSERT_NOT:
1639     case OP_ASSERTBACK:
1640     case OP_ASSERTBACK_NOT:
1641     if (!skipassert) return code;
1642     do code += GET(code, 1); while (*code == OP_ALT);
1643 ph10 836 code += PRIV(OP_lengths)[*code];
1644 nigel 77 break;
1645    
1646     case OP_WORD_BOUNDARY:
1647     case OP_NOT_WORD_BOUNDARY:
1648     if (!skipassert) return code;
1649     /* Fall through */
1650    
1651     case OP_CALLOUT:
1652     case OP_CREF:
1653 ph10 459 case OP_NCREF:
1654 nigel 93 case OP_RREF:
1655 ph10 459 case OP_NRREF:
1656 nigel 93 case OP_DEF:
1657 ph10 836 code += PRIV(OP_lengths)[*code];
1658 nigel 77 break;
1659    
1660     default:
1661     return code;
1662     }
1663     }
1664     /* Control never reaches here */
1665     }
1666    
1667    
1668    
1669    
1670     /*************************************************
1671 ph10 454 * Find the fixed length of a branch *
1672 nigel 77 *************************************************/
1673    
1674 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1675 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1676 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1677 ph10 454 temporarily terminated with OP_END when this function is called.
1678 nigel 77
1679 ph10 461 This function is called when a backward assertion is encountered, so that if it
1680     fails, the error message can point to the correct place in the pattern.
1681 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1682 ph10 461 because they can be forward references. We solve this by remembering this case
1683 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1684    
1685 nigel 77 Arguments:
1686     code points to the start of the pattern (the bracket)
1687 ph10 836 utf TRUE in UTF-8 / UTF-16 mode
1688 ph10 461 atend TRUE if called when the pattern is complete
1689     cd the "compile data" structure
1690 nigel 77
1691 ph10 461 Returns: the fixed length,
1692 ph10 454 or -1 if there is no fixed length,
1693 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1694 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1695 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1696 nigel 77 */
1697    
1698     static int
1699 ph10 836 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1700 nigel 77 {
1701     int length = -1;
1702    
1703     register int branchlength = 0;
1704 ph10 836 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1705 nigel 77
1706     /* Scan along the opcodes for this branch. If we get to the end of the
1707     branch, check the length against that of the other branches. */
1708    
1709     for (;;)
1710     {
1711     int d;
1712 ph10 836 pcre_uchar *ce, *cs;
1713 nigel 77 register int op = *cc;
1714 ph10 842
1715 nigel 77 switch (op)
1716     {
1717 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1718     OP_BRA (normal non-capturing bracket) because the other variants of these
1719     opcodes are all concerned with unlimited repeated groups, which of course
1720 ph10 747 are not of fixed length. */
1721 ph10 604
1722 nigel 93 case OP_CBRA:
1723 nigel 77 case OP_BRA:
1724     case OP_ONCE:
1725 ph10 733 case OP_ONCE_NC:
1726 nigel 77 case OP_COND:
1727 ph10 836 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1728 nigel 77 if (d < 0) return d;
1729     branchlength += d;
1730     do cc += GET(cc, 1); while (*cc == OP_ALT);
1731     cc += 1 + LINK_SIZE;
1732     break;
1733    
1734 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1735     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1736     an ALT. If it is END it's the end of the outer call. All can be handled by
1737     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1738     because they all imply an unlimited repeat. */
1739 nigel 77
1740     case OP_ALT:
1741     case OP_KET:
1742     case OP_END:
1743 ph10 747 case OP_ACCEPT:
1744     case OP_ASSERT_ACCEPT:
1745 nigel 77 if (length < 0) length = branchlength;
1746     else if (length != branchlength) return -1;
1747     if (*cc != OP_ALT) return length;
1748     cc += 1 + LINK_SIZE;
1749     branchlength = 0;
1750     break;
1751 ph10 461
1752 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1753     be OK. If the subroutine is a forward reference, we can't deal with
1754     it until the end of the pattern, so return -3. */
1755 ph10 461
1756 ph10 454 case OP_RECURSE:
1757     if (!atend) return -3;
1758 ph10 836 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1759     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1760     if (cc > cs && cc < ce) return -1; /* Recursion */
1761     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1762 ph10 461 if (d < 0) return d;
1763 ph10 454 branchlength += d;
1764     cc += 1 + LINK_SIZE;
1765 ph10 461 break;
1766 nigel 77
1767     /* Skip over assertive subpatterns */
1768    
1769     case OP_ASSERT:
1770     case OP_ASSERT_NOT:
1771     case OP_ASSERTBACK:
1772     case OP_ASSERTBACK_NOT:
1773     do cc += GET(cc, 1); while (*cc == OP_ALT);
1774 ph10 836 cc += PRIV(OP_lengths)[*cc];
1775 ph10 842 break;
1776 nigel 77
1777     /* Skip over things that don't match chars */
1778    
1779 ph10 747 case OP_MARK:
1780     case OP_PRUNE_ARG:
1781     case OP_SKIP_ARG:
1782     case OP_THEN_ARG:
1783 ph10 836 cc += cc[1] + PRIV(OP_lengths)[*cc];
1784 ph10 747 break;
1785    
1786 nigel 77 case OP_CALLOUT:
1787     case OP_CIRC:
1788 ph10 602 case OP_CIRCM:
1789 ph10 747 case OP_CLOSE:
1790     case OP_COMMIT:
1791     case OP_CREF:
1792     case OP_DEF:
1793 nigel 77 case OP_DOLL:
1794 ph10 602 case OP_DOLLM:
1795 ph10 747 case OP_EOD:
1796     case OP_EODN:
1797     case OP_FAIL:
1798     case OP_NCREF:
1799     case OP_NRREF:
1800 nigel 77 case OP_NOT_WORD_BOUNDARY:
1801 ph10 747 case OP_PRUNE:
1802     case OP_REVERSE:
1803     case OP_RREF:
1804     case OP_SET_SOM:
1805     case OP_SKIP:
1806     case OP_SOD:
1807     case OP_SOM:
1808     case OP_THEN:
1809 nigel 77 case OP_WORD_BOUNDARY:
1810 ph10 836 cc += PRIV(OP_lengths)[*cc];
1811 nigel 77 break;
1812    
1813     /* Handle literal characters */
1814    
1815     case OP_CHAR:
1816 ph10 602 case OP_CHARI:
1817 nigel 91 case OP_NOT:
1818 ph10 604 case OP_NOTI:
1819 nigel 77 branchlength++;
1820     cc += 2;
1821 ph10 836 #ifdef SUPPORT_UTF
1822     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1823 nigel 77 #endif
1824     break;
1825    
1826     /* Handle exact repetitions. The count is already in characters, but we
1827     need to skip over a multibyte character in UTF8 mode. */
1828    
1829     case OP_EXACT:
1830 ph10 747 case OP_EXACTI:
1831     case OP_NOTEXACT:
1832     case OP_NOTEXACTI:
1833 nigel 77 branchlength += GET2(cc,1);
1834 ph10 836 cc += 2 + IMM2_SIZE;
1835     #ifdef SUPPORT_UTF
1836     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1837 nigel 77 #endif
1838     break;
1839    
1840     case OP_TYPEEXACT:
1841     branchlength += GET2(cc,1);
1842 ph10 836 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1843     cc += 1 + IMM2_SIZE + 1;
1844 nigel 77 break;
1845    
1846     /* Handle single-char matchers */
1847    
1848     case OP_PROP:
1849     case OP_NOTPROP:
1850 nigel 87 cc += 2;
1851 nigel 77 /* Fall through */
1852    
1853 ph10 747 case OP_HSPACE:
1854     case OP_VSPACE:
1855     case OP_NOT_HSPACE:
1856     case OP_NOT_VSPACE:
1857 nigel 77 case OP_NOT_DIGIT:
1858     case OP_DIGIT:
1859     case OP_NOT_WHITESPACE:
1860     case OP_WHITESPACE:
1861     case OP_NOT_WORDCHAR:
1862     case OP_WORDCHAR:
1863     case OP_ANY:
1864 ph10 342 case OP_ALLANY:
1865 nigel 77 branchlength++;
1866     cc++;
1867     break;
1868    
1869 ph10 836 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1870 ph10 754 otherwise \C is coded as OP_ALLANY. */
1871 nigel 77
1872     case OP_ANYBYTE:
1873     return -2;
1874    
1875     /* Check a class for variable quantification */
1876    
1877 ph10 836 #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1878 nigel 77 case OP_XCLASS:
1879 ph10 836 cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1880 nigel 77 /* Fall through */
1881     #endif
1882    
1883     case OP_CLASS:
1884     case OP_NCLASS:
1885 ph10 836 cc += PRIV(OP_lengths)[OP_CLASS];
1886 nigel 77
1887     switch (*cc)
1888     {
1889 ph10 747 case OP_CRPLUS:
1890     case OP_CRMINPLUS:
1891 nigel 77 case OP_CRSTAR:
1892     case OP_CRMINSTAR:
1893     case OP_CRQUERY:
1894     case OP_CRMINQUERY:
1895     return -1;
1896    
1897     case OP_CRRANGE:
1898     case OP_CRMINRANGE:
1899 ph10 836 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1900 nigel 77 branchlength += GET2(cc,1);
1901 ph10 836 cc += 1 + 2 * IMM2_SIZE;
1902 nigel 77 break;
1903    
1904     default:
1905     branchlength++;
1906     }
1907     break;
1908    
1909     /* Anything else is variable length */
1910    
1911 ph10 747 case OP_ANYNL:
1912     case OP_BRAMINZERO:
1913     case OP_BRAPOS:
1914     case OP_BRAPOSZERO:
1915     case OP_BRAZERO:
1916     case OP_CBRAPOS:
1917     case OP_EXTUNI:
1918     case OP_KETRMAX:
1919     case OP_KETRMIN:
1920     case OP_KETRPOS:
1921     case OP_MINPLUS:
1922     case OP_MINPLUSI:
1923     case OP_MINQUERY:
1924     case OP_MINQUERYI:
1925     case OP_MINSTAR:
1926     case OP_MINSTARI:
1927     case OP_MINUPTO:
1928     case OP_MINUPTOI:
1929     case OP_NOTMINPLUS:
1930     case OP_NOTMINPLUSI:
1931     case OP_NOTMINQUERY:
1932     case OP_NOTMINQUERYI:
1933     case OP_NOTMINSTAR:
1934     case OP_NOTMINSTARI:
1935     case OP_NOTMINUPTO:
1936     case OP_NOTMINUPTOI:
1937     case OP_NOTPLUS:
1938     case OP_NOTPLUSI:
1939     case OP_NOTPOSPLUS:
1940     case OP_NOTPOSPLUSI:
1941     case OP_NOTPOSQUERY:
1942     case OP_NOTPOSQUERYI:
1943     case OP_NOTPOSSTAR:
1944     case OP_NOTPOSSTARI:
1945     case OP_NOTPOSUPTO:
1946     case OP_NOTPOSUPTOI:
1947     case OP_NOTQUERY:
1948     case OP_NOTQUERYI:
1949     case OP_NOTSTAR:
1950     case OP_NOTSTARI:
1951     case OP_NOTUPTO:
1952     case OP_NOTUPTOI:
1953     case OP_PLUS:
1954     case OP_PLUSI:
1955     case OP_POSPLUS:
1956     case OP_POSPLUSI:
1957     case OP_POSQUERY:
1958     case OP_POSQUERYI:
1959     case OP_POSSTAR:
1960     case OP_POSSTARI:
1961     case OP_POSUPTO:
1962     case OP_POSUPTOI:
1963     case OP_QUERY:
1964     case OP_QUERYI:
1965     case OP_REF:
1966     case OP_REFI:
1967     case OP_SBRA:
1968     case OP_SBRAPOS:
1969     case OP_SCBRA:
1970     case OP_SCBRAPOS:
1971     case OP_SCOND:
1972     case OP_SKIPZERO:
1973     case OP_STAR:
1974     case OP_STARI:
1975     case OP_TYPEMINPLUS:
1976     case OP_TYPEMINQUERY:
1977     case OP_TYPEMINSTAR:
1978     case OP_TYPEMINUPTO:
1979     case OP_TYPEPLUS:
1980     case OP_TYPEPOSPLUS:
1981     case OP_TYPEPOSQUERY:
1982     case OP_TYPEPOSSTAR:
1983     case OP_TYPEPOSUPTO:
1984     case OP_TYPEQUERY:
1985     case OP_TYPESTAR:
1986     case OP_TYPEUPTO:
1987     case OP_UPTO:
1988     case OP_UPTOI:
1989     return -1;
1990    
1991     /* Catch unrecognized opcodes so that when new ones are added they
1992     are not forgotten, as has happened in the past. */
1993    
1994 nigel 77 default:
1995 ph10 747 return -4;
1996 nigel 77 }
1997     }
1998     /* Control never gets here */
1999     }
2000    
2001    
2002    
2003    
2004     /*************************************************
2005 ph10 454 * Scan compiled regex for specific bracket *
2006 nigel 77 *************************************************/
2007    
2008     /* This little function scans through a compiled pattern until it finds a
2009 ph10 454 capturing bracket with the given number, or, if the number is negative, an
2010 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2011     so that it can be called from pcre_study() when finding the minimum matching
2012 ph10 455 length.
2013 nigel 77
2014     Arguments:
2015     code points to start of expression
2016 ph10 836 utf TRUE in UTF-8 / UTF-16 mode
2017 ph10 454 number the required bracket number or negative to find a lookbehind
2018 nigel 77
2019     Returns: pointer to the opcode for the bracket, or NULL if not found
2020     */
2021    
2022 ph10 836 const pcre_uchar *
2023     PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2024 nigel 77 {
2025     for (;;)
2026     {
2027     register int c = *code;
2028 ph10 618
2029 nigel 77 if (c == OP_END) return NULL;
2030 nigel 91
2031     /* XCLASS is used for classes that cannot be represented just by a bit
2032     map. This includes negated single high-valued characters. The length in
2033     the table is zero; the actual length is stored in the compiled code. */
2034    
2035     if (c == OP_XCLASS) code += GET(code, 1);
2036 ph10 461
2037 ph10 454 /* Handle recursion */
2038 ph10 461
2039 ph10 454 else if (c == OP_REVERSE)
2040     {
2041 ph10 836 if (number < 0) return (pcre_uchar *)code;
2042     code += PRIV(OP_lengths)[c];
2043 ph10 454 }
2044 nigel 91
2045 nigel 93 /* Handle capturing bracket */
2046 nigel 91
2047 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
2048     c == OP_CBRAPOS || c == OP_SCBRAPOS)
2049 nigel 77 {
2050 nigel 93 int n = GET2(code, 1+LINK_SIZE);
2051 ph10 836 if (n == number) return (pcre_uchar *)code;
2052     code += PRIV(OP_lengths)[c];
2053 nigel 77 }
2054 nigel 91
2055 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2056     repeated character types, we have to test for \p and \P, which have an extra
2057 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2058 ph10 510 must add in its length. */
2059 nigel 91
2060 nigel 77 else
2061     {
2062 ph10 218 switch(c)
2063     {
2064     case OP_TYPESTAR:
2065     case OP_TYPEMINSTAR:
2066     case OP_TYPEPLUS:
2067     case OP_TYPEMINPLUS:
2068     case OP_TYPEQUERY:
2069     case OP_TYPEMINQUERY:
2070     case OP_TYPEPOSSTAR:
2071     case OP_TYPEPOSPLUS:
2072     case OP_TYPEPOSQUERY:
2073     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2074 ph10 220 break;
2075 ph10 221
2076     case OP_TYPEUPTO:
2077     case OP_TYPEMINUPTO:
2078     case OP_TYPEEXACT:
2079     case OP_TYPEPOSUPTO:
2080 ph10 836 if (code[1 + IMM2_SIZE] == OP_PROP
2081     || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2082 ph10 221 break;
2083 ph10 512
2084 ph10 510 case OP_MARK:
2085     case OP_PRUNE_ARG:
2086     case OP_SKIP_ARG:
2087     code += code[1];
2088 ph10 512 break;
2089 ph10 550
2090     case OP_THEN_ARG:
2091 ph10 716 code += code[1];
2092 ph10 550 break;
2093 ph10 220 }
2094    
2095 ph10 218 /* Add in the fixed length from the table */
2096 ph10 220
2097 ph10 836 code += PRIV(OP_lengths)[c];
2098 ph10 220
2099 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2100     a multi-byte character. The length in the table is a minimum, so we have to
2101     arrange to skip the extra bytes. */
2102 ph10 220
2103 ph10 836 #ifdef SUPPORT_UTF
2104     if (utf) switch(c)
2105 nigel 77 {
2106     case OP_CHAR:
2107 ph10 602 case OP_CHARI:
2108 nigel 77 case OP_EXACT:
2109 ph10 602 case OP_EXACTI:
2110 nigel 77 case OP_UPTO:
2111 ph10 602 case OP_UPTOI:
2112 nigel 77 case OP_MINUPTO:
2113 ph10 602 case OP_MINUPTOI:
2114 nigel 93 case OP_POSUPTO:
2115 ph10 602 case OP_POSUPTOI:
2116 nigel 77 case OP_STAR:
2117 ph10 602 case OP_STARI:
2118 nigel 77 case OP_MINSTAR:
2119 ph10 602 case OP_MINSTARI:
2120 nigel 93 case OP_POSSTAR:
2121 ph10 602 case OP_POSSTARI:
2122 nigel 77 case OP_PLUS:
2123 ph10 602 case OP_PLUSI:
2124 nigel 77 case OP_MINPLUS:
2125 ph10 602 case OP_MINPLUSI:
2126 nigel 93 case OP_POSPLUS:
2127 ph10 602 case OP_POSPLUSI:
2128 nigel 77 case OP_QUERY:
2129 ph10 602 case OP_QUERYI:
2130 nigel 77 case OP_MINQUERY:
2131 ph10 602 case OP_MINQUERYI:
2132 nigel 93 case OP_POSQUERY:
2133 ph10 602 case OP_POSQUERYI:
2134 ph10 836 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2135 nigel 77 break;
2136     }
2137 ph10 369 #else
2138 ph10 836 (void)(utf); /* Keep compiler happy by referencing function argument */
2139 ph10 111 #endif
2140 nigel 77 }
2141     }
2142     }
2143    
2144    
2145    
2146     /*************************************************
2147     * Scan compiled regex for recursion reference *
2148     *************************************************/
2149    
2150     /* This little function scans through a compiled pattern until it finds an
2151     instance of OP_RECURSE.
2152    
2153     Arguments:
2154     code points to start of expression
2155 ph10 836 utf TRUE in UTF-8 / UTF-16 mode
2156 nigel 77
2157     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2158     */
2159    
2160 ph10 836 static const pcre_uchar *
2161     find_recurse(const pcre_uchar *code, BOOL utf)
2162 nigel 77 {
2163     for (;;)
2164     {
2165     register int c = *code;
2166     if (c == OP_END) return NULL;
2167 nigel 91 if (c == OP_RECURSE) return code;
2168 ph10 220
2169 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2170     map. This includes negated single high-valued characters. The length in
2171     the table is zero; the actual length is stored in the compiled code. */
2172    
2173     if (c == OP_XCLASS) code += GET(code, 1);
2174    
2175 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2176     repeated character types, we have to test for \p and \P, which have an extra
2177 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2178 ph10 510 must add in its length. */
2179 nigel 91
2180 nigel 77 else
2181     {
2182 ph10 218 switch(c)
2183     {
2184     case OP_TYPESTAR:
2185     case OP_TYPEMINSTAR:
2186     case OP_TYPEPLUS:
2187     case OP_TYPEMINPLUS:
2188     case OP_TYPEQUERY:
2189     case OP_TYPEMINQUERY:
2190     case OP_TYPEPOSSTAR:
2191     case OP_TYPEPOSPLUS:
2192     case OP_TYPEPOSQUERY:
2193     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2194 ph10 220 break;
2195 ph10 221
2196     case OP_TYPEPOSUPTO:
2197     case OP_TYPEUPTO:
2198     case OP_TYPEMINUPTO:
2199     case OP_TYPEEXACT:
2200 ph10 836 if (code[1 + IMM2_SIZE] == OP_PROP
2201     || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2202 ph10 221 break;
2203 ph10 512
2204 ph10 510 case OP_MARK:
2205     case OP_PRUNE_ARG:
2206     case OP_SKIP_ARG:
2207     code += code[1];
2208 ph10 512 break;
2209 ph10 550
2210     case OP_THEN_ARG:
2211 ph10 716 code += code[1];
2212 ph10 550 break;
2213 ph10 220 }
2214    
2215 ph10 218 /* Add in the fixed length from the table */
2216    
2217 ph10 836 code += PRIV(OP_lengths)[c];
2218 ph10 220
2219 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2220     by a multi-byte character. The length in the table is a minimum, so we have
2221     to arrange to skip the extra bytes. */
2222 ph10 220
2223 ph10 836 #ifdef SUPPORT_UTF
2224     if (utf) switch(c)
2225 nigel 77 {
2226     case OP_CHAR:
2227 ph10 602 case OP_CHARI:
2228 zherczeg 924 case OP_NOT:
2229     case OP_NOTI:
2230 nigel 77 case OP_EXACT:
2231 ph10 602 case OP_EXACTI:
2232 zherczeg 924 case OP_NOTEXACT:
2233     case OP_NOTEXACTI:
2234 nigel 77 case OP_UPTO:
2235 ph10 602 case OP_UPTOI:
2236 zherczeg 924 case OP_NOTUPTO:
2237     case OP_NOTUPTOI:
2238 nigel 77 case OP_MINUPTO:
2239 ph10 602 case OP_MINUPTOI:
2240 zherczeg 924 case OP_NOTMINUPTO:
2241     case OP_NOTMINUPTOI:
2242 nigel 93 case OP_POSUPTO:
2243 ph10 602 case OP_POSUPTOI:
2244 zherczeg 924 case OP_NOTPOSUPTO:
2245     case OP_NOTPOSUPTOI:
2246 nigel 77 case OP_STAR:
2247 ph10 602 case OP_STARI:
2248 zherczeg 924 case OP_NOTSTAR:
2249     case OP_NOTSTARI:
2250 nigel 77 case OP_MINSTAR:
2251 ph10 602 case OP_MINSTARI:
2252 zherczeg 924 case OP_NOTMINSTAR:
2253     case OP_NOTMINSTARI:
2254 nigel 93 case OP_POSSTAR:
2255 ph10 602 case OP_POSSTARI:
2256 zherczeg 924 case OP_NOTPOSSTAR:
2257     case OP_NOTPOSSTARI:
2258 nigel 77 case OP_PLUS:
2259 ph10 602 case OP_PLUSI:
2260 zherczeg 924 case OP_NOTPLUS:
2261     case OP_NOTPLUSI:
2262 nigel 77 case OP_MINPLUS:
2263 ph10 602 case OP_MINPLUSI:
2264 zherczeg 924 case OP_NOTMINPLUS:
2265     case OP_NOTMINPLUSI:
2266 nigel 93 case OP_POSPLUS:
2267 ph10 602 case OP_POSPLUSI:
2268 zherczeg 924 case OP_NOTPOSPLUS:
2269     case OP_NOTPOSPLUSI:
2270 nigel 77 case OP_QUERY:
2271 ph10 602 case OP_QUERYI:
2272 zherczeg 924 case OP_NOTQUERY:
2273     case OP_NOTQUERYI:
2274 nigel 77 case OP_MINQUERY:
2275 ph10 602 case OP_MINQUERYI:
2276 zherczeg 924 case OP_NOTMINQUERY:
2277     case OP_NOTMINQUERYI:
2278 nigel 93 case OP_POSQUERY:
2279 ph10 602 case OP_POSQUERYI:
2280 zherczeg 924 case OP_NOTPOSQUERY:
2281     case OP_NOTPOSQUERYI:
2282 ph10 836 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2283 nigel 77 break;
2284     }
2285 ph10 369 #else
2286 ph10 836 (void)(utf); /* Keep compiler happy by referencing function argument */
2287 ph10 111 #endif
2288 nigel 77 }
2289     }
2290     }
2291    
2292    
2293    
2294     /*************************************************
2295     * Scan compiled branch for non-emptiness *
2296     *************************************************/
2297    
2298     /* This function scans through a branch of a compiled pattern to see whether it
2299 nigel 93 can match the empty string or not. It is called from could_be_empty()
2300     below and from compile_branch() when checking for an unlimited repeat of a
2301     group that can match nothing. Note that first_significant_code() skips over
2302 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2303     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2304     bracket whose current branch will already have been scanned.
2305 nigel 77
2306     Arguments:
2307     code points to start of search
2308     endcode points to where to stop
2309 ph10 836 utf TRUE if in UTF-8 / UTF-16 mode
2310 ph10 503 cd contains pointers to tables etc.
2311 nigel 77
2312     Returns: TRUE if what is matched could be empty
2313     */
2314    
2315     static BOOL
2316 ph10 836 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2317     BOOL utf, compile_data *cd)
2318 nigel 77 {
2319     register int c;
2320 ph10 836 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2321 nigel 77 code < endcode;
2322 ph10 836 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2323 nigel 77 {
2324 ph10 836 const pcre_uchar *ccode;
2325 nigel 77
2326     c = *code;
2327 ph10 507
2328 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2329 ph10 282 first_significant_code() with a TRUE final argument. */
2330 ph10 286
2331 ph10 282 if (c == OP_ASSERT)
2332 ph10 286 {
2333 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2334     c = *code;
2335     continue;
2336 ph10 286 }
2337 ph10 172
2338 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2339 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2340     forward reference subroutine call, we can't. To detect forward reference
2341 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2342     called only when doing the real compile, not during the pre-compile that
2343 ph10 624 measures the size of the compiled pattern. */
2344 ph10 507
2345 ph10 503 if (c == OP_RECURSE)
2346     {
2347 ph10 836 const pcre_uchar *scode;
2348 ph10 624 BOOL empty_branch;
2349 ph10 654
2350 ph10 624 /* Test for forward reference */
2351 ph10 654
2352 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2353 ph10 654 if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2354 ph10 624
2355     /* Not a forward reference, test for completed backward reference */
2356 ph10 654
2357 ph10 624 empty_branch = FALSE;
2358     scode = cd->start_code + GET(code, 1);
2359 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2360 ph10 654
2361 ph10 624 /* Completed backwards reference */
2362 ph10 654
2363 ph10 503 do
2364     {
2365 ph10 836 if (could_be_empty_branch(scode, endcode, utf, cd))
2366 ph10 504 {
2367     empty_branch = TRUE;
2368 ph10 507 break;
2369     }
2370 ph10 503 scode += GET(scode, 1);
2371     }
2372     while (*scode == OP_ALT);
2373 ph10 654
2374 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2375 ph10 503 continue;
2376 ph10 507 }
2377 ph10 170
2378 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2379    
2380     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2381     c == OP_BRAPOSZERO)
2382     {
2383 ph10 836 code += PRIV(OP_lengths)[c];
2384 ph10 604 do code += GET(code, 1); while (*code == OP_ALT);
2385     c = *code;
2386     continue;
2387     }
2388    
2389     /* A nested group that is already marked as "could be empty" can just be
2390     skipped. */
2391    
2392     if (c == OP_SBRA || c == OP_SBRAPOS ||
2393     c == OP_SCBRA || c == OP_SCBRAPOS)
2394     {
2395     do code += GET(code, 1); while (*code == OP_ALT);
2396     c = *code;
2397     continue;
2398     }
2399    
2400 ph10 170 /* For other groups, scan the branches. */
2401 ph10 172
2402 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2403     c == OP_CBRA || c == OP_CBRAPOS ||
2404 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2405     c == OP_COND)
2406 nigel 77 {
2407     BOOL empty_branch;
2408     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2409 ph10 406
2410     /* If a conditional group has only one branch, there is a second, implied,
2411 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2412     Otherwise, scan the individual branches of the group. */
2413 ph10 406
2414 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2415 nigel 77 code += GET(code, 1);
2416 ph10 395 else
2417 ph10 406 {
2418 ph10 395 empty_branch = FALSE;
2419     do
2420     {
2421 ph10 836 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2422 ph10 395 empty_branch = TRUE;
2423     code += GET(code, 1);
2424     }
2425     while (*code == OP_ALT);
2426     if (!empty_branch) return FALSE; /* All branches are non-empty */
2427 nigel 77 }
2428 ph10 406
2429 ph10 172 c = *code;
2430 nigel 93 continue;
2431 nigel 77 }
2432    
2433 nigel 93 /* Handle the other opcodes */
2434    
2435     switch (c)
2436 nigel 77 {
2437 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2438     cannot be represented just by a bit map. This includes negated single
2439 ph10 836 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2440 ph10 220 actual length is stored in the compiled code, so we must update "code"
2441 ph10 216 here. */
2442 nigel 77
2443 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2444 nigel 77 case OP_XCLASS:
2445 ph10 216 ccode = code += GET(code, 1);
2446 nigel 77 goto CHECK_CLASS_REPEAT;
2447     #endif
2448    
2449     case OP_CLASS:
2450     case OP_NCLASS:
2451 ph10 836 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2452 nigel 77
2453 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2454 nigel 77 CHECK_CLASS_REPEAT:
2455     #endif
2456    
2457     switch (*ccode)
2458     {
2459     case OP_CRSTAR: /* These could be empty; continue */
2460     case OP_CRMINSTAR:
2461     case OP_CRQUERY:
2462     case OP_CRMINQUERY:
2463     break;
2464    
2465     default: /* Non-repeat => class must match */
2466     case OP_CRPLUS: /* These repeats aren't empty */
2467     case OP_CRMINPLUS:
2468     return FALSE;
2469    
2470     case OP_CRRANGE:
2471     case OP_CRMINRANGE:
2472     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2473     break;
2474     }
2475     break;
2476    
2477     /* Opcodes that must match a character */
2478    
2479     case OP_PROP:
2480     case OP_NOTPROP:
2481     case OP_EXTUNI:
2482     case OP_NOT_DIGIT:
2483     case OP_DIGIT:
2484     case OP_NOT_WHITESPACE:
2485     case OP_WHITESPACE:
2486     case OP_NOT_WORDCHAR:
2487     case OP_WORDCHAR:
2488     case OP_ANY:
2489 ph10 345 case OP_ALLANY:
2490 nigel 77 case OP_ANYBYTE:
2491     case OP_CHAR:
2492 ph10 602 case OP_CHARI:
2493 nigel 77 case OP_NOT:
2494 ph10 602 case OP_NOTI:
2495 nigel 77 case OP_PLUS:
2496     case OP_MINPLUS:
2497 nigel 93 case OP_POSPLUS:
2498 nigel 77 case OP_EXACT:
2499     case OP_NOTPLUS:
2500     case OP_NOTMINPLUS:
2501 nigel 93 case OP_NOTPOSPLUS:
2502 nigel 77 case OP_NOTEXACT:
2503     case OP_TYPEPLUS:
2504     case OP_TYPEMINPLUS:
2505 nigel 93 case OP_TYPEPOSPLUS:
2506 nigel 77 case OP_TYPEEXACT:
2507     return FALSE;
2508 ph10 227
2509     /* These are going to continue, as they may be empty, but we have to
2510     fudge the length for the \p and \P cases. */
2511    
2512 ph10 224 case OP_TYPESTAR:
2513     case OP_TYPEMINSTAR:
2514     case OP_TYPEPOSSTAR:
2515     case OP_TYPEQUERY:
2516     case OP_TYPEMINQUERY:
2517     case OP_TYPEPOSQUERY:
2518     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2519 ph10 227 break;
2520    
2521 ph10 224 /* Same for these */
2522 ph10 227
2523 ph10 224 case OP_TYPEUPTO:
2524     case OP_TYPEMINUPTO:
2525     case OP_TYPEPOSUPTO:
2526 ph10 836 if (code[1 + IMM2_SIZE] == OP_PROP
2527     || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2528 ph10 224 break;
2529 nigel 77
2530     /* End of branch */
2531    
2532     case OP_KET:
2533     case OP_KETRMAX:
2534     case OP_KETRMIN:
2535 ph10 604 case OP_KETRPOS:
2536 nigel 77 case OP_ALT:
2537     return TRUE;
2538    
2539 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2540     MINUPTO, and POSUPTO may be followed by a multibyte character */
2541 nigel 77
2542 ph10 836 #ifdef SUPPORT_UTF
2543 nigel 77 case OP_STAR:
2544 ph10 602 case OP_STARI:
2545 nigel 77 case OP_MINSTAR:
2546 ph10 602 case OP_MINSTARI:
2547 nigel 93 case OP_POSSTAR:
2548 ph10 602 case OP_POSSTARI:
2549 nigel 77 case OP_QUERY:
2550 ph10 602 case OP_QUERYI:
2551 nigel 77 case OP_MINQUERY:
2552 ph10 602 case OP_MINQUERYI:
2553 nigel 93 case OP_POSQUERY:
2554 ph10 602 case OP_POSQUERYI:
2555 ph10 836 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2556 ph10 426 break;
2557 ph10 461
2558 nigel 77 case OP_UPTO:
2559 ph10 602 case OP_UPTOI:
2560 nigel 77 case OP_MINUPTO:
2561 ph10 602 case OP_MINUPTOI:
2562 nigel 93 case OP_POSUPTO:
2563 ph10 602 case OP_POSUPTOI:
2564 ph10 836 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2565 nigel 77 break;
2566     #endif
2567 ph10 503
2568 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2569     string. */
2570    
2571     case OP_MARK:
2572     case OP_PRUNE_ARG:
2573     case OP_SKIP_ARG:
2574     code += code[1];
2575 ph10 512 break;
2576 ph10 510
2577 ph10 550 case OP_THEN_ARG:
2578 ph10 716 code += code[1];
2579 ph10 550 break;
2580    
2581 ph10 503 /* None of the remaining opcodes are required to match a character. */
2582 ph10 507
2583 ph10 503 default:
2584 ph10 507 break;
2585 nigel 77 }
2586     }
2587    
2588     return TRUE;
2589     }
2590    
2591    
2592    
2593     /*************************************************
2594     * Scan compiled regex for non-emptiness *
2595     *************************************************/
2596    
2597     /* This function is called to check for left recursive calls. We want to check
2598     the current branch of the current pattern to see if it could match the empty
2599     string. If it could, we must look outwards for branches at other levels,
2600     stopping when we pass beyond the bracket which is the subject of the recursion.
2601 ph10 654 This function is called only during the real compile, not during the
2602 ph10 624 pre-compile.
2603 nigel 77
2604     Arguments:
2605     code points to start of the recursion
2606     endcode points to where to stop (current RECURSE item)
2607     bcptr points to the chain of current (unclosed) branch starts
2608 ph10 836 utf TRUE if in UTF-8 / UTF-16 mode
2609 ph10 507 cd pointers to tables etc
2610 nigel 77
2611     Returns: TRUE if what is matched could be empty
2612     */
2613    
2614     static BOOL
2615 ph10 836 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2616     branch_chain *bcptr, BOOL utf, compile_data *cd)
2617 nigel 77 {
2618 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2619 nigel 77 {
2620 ph10 836 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2621 ph10 475 return FALSE;
2622 nigel 77 bcptr = bcptr->outer;
2623     }
2624     return TRUE;
2625     }
2626    
2627    
2628    
2629     /*************************************************
2630     * Check for POSIX class syntax *
2631     *************************************************/
2632    
2633     /* This function is called when the sequence "[:" or "[." or "[=" is
2634 ph10 295 encountered in a character class. It checks whether this is followed by a
2635 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2636 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2637 nigel 77
2638 ph10 298 Originally, this function only recognized a sequence of letters between the
2639     terminators, but it seems that Perl recognizes any sequence of characters,
2640     though of course unknown POSIX names are subsequently rejected. Perl gives an
2641     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2642     didn't consider this to be a POSIX class. Likewise for [:1234:].
2643 ph10 295
2644 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2645     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2646     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2647     below handles the special case of \], but does not try to do any other escape
2648     processing. This makes it different from Perl for cases such as [:l\ower:]
2649 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2650 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2651 ph10 295 I think.
2652    
2653 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2654     It seems that the appearance of a nested POSIX class supersedes an apparent
2655     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2656 ph10 691 a digit.
2657 ph10 640
2658 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2659     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2660     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2661 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2662 ph10 661 names.
2663    
2664 ph10 295 Arguments:
2665 nigel 77 ptr pointer to the initial [
2666     endptr where to return the end pointer
2667    
2668     Returns: TRUE or FALSE
2669     */
2670    
2671     static BOOL
2672 ph10 836 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2673 nigel 77 {
2674     int terminator; /* Don't combine these lines; the Solaris cc */
2675     terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2676 ph10 295 for (++ptr; *ptr != 0; ptr++)
2677 nigel 77 {
2678 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2679     ptr++;
2680 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2681 ph10 640 else
2682 ph10 298 {
2683 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2684 ph10 295 {
2685     *endptr = ptr;
2686     return TRUE;
2687 ph10 298 }
2688 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2689     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2690     ptr[1] == CHAR_EQUALS_SIGN) &&
2691     check_posix_syntax(ptr, endptr))
2692 ph10 654 return FALSE;
2693 ph10 298 }
2694     }
2695 nigel 77 return FALSE;
2696     }
2697    
2698    
2699    
2700    
2701     /*************************************************
2702     * Check POSIX class name *
2703     *************************************************/
2704    
2705     /* This function is called to check the name given in a POSIX-style class entry
2706     such as [:alnum:].
2707    
2708     Arguments:
2709     ptr points to the first letter
2710     len the length of the name
2711    
2712     Returns: a value representing the name, or -1 if unknown
2713     */
2714    
2715     static int
2716 ph10 836 check_posix_name(const pcre_uchar *ptr, int len)
2717 nigel 77 {
2718 ph10 240 const char *pn = posix_names;
2719 nigel 77 register int yield = 0;
2720     while (posix_name_lengths[yield] != 0)
2721     {
2722     if (len == posix_name_lengths[yield] &&
2723 ph10 836 STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2724 ph10 243 pn += posix_name_lengths[yield] + 1;
2725 nigel 77 yield++;
2726     }
2727     return -1;
2728     }
2729    
2730    
2731     /*************************************************
2732     * Adjust OP_RECURSE items in repeated group *
2733     *************************************************/
2734    
2735     /* OP_RECURSE items contain an offset from the start of the regex to the group
2736     that is referenced. This means that groups can be replicated for fixed
2737     repetition simply by copying (because the recursion is allowed to refer to
2738     earlier groups that are outside the current group). However, when a group is
2739 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2740     inserted before it, after it has been compiled. This means that any OP_RECURSE
2741     items within it that refer to the group itself or any contained groups have to
2742     have their offsets adjusted. That is one of the jobs of this function. Before it
2743     is called, the partially compiled regex must be temporarily terminated with
2744     OP_END.
2745 nigel 77
2746 nigel 93 This function has been extended with the possibility of forward references for
2747     recursions and subroutine calls. It must also check the list of such references
2748     for the group we are dealing with. If it finds that one of the recursions in
2749     the current group is on this list, it adjusts the offset in the list, not the
2750     value in the reference (which is a group number).
2751    
2752 nigel 77 Arguments:
2753     group points to the start of the group
2754     adjust the amount by which the group is to be moved
2755 ph10 836 utf TRUE in UTF-8 / UTF-16 mode
2756 nigel 77 cd contains pointers to tables etc.
2757 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2758 nigel 77
2759     Returns: nothing
2760     */
2761    
2762     static void
2763 ph10 836 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2764     pcre_uchar *save_hwm)
2765 nigel 77 {
2766 ph10 836 pcre_uchar *ptr = group;
2767 ph10 224
2768 ph10 836 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2769 nigel 77 {
2770 nigel 93 int offset;
2771 ph10 836 pcre_uchar *hc;
2772 nigel 93
2773     /* See if this recursion is on the forward reference list. If so, adjust the
2774     reference. */
2775 ph10 345
2776 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2777     {
2778     offset = GET(hc, 0);
2779     if (cd->start_code + offset == ptr + 1)
2780     {
2781     PUT(hc, 0, offset + adjust);
2782     break;
2783     }
2784     }
2785    
2786     /* Otherwise, adjust the recursion offset if it's after the start of this
2787     group. */
2788    
2789     if (hc >= cd->hwm)
2790     {
2791     offset = GET(ptr, 1);
2792     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2793     }
2794    
2795 nigel 77 ptr += 1 + LINK_SIZE;
2796     }
2797     }
2798    
2799    
2800    
2801     /*************************************************
2802     * Insert an automatic callout point *
2803     *************************************************/
2804    
2805     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2806     callout points before each pattern item.
2807    
2808     Arguments:
2809     code current code pointer
2810     ptr current pattern pointer
2811     cd pointers to tables etc
2812    
2813     Returns: new code pointer
2814     */
2815    
2816 ph10 836 static pcre_uchar *
2817     auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2818 nigel 77 {
2819     *code++ = OP_CALLOUT;
2820     *code++ = 255;
2821 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2822     PUT(code, LINK_SIZE, 0); /* Default length */
2823 ph10 836 return code + 2 * LINK_SIZE;
2824 nigel 77 }
2825    
2826    
2827    
2828     /*************************************************
2829     * Complete a callout item *
2830     *************************************************/
2831    
2832     /* A callout item contains the length of the next item in the pattern, which
2833     we can't fill in till after we have reached the relevant point. This is used
2834     for both automatic and manual callouts.
2835    
2836     Arguments:
2837     previous_callout points to previous callout item
2838     ptr current pattern pointer
2839     cd pointers to tables etc
2840    
2841     Returns: nothing
2842     */
2843    
2844     static void
2845 ph10 836 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2846 nigel 77 {
2847 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2848 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2849     }
2850    
2851    
2852    
2853     #ifdef SUPPORT_UCP
2854     /*************************************************
2855     * Get othercase range *
2856     *************************************************/
2857    
2858     /* This function is passed the start and end of a class range, in UTF-8 mode
2859     with UCP support. It searches up the characters, looking for internal ranges of
2860     characters in the "other" case. Each call returns the next one, updating the
2861     start address.
2862    
2863     Arguments:
2864     cptr points to starting character value; updated
2865     d end value
2866     ocptr where to put start of othercase range
2867     odptr where to put end of othercase range
2868    
2869     Yield: TRUE when range returned; FALSE when no more
2870     */
2871    
2872     static BOOL
2873 nigel 93 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2874     unsigned int *odptr)
2875 nigel 77 {
2876 nigel 93 unsigned int c, othercase, next;
2877 nigel 77
2878     for (c = *cptr; c <= d; c++)
2879 ph10 349 { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2880 nigel 77
2881     if (c > d) return FALSE;
2882    
2883     *ocptr = othercase;
2884     next = othercase + 1;
2885    
2886     for (++c; c <= d; c++)
2887     {
2888 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2889 nigel 77 next++;
2890     }
2891    
2892     *odptr = next - 1;
2893     *cptr = c;
2894    
2895     return TRUE;
2896     }
2897 ph10 532
2898    
2899    
2900     /*************************************************
2901     * Check a character and a property *
2902     *************************************************/
2903    
2904     /* This function is called by check_auto_possessive() when a property item
2905     is adjacent to a fixed character.
2906    
2907     Arguments:
2908     c the character
2909     ptype the property type
2910     pdata the data for the type
2911     negated TRUE if it's a negated property (\P or \p{^)
2912 ph10 535
2913 ph10 532 Returns: TRUE if auto-possessifying is OK
2914 ph10 535 */
2915 ph10 532
2916     static BOOL
2917     check_char_prop(int c, int ptype, int pdata, BOOL negated)
2918     {
2919     const ucd_record *prop = GET_UCD(c);
2920     switch(ptype)
2921     {
2922     case PT_LAMP:
2923     return (prop->chartype == ucp_Lu ||
2924     prop->chartype == ucp_Ll ||
2925     prop->chartype == ucp_Lt) == negated;
2926    
2927     case PT_GC:
2928 ph10 836 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2929 ph10 532
2930     case PT_PC:
2931     return (pdata == prop->chartype) == negated;
2932    
2933     case PT_SC:
2934     return (pdata == prop->script) == negated;
2935    
2936     /* These are specials */
2937    
2938     case PT_ALNUM:
2939 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2940     PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2941 ph10 532
2942     case PT_SPACE: /* Perl space */
2943 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2944 ph10 532 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2945     == negated;
2946    
2947     case PT_PXSPACE: /* POSIX space */
2948 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2949 ph10 532 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2950     c == CHAR_FF || c == CHAR_CR)
2951     == negated;
2952    
2953     case PT_WORD:
2954 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2955     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2956 ph10 532 c == CHAR_UNDERSCORE) == negated;
2957     }
2958 ph10 535 return FALSE;
2959 ph10 532 }
2960 nigel 77 #endif /* SUPPORT_UCP */
2961    
2962    
2963 nigel 93
2964 nigel 77 /*************************************************
2965 nigel 93 * Check if auto-possessifying is possible *
2966     *************************************************/
2967    
2968     /* This function is called for unlimited repeats of certain items, to see
2969     whether the next thing could possibly match the repeated item. If not, it makes
2970     sense to automatically possessify the repeated item.
2971    
2972     Arguments:
2973 ph10 532 previous pointer to the repeated opcode
2974 ph10 836 utf TRUE in UTF-8 / UTF-16 mode
2975 nigel 93 ptr next character in pattern
2976     options options bits
2977     cd contains pointers to tables etc.
2978    
2979     Returns: TRUE if possessifying is wanted
2980     */
2981    
2982     static BOOL
2983 ph10 836 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2984     const pcre_uchar *ptr, int options, compile_data *cd)
2985 nigel 93 {
/* Outline: (1) in extended mode, skip whitespace and #-comments; (2) read the
next pattern item into "next"; (3) skip whitespace and comments again; (4) give
up if the next item is followed by *, ? or {0, ; (5) compare the next item with
the repeated opcode. Returning TRUE means the next item cannot match what the
repeated item matches, so the repeat can be made possessive. */
2986 ph10 836 pcre_int32 c, next; /* c: character taken from the repeated item; next: following item (>= 0 character, < 0 escape) */
2987 ph10 532 int op_code = *previous++; /* Pick up the opcode; previous now points at the data that follows it */
2988 nigel 93
2989     /* Skip whitespace and comments in extended mode */
2990    
2991     if ((options & PCRE_EXTENDED) != 0)
2992     {
2993     for (;;)
2994     {
2995 ph10 836 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2996 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
2997 nigel 93 {
2998 ph10 579 ptr++; /* Step over the # itself */
2999 ph10 556 while (*ptr != 0)
3000     {
3001 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3002 ph10 556 ptr++;
3003 ph10 836 #ifdef SUPPORT_UTF
3004     if (utf) FORWARDCHAR(ptr);
3005 ph10 556 #endif
3006     }
3007 nigel 93 }
3008     else break;
3009     }
3010     }
3011    
3012     /* If the next item is one that we can handle, get its value. A non-negative
3013     value is a character, a negative value is an escape value. */
3014    
3015 ph10 391 if (*ptr == CHAR_BACKSLASH)
3016 nigel 93 {
3017     int temperrorcode = 0;
3018     next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3019     if (temperrorcode != 0) return FALSE;
3020     ptr++; /* Point after the escape sequence */
3021     }
3022 ph10 836 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3023 nigel 93 {
3024 ph10 836 #ifdef SUPPORT_UTF
3025     if (utf) { GETCHARINC(next, ptr); } else
3026 nigel 93 #endif
3027     next = *ptr++;
3028     }
3029     else return FALSE; /* An unescaped metacharacter: not an item handled here */
3030    
3031     /* Skip whitespace and comments in extended mode */
3032    
3033     if ((options & PCRE_EXTENDED) != 0)
3034     {
3035     for (;;)
3036     {
3037 ph10 836 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3038 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
3039 nigel 93 {
3040 ph10 579 ptr++; /* Step over the # itself */
3041 ph10 556 while (*ptr != 0)
3042     {
3043 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3044 ph10 556 ptr++;
3045 ph10 836 #ifdef SUPPORT_UTF
3046     if (utf) FORWARDCHAR(ptr);
3047 ph10 556 #endif
3048     }
3049 nigel 93 }
3050     else break;
3051     }
3052     }
3053    
3054     /* If the next thing is itself optional, we have to give up. */
3055    
3056 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3057 ph10 836 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3058 ph10 391 return FALSE;
3059 nigel 93
3060 ph10 532 /* Now compare the next item with the previous opcode. First, handle cases when
3061     the next item is a character. */
3062 nigel 93
/* Every path through this switch returns, so the code after it is reached only
when next is negative, that is, when the next item is an escape. */
3063     if (next >= 0) switch(op_code)
3064     {
3065     case OP_CHAR:
3066 ph10 836 #ifdef SUPPORT_UTF
3067 ph10 532 GETCHARTEST(c, previous);
3068 ph10 369 #else
3069 ph10 532 c = *previous;
3070 ph10 535 #endif
3071     return c != next; /* A different literal cannot match the repeated one */
3072 nigel 93
3073 ph10 602 /* For CHARI (caseless character) we must check the other case. If we have
3074 nigel 93 Unicode property support, we can use it to test the other case of
3075     high-valued characters. */
3076    
3077 ph10 602 case OP_CHARI:
3078 ph10 836 #ifdef SUPPORT_UTF
3079 ph10 532 GETCHARTEST(c, previous);
3080     #else
3081     c = *previous;
3082 ph10 535 #endif
3083 ph10 532 if (c == next) return FALSE;
3084 ph10 836 #ifdef SUPPORT_UTF
3085     if (utf)
3086 nigel 93 {
3087     unsigned int othercase;
3088     if (next < 128) othercase = cd->fcc[next]; else
3089     #ifdef SUPPORT_UCP
3090 ph10 349 othercase = UCD_OTHERCASE((unsigned int)next);
3091 nigel 93 #else
3092     othercase = NOTACHAR;
3093     #endif
3094 ph10 532 return (unsigned int)c != othercase;
3095 nigel 93 }
3096     else
3097 ph10 836 #endif /* SUPPORT_UTF */
3098 ph10 904 return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
3099 nigel 93
/* For a negated character the tests are the other way round: possessifying is
wanted only when the next character is the excluded one (or, for NOTI, its
other case). */
3100     case OP_NOT:
3101 zherczeg 924 #ifdef SUPPORT_UTF
3102     GETCHARTEST(c, previous);
3103     #else
3104     c = *previous;
3105     #endif
3106     return c == next;
3107 ph10 604
3108     case OP_NOTI:
3109 ph10 836 #ifdef SUPPORT_UTF
3110 zherczeg 924 GETCHARTEST(c, previous);
3111     #else
3112     c = *previous;
3113     #endif
3114     if (c == next) return TRUE;
3115     #ifdef SUPPORT_UTF
3116 ph10 836 if (utf)
3117 nigel 93 {
3118     unsigned int othercase;
3119     if (next < 128) othercase = cd->fcc[next]; else
3120     #ifdef SUPPORT_UCP
3121 zherczeg 924 othercase = UCD_OTHERCASE((unsigned int)next);
3122 nigel 93 #else
3123     othercase = NOTACHAR;
3124     #endif
3125 ph10 532 return (unsigned int)c == othercase;
3126 nigel 93 }
3127     else
3128 ph10 836 #endif /* SUPPORT_UTF */
3129 zherczeg 924 return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
3130 nigel 93
3131 ph10 535 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3132     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3133    
/* In the six cases below the ctypes table is consulted only for values up to
127; a next character above 127 is treated as not being a digit, space, or
word character. */
3134 nigel 93 case OP_DIGIT:
3135     return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
3136    
3137     case OP_NOT_DIGIT:
3138     return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
3139    
3140     case OP_WHITESPACE:
3141     return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
3142    
3143     case OP_NOT_WHITESPACE:
3144     return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
3145    
3146     case OP_WORDCHAR:
3147     return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
3148    
3149     case OP_NOT_WORDCHAR:
3150     return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
3151    
3152 ph10 180 case OP_HSPACE:
3153     case OP_NOT_HSPACE:
3154     switch(next)
3155     {
3156     case 0x09:
3157     case 0x20:
3158     case 0xa0:
3159     case 0x1680:
3160     case 0x180e:
3161     case 0x2000:
3162     case 0x2001:
3163     case 0x2002:
3164     case 0x2003:
3165     case 0x2004:
3166     case 0x2005:
3167     case 0x2006:
3168     case 0x2007:
3169     case 0x2008:
3170     case 0x2009:
3171     case 0x200A:
3172     case 0x202f:
3173     case 0x205f:
3174     case 0x3000:
3175 ph10 528 return op_code == OP_NOT_HSPACE;
3176 ph10 180 default:
3177 ph10 528 return op_code != OP_NOT_HSPACE;
3178 ph10 180 }
3179    
/* OP_ANYNL shares the character list below with OP_VSPACE, and the returned
values for it are the same as for OP_VSPACE. */
3180 ph10 528 case OP_ANYNL:
3181 ph10 180 case OP_VSPACE:
3182     case OP_NOT_VSPACE:
3183     switch(next)
3184     {
3185     case 0x0a:
3186     case 0x0b:
3187     case 0x0c:
3188     case 0x0d:
3189     case 0x85:
3190     case 0x2028:
3191     case 0x2029:
3192 ph10 528 return op_code == OP_NOT_VSPACE;
3193 ph10 180 default:
3194 ph10 528 return op_code != OP_NOT_VSPACE;
3195 ph10 180 }
3196    
/* For the property opcodes, the two data items that follow the opcode are
passed to check_char_prop() as the property type and the property data. */
3197 ph10 532 #ifdef SUPPORT_UCP
3198     case OP_PROP:
3199     return check_char_prop(next, previous[0], previous[1], FALSE);
3200 ph10 535
3201 ph10 532 case OP_NOTPROP:
3202     return check_char_prop(next, previous[0], previous[1], TRUE);
3203     #endif
3204    
3205 nigel 93 default:
3206     return FALSE;
3207     }
3208    
3209    
3210 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3211     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3212     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3213     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3214 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3215 nigel 93
3216     switch(op_code)
3217     {
3218     case OP_CHAR:
3219 ph10 602 case OP_CHARI:
3220 ph10 836 #ifdef SUPPORT_UTF
3221 ph10 532 GETCHARTEST(c, previous);
3222     #else
3223     c = *previous;
3224 ph10 535 #endif
3225 nigel 93 switch(-next) /* next is negative here; its negation is the ESC_ value */
3226     {
3227     case ESC_d:
3228 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
3229 nigel 93
3230     case ESC_D:
3231 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
3232 nigel 93
3233     case ESC_s:
3234 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
3235 nigel 93
3236     case ESC_S:
3237 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
3238 nigel 93
3239     case ESC_w:
3240 ph10 532 return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
3241 nigel 93
3242     case ESC_W:
3243 ph10 532 return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
3244 ph10 182
3245 ph10 180 case ESC_h:
3246     case ESC_H:
3247 ph10 532 switch(c)
3248 ph10 180 {
3249     case 0x09:
3250     case 0x20:
3251     case 0xa0:
3252     case 0x1680:
3253     case 0x180e:
3254     case 0x2000:
3255     case 0x2001:
3256     case 0x2002:
3257     case 0x2003:
3258     case 0x2004:
3259     case 0x2005:
3260     case 0x2006:
3261     case 0x2007:
3262     case 0x2008:
3263     case 0x2009:
3264     case 0x200A:
3265     case 0x202f:
3266     case 0x205f:
3267     case 0x3000:
3268     return -next != ESC_h;
3269     default:
3270     return -next == ESC_h;
3271 ph10 182 }
3272    
3273 ph10 180 case ESC_v:
3274     case ESC_V:
3275 ph10 532 switch(c)
3276 ph10 180 {
3277     case 0x0a:
3278     case 0x0b:
3279     case 0x0c:
3280     case 0x0d:
3281     case 0x85:
3282     case 0x2028:
3283     case 0x2029:
3284     return -next != ESC_v;
3285     default:
3286     return -next == ESC_v;
3287 ph10 182 }
3288 ph10 535
3289     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3290     their substitutions and process them. The result will always be either
3291 ph10 532 -ESC_p or -ESC_P. Then fall through to process those values. */
3292 ph10 535
3293 ph10 532 #ifdef SUPPORT_UCP
3294     case ESC_du:
3295     case ESC_DU:
3296     case ESC_wu:
3297     case ESC_WU:
3298     case ESC_su:
3299     case ESC_SU:
3300     {
3301     int temperrorcode = 0;
3302     ptr = substitutes[-next - ESC_DU]; /* NOTE(review): substitutes[] is defined elsewhere; from here on ptr no longer points into the pattern */
3303     next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3304     if (temperrorcode != 0) return FALSE;
3305     ptr++; /* For compatibility */
3306     }
3307 ph10 535 /* Fall through */
3308 nigel 93
3309 ph10 532 case ESC_p:
3310     case ESC_P:
3311     {
3312     int ptype, pdata, errorcodeptr;
3313 ph10 535 BOOL negated;
3314    
3315 ph10 532 ptr--; /* Make ptr point at the p or P */
3316     ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3317     if (ptype < 0) return FALSE;
3318     ptr++; /* Point past the final curly ket */
3319 ph10 535
3320 ph10 532 /* If the property item is optional, we have to give up. (When generated
3321     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3322     to the original \d etc. At this point, ptr will point to a zero byte.) */
3323 ph10 535
3324 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3325 ph10 836 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3326 ph10 532 return FALSE;
3327 ph10 535
3328 ph10 532 /* Do the property check. */
3329 ph10 535
/* The sense passed on is inverted when exactly one of "the escape is \P" and
"get_ucp() reported a negation" is true. */
3330 ph10 532 return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3331 ph10 535 }
3332 ph10 532 #endif
3333    
3334 nigel 93 default:
3335     return FALSE;
3336     }
3337    
3338 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3339     well. It means re-organizing the above code so as to get hold of the property
3340     values before switching on the op-code. However, I wonder how many patterns
3341     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3342     these op-codes are never generated.) */
3343    
/* In the remaining cases both the repeated item and the next item are
character-type escapes; each case lists the escapes for which TRUE is
returned. */
3344 nigel 93 case OP_DIGIT:
3345 ph10 180 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3346 ph10 528 next == -ESC_h || next == -ESC_v || next == -ESC_R;
3347 nigel 93
3348     case OP_NOT_DIGIT:
3349     return next == -ESC_d;
3350    
3351     case OP_WHITESPACE:
3352 ph10 528 return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
3353 nigel 93
3354     case OP_NOT_WHITESPACE:
3355 ph10 180 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
3356 nigel 93
3357 ph10 180 case OP_HSPACE:
3358 ph10 535 return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3359 ph10 528 next == -ESC_w || next == -ESC_v || next == -ESC_R;
3360 ph10 180
3361     case OP_NOT_HSPACE:
3362     return next == -ESC_h;
3363 ph10 182
3364 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3365 ph10 535 case OP_ANYNL:
3366 ph10 182 case OP_VSPACE:
3367 ph10 180 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3368    
3369     case OP_NOT_VSPACE:
3370 ph10 528 return next == -ESC_v || next == -ESC_R;
3371 ph10 180
3372 nigel 93 case OP_WORDCHAR:
3373 ph10 535 return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3374 ph10 528 next == -ESC_v || next == -ESC_R;
3375 nigel 93
3376     case OP_NOT_WORDCHAR:
3377     return next == -ESC_w || next == -ESC_d;
3378 ph10 182
3379 nigel 93 default:
3380     return FALSE;
3381     }
3382    
3383     /* Control does not reach here */
3384     }
3385    
3386    
3387    
3388     /*************************************************
3389 nigel 77 * Compile one branch *
3390     *************************************************/
3391    
3392 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3393 nigel 77 changed during the branch, the pointer is used to change the external options
3394 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3395     to find out the amount of memory needed, as well as during the real compile
3396     phase. The value of lengthptr distinguishes the two phases.
3397 nigel 77
3398     Arguments:
3399     optionsptr pointer to the option bits
3400     codeptr points to the pointer to the current code point
3401     ptrptr points to the current pattern pointer
3402     errorcodeptr points to error code variable
3403 ph10 836 firstcharptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3404     reqcharptr set to the last literal character required, else < 0
3405 nigel 77 bcptr points to current branch chain
3406 ph10 654 cond_depth conditional nesting depth
3407 nigel 77 cd contains pointers to tables etc.
3408 nigel 93 lengthptr NULL during the real compile phase
3409     points to length accumulator during pre-compile phase
3410 nigel 77
3411     Returns: TRUE on success
3412     FALSE, with *errorcodeptr set non-zero on error
3413     */
3414    
3415     static BOOL
3416 ph10 836 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3417     const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3418     pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3419     compile_data *cd, int *lengthptr)
3420 nigel 77 {
3421     int repeat_type, op_type;
3422     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3423     int bravalue = 0;
3424     int greedy_default, greedy_non_default;
3425 ph10 836 pcre_int32 firstchar, reqchar;
3426     pcre_int32 zeroreqchar, zerofirstchar;
3427     pcre_int32 req_caseopt, reqvary, tempreqvary;
3428 ph10 635 int options = *optionsptr; /* May change dynamically */
3429 nigel 77 int after_manual_callout = 0;
3430 nigel 93 int length_prevgroup = 0;
3431 nigel 77 register int c;
3432 ph10 836 register pcre_uchar *code = *codeptr;
3433     pcre_uchar *last_code = code;
3434     pcre_uchar *orig_code = code;
3435     pcre_uchar *tempcode;
3436 nigel 77 BOOL inescq = FALSE;
3437 ph10 836 BOOL groupsetfirstchar = FALSE;
3438     const pcre_uchar *ptr = *ptrptr;
3439     const pcre_uchar *tempptr;
3440     const pcre_uchar *nestptr = NULL;
3441     pcre_uchar *previous = NULL;
3442     pcre_uchar *previous_callout = NULL;
3443     pcre_uchar *save_hwm = NULL;
3444     pcre_uint8 classbits[32];
3445 nigel 77
3446 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3447 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3448 ph10 635 dynamically as we process the pattern. */
3449    
3450 ph10 836 #ifdef SUPPORT_UTF
3451     /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3452     BOOL utf = (options & PCRE_UTF8) != 0;
3453     pcre_uchar utf_chars[6];
3454 nigel 77 #else
3455 ph10 836 BOOL utf = FALSE;
3456 nigel 77 #endif
3457    
3458 ph10 836 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3459    
3460     #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3461     BOOL xclass;
3462     pcre_uchar *class_uchardata;
3463     pcre_uchar *class_uchardata_base;
3464     #endif
3465    
3466 ph10 475 #ifdef PCRE_DEBUG
3467 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3468     #endif
3469    
3470 nigel 77 /* Set up the default and non-default settings for greediness */
3471    
3472     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3473     greedy_non_default = greedy_default ^ 1;
3474    
3475     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3476     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3477 ph10 836 matches a non-fixed first char; reqchar just remains unset if we never
3478 nigel 77 find one.
3479    
3480     When we hit a repeat whose minimum is zero, we may have to adjust these values
3481     to take the zero repeat into account. This is implemented by setting them to
3482 ph10 836 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
3483 nigel 77 item types that can be repeated set these backoff variables appropriately. */
3484    
3485 ph10 836 firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3486 nigel 77
3487 ph10 836 /* The variable req_caseopt contains either the REQ_CASELESS value
3488     or zero, according to the current setting of the caseless flag. The
3489     REQ_CASELESS leaves the lower 28 bits empty. It is added into the
3490     firstchar or reqchar variables to record the case status of the
3491     value. This is used only for ASCII characters. */
3492 nigel 77
3493 ph10 836 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3494 nigel 77
3495     /* Switch on next character until the end of the branch */
3496    
3497     for (;; ptr++)
3498     {
3499     BOOL negate_class;
3500 ph10 286 BOOL should_flip_negation;
3501 nigel 77 BOOL possessive_quantifier;
3502     BOOL is_quantifier;
3503 nigel 93 BOOL is_recurse;
3504 ph10 180 BOOL reset_bracount;
3505 ph10 836 int class_has_8bitchar;
3506     int class_single_char;
3507 nigel 77 int newoptions;
3508     int recno;
3509 ph10 172 int refsign;
3510 nigel 77 int skipbytes;
3511 ph10 836 int subreqchar;
3512     int subfirstchar;
3513 nigel 93 int terminator;
3514 nigel 77 int mclength;
3515 ph10 733 int tempbracount;
3516 ph10 836 pcre_uchar mcbuffer[8];
3517 nigel 77
3518 ph10 836 /* Get next character in the pattern */
3519 nigel 77
3520     c = *ptr;
3521 ph10 345
3522 ph10 535 /* If we are at the end of a nested substitution, revert to the outer level
3523 ph10 518 string. Nesting only happens one level deep. */
3524    
3525     if (c == 0 && nestptr != NULL)
3526     {
3527     ptr = nestptr;
3528     nestptr = NULL;
3529     c = *ptr;
3530     }
3531    
3532 nigel 93 /* If we are in the pre-compile phase, accumulate the length used for the
3533     previous cycle of this loop. */
3534    
3535     if (lengthptr != NULL)
3536     {
3537 ph10 475 #ifdef PCRE_DEBUG
3538 nigel 93 if (code > cd->hwm) cd->hwm = code; /* High water info */
3539     #endif
3540 ph10 836 if (code > cd->start_workspace + cd->workspace_size -
3541     WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */
3542 nigel 93 {
3543     *errorcodeptr = ERR52;
3544     goto FAILED;
3545     }
3546    
3547     /* There is at least one situation where code goes backwards: this is the
3548     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3549     the class is simply eliminated. However, it is created first, so we have to
3550     allow memory for it. Therefore, don't ever reduce the length at this point.
3551     */
3552    
3553     if (code < last_code) code = last_code;
3554 ph10 202
3555     /* Paranoid check for integer overflow */
3556    
3557     if (OFLOW_MAX - *lengthptr < code - last_code)
3558     {
3559     *errorcodeptr = ERR20;
3560     goto FAILED;
3561     }
3562    
3563 ph10 530 *lengthptr += (int)(code - last_code);
3564 ph10 836 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3565     (int)(code - last_code), c, c));
3566 ph10 842
3567 nigel 93 /* If "previous" is set and it is not at the start of the work space, move
3568     it back to there, in order to avoid filling up the work space. Otherwise,
3569     if "previous" is NULL, reset the current code pointer to the start. */
3570    
3571     if (previous != NULL)
3572     {
3573     if (previous > orig_code)
3574     {
3575 ph10 836 memmove(orig_code, previous, IN_UCHARS(code - previous));
3576 nigel 93 code -= previous - orig_code;
3577     previous = orig_code;
3578     }
3579     }
3580     else code = orig_code;
3581    
3582     /* Remember where this code item starts so we can pick up the length
3583     next time round. */
3584    
3585     last_code = code;
3586     }
3587    
3588     /* In the real compile phase, just check the workspace used by the forward
3589     reference list. */
3590    
3591 ph10 836 else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3592     WORK_SIZE_SAFETY_MARGIN)
3593 nigel 93 {
3594     *errorcodeptr = ERR52;
3595     goto FAILED;
3596     }
3597    
3598 nigel 77 /* If in \Q...\E, check for the end; if not, we have a literal */
3599    
3600     if (inescq && c != 0)
3601     {
3602 ph10 391 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3603 nigel 77 {
3604     inescq = FALSE;
3605     ptr++;
3606     continue;
3607     }
3608     else
3609     {
3610     if (previous_callout != NULL)
3611     {
3612 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3613     complete_callout(previous_callout, ptr, cd);
3614 nigel 77 previous_callout = NULL;
3615     }
3616     if ((options & PCRE_AUTO_CALLOUT) != 0)
3617     {
3618     previous_callout = code;
3619     code = auto_callout(code, ptr, cd);
3620     }
3621     goto NORMAL_CHAR;
3622     }
3623     }
3624    
3625     /* Fill in length of a previous callout, except when the next thing is
3626     a quantifier. */
3627    
3628 ph10 392 is_quantifier =
3629 ph10 391 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3630     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3631 nigel 77
3632     if (!is_quantifier && previous_callout != NULL &&
3633     after_manual_callout-- <= 0)
3634     {
3635 nigel 93 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
3636     complete_callout(previous_callout, ptr, cd);
3637 nigel 77 previous_callout = NULL;
3638     }
3639    
3640 ph10 635 /* In extended mode, skip white space and comments. */
3641 nigel 77
3642     if ((options & PCRE_EXTENDED) != 0)
3643     {
3644 ph10 836 if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3645 ph10 391 if (c == CHAR_NUMBER_SIGN)
3646 nigel 77 {
3647 ph10 579 ptr++;
3648 ph10 556 while (*ptr != 0)
3649 nigel 91 {
3650 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3651 ph10 556 ptr++;
3652 ph10 836 #ifdef SUPPORT_UTF
3653     if (utf) FORWARDCHAR(ptr);
3654 ph10 556 #endif
3655 nigel 91 }
3656 nigel 93 if (*ptr != 0) continue;
3657    
3658 nigel 91 /* Else fall through to handle end of string */
3659     c = 0;
3660 nigel 77 }
3661     }
3662    
3663     /* No auto callout for quantifiers. */
3664    
3665     if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3666     {
3667     previous_callout = code;
3668     code = auto_callout(code, ptr, cd);
3669     }
3670    
3671     switch(c)
3672     {
3673 nigel 93 /* ===================================================================*/
3674     case 0: /* The branch terminates at string end */
3675 ph10 391 case CHAR_VERTICAL_LINE: /* or | or ) */
3676     case CHAR_RIGHT_PARENTHESIS:
3677 ph10 836 *firstcharptr = firstchar;
3678     *reqcharptr = reqchar;
3679 nigel 77 *codeptr = code;
3680     *ptrptr = ptr;
3681 nigel 93 if (lengthptr != NULL)
3682     {
3683 ph10 202 if (OFLOW_MAX - *lengthptr < code - last_code)
3684     {
3685     *errorcodeptr = ERR20;
3686     goto FAILED;
3687     }
3688 ph10 530 *lengthptr += (int)(code - last_code); /* To include callout length */
3689 nigel 93 DPRINTF((">> end branch\n"));
3690     }
3691 nigel 77 return TRUE;
3692    
3693 nigel 93
3694     /* ===================================================================*/
3695 nigel 77 /* Handle single-character metacharacters. In multiline mode, ^ disables
3696     the setting of any following char as a first character. */
3697    
3698 ph10 391 case CHAR_CIRCUMFLEX_ACCENT:
3699 ph10 602 previous = NULL;
3700 nigel 77 if ((options & PCRE_MULTILINE) != 0)
3701     {
3702 ph10 836 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3703 ph10 602 *code++ = OP_CIRCM;
3704 nigel 77 }
3705 ph10 602 else *code++ = OP_CIRC;
3706 nigel 77 break;
3707    
3708 ph10 391 case CHAR_DOLLAR_SIGN:
3709 nigel 77 previous = NULL;
3710 ph10 602 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3711 nigel 77 break;
3712    
3713     /* There can never be a first char if '.' is first, whatever happens about
3714 ph10 836 repeats. The value of reqchar doesn't change either. */
3715 nigel 77
3716 ph10 391 case CHAR_DOT:
3717 ph10 836 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3718     zerofirstchar = firstchar;
3719     zeroreqchar = reqchar;
3720 nigel 77 previous = code;
3721 ph10 342 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3722 nigel 77 break;
3723    
3724 nigel 93
3725     /* ===================================================================*/
3726 nigel 87 /* Character classes. If the included characters are all < 256, we build a
3727     32-byte bitmap of the permitted characters, except in the special case
3728     where there is only one such character. For negated classes, we build the
3729     map as usual, then invert it at the end. However, we use a different opcode
3730     so that data characters > 255 can be handled correctly.
3731 nigel 77
3732     If the class contains characters outside the 0-255 range, a different
3733     opcode is compiled. It may optionally have a bit map for characters < 256,
3734     but those above are explicitly listed afterwards. A flag byte tells
3735     whether the bitmap is present, and whether this is a negated class or not.
3736 ph10 345
3737 ph10 336 In JavaScript compatibility mode, an isolated ']' causes an error. In
3738     default (Perl) mode, it is treated as a data character. */
3739 ph10 345
3740 ph10 391 case CHAR_RIGHT_SQUARE_BRACKET:
3741 ph10 336 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3742     {
3743     *errorcodeptr = ERR64;
3744 ph10 345 goto FAILED;
3745 ph10 336 }
3746 ph10 345 goto NORMAL_CHAR;
3747 nigel 77
3748 ph10 391 case CHAR_LEFT_SQUARE_BRACKET:
3749 nigel 77 previous = code;
3750    
3751     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3752     they are encountered at the top level, so we'll do that too. */
3753    
3754 ph10 392 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3755 ph10 391 ptr[1] == CHAR_EQUALS_SIGN) &&
3756 ph10 295 check_posix_syntax(ptr, &tempptr))
3757 nigel 77 {
3758 ph10 391 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3759 nigel 77 goto FAILED;
3760     }
3761    
3762 ph10 205 /* If the first character is '^', set the negation flag and skip it. Also,
3763 ph10 208 if the first few characters (either before or after ^) are \Q\E or \E we
3764 ph10 205 skip them too. This makes for compatibility with Perl. */
3765 ph10 208
3766 ph10 205 negate_class = FALSE;
3767     for (;;)
3768 nigel 77 {
3769     c = *(++ptr);
3770 ph10 391 if (c == CHAR_BACKSLASH)
3771 ph10 205 {
3772 ph10 392 if (ptr[1] == CHAR_E)
3773 ph10 391 ptr++;
3774 ph10 836 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3775 ph10 391 ptr += 3;
3776 ph10 392 else
3777 ph10 391 break;
3778 ph10 205 }
3779 ph10 391 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3780 ph10 205 negate_class = TRUE;
3781     else break;
3782 ph10 208 }
3783 ph10 345
3784     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3785     an initial ']' is taken as a data character -- the code below handles
3786 ph10 341 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3787     [^] must match any character, so generate OP_ALLANY. */
3788 ph10 345
3789 ph10 392 if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3790 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3791 ph10 341 {
3792     *code++ = negate_class? OP_ALLANY : OP_FAIL;
3793 ph10 836 if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3794     zerofirstchar = firstchar;
3795 ph10 341 break;
3796 ph10 345 }
3797 nigel