/[pcre]/code/trunk/pcre_compile.c
ViewVC logotype

Contents of /code/trunk/pcre_compile.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1325 - (hide annotations) (download)
Fri May 10 14:03:18 2013 UTC (2 weeks, 1 day ago) by ph10
File MIME type: text/plain
File size: 277879 byte(s)
Fix spelling mistakes in comments.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 1309 Copyright (c) 1997-2013 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_compile(), along with
42     supporting internal functions that are not used by other modules. */
43    
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK cd /* Block containing newline information */
50     #define PSSTART start_pattern /* Field containing processed string start */
51     #define PSEND end_pattern /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55    
56 chpe 1055 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 ph10 836 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 ph10 842 library. We do not need to select pcre16_printint.c specially, because the
59 ph10 836 COMPILE_PCREx macro will already be appropriately set. */
60 nigel 85
61 ph10 475 #ifdef PCRE_DEBUG
62 ph10 836 /* pcre_printint.c should not include any headers */
63     #define PCRE_INCLUDED
64     #include "pcre_printint.c"
65     #undef PCRE_INCLUDED
66 nigel 85 #endif
67    
68    
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the bitmap a, which is addressed as an array with 8 bits per element. The
bitmap argument and the whole expansion are parenthesized so that the macro
acts as a single expression, whatever expression is passed as the bitmap and
wherever the macro is used. Note that b is evaluated twice, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72 ph10 178
73 ph10 202 /* Maximum length value to check against when making sure that the integer that
74     holds the compiled pattern length does not overflow. We make it a bit less than
75     INT_MAX to allow for adding in group terminating bytes, so that we don't have
76     to check them every time. */
77 ph10 178
78 ph10 202 #define OFLOW_MAX (INT_MAX - 20)
79    
80 ph10 1045 /* Definitions to allow mutual recursion */
81 ph10 202
82 ph10 1045 static int
83 ph10 1221 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84 ph10 1045 const pcre_uint32 *, unsigned int);
85    
86     static BOOL
87 chpe 1078 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88 ph10 1221 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89 chpe 1078 compile_data *, int *);
90 ph10 1045
91    
92    
93 nigel 77 /*************************************************
94     * Code parameters and static tables *
95     *************************************************/
96    
97 nigel 93 /* This value specifies the size of stack workspace that is used during the
98     first pre-compile phase that determines how much memory is required. The regex
99     is partly compiled into this space, but the compiled parts are discarded as
100     soon as they can be, so that hopefully there will never be an overrun. The code
101     does, however, check for an overrun. The largest amount I've seen used is 218,
102     so this number is very generous.
103 nigel 77
104 nigel 93 The same workspace is used during the second, actual compile phase for
105     remembering forward references to groups so that they can be filled in at the
106     end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 ph10 836 is 4 there is plenty of room for most patterns. However, the memory can get
108     filled up by repetitions of forward references, for example patterns like
109     /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110     that the workspace is expanded using malloc() in this situation. The value
111     below is therefore a minimum, and we put a maximum on it for safety. The
112     minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113     kicks in at the same number of forward references in all cases. */
114 nigel 77
115 ph10 836 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
116     #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117 nigel 77
118 ph10 507 /* The overrun tests check for a slightly smaller size so that they detect the
119 ph10 505 overrun before it actually does run off the end of the data block. */
120 nigel 93
121 ph10 836 #define WORK_SIZE_SAFETY_MARGIN (100)
122 ph10 505
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS (1 << 0) /* Indicates caselessness */
#define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags */
#define REQ_UNSET (-2)
#define REQ_NONE (-1)

/* Repeated character flags. The constant is a long; an upper case L suffix is
used because a lower case one is easily misread as the digit 1. */

#define UTF_LENGTH 0x10000000L /* The char contains its length. */
134    
135 nigel 77 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
136     are simple data values; negative values are for special things like \d and so
137     on. Zero means further processing is needed (for things like \x), or the escape
138     is invalid. */
139    
140 ph10 391 #ifndef EBCDIC
141    
142     /* This is the "normal" table for ASCII systems or for EBCDIC systems running
143 ph10 392 in UTF-8 mode. */
144 ph10 391
/* The trailing comment on each row names the two characters whose slots are on
that row, on the basis that the table starts at '0' and runs to 'z' as stated
in the comment that introduces the escape tables. (The lookup code itself is
not visible here - confirm the indexing against the code that reads this
table.) */
145 ph10 392 static const short int escapes[] = {
146 ph10 391 0, 0, /* 0 1 */
147     0, 0, /* 2 3 */
148 ph10 392 0, 0, /* 4 5 */
149     0, 0, /* 6 7 */
150     0, 0, /* 8 9 */
151 ph10 391 CHAR_COLON, CHAR_SEMICOLON, /* : ; */
152 ph10 392 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, /* < = */
153 ph10 391 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, /* > ? */
154 ph10 392 CHAR_COMMERCIAL_AT, -ESC_A, /* @ A */
155     -ESC_B, -ESC_C, /* B C */
156     -ESC_D, -ESC_E, /* D E */
157     0, -ESC_G, /* F G */
158     -ESC_H, 0, /* H I */
159     0, -ESC_K, /* J K */
160 ph10 391 0, 0, /* L M */
161 ph10 514 -ESC_N, 0, /* N O */
162 ph10 391 -ESC_P, -ESC_Q, /* P Q */
163     -ESC_R, -ESC_S, /* R S */
164 ph10 392 0, 0, /* T U */
165     -ESC_V, -ESC_W, /* V W */
166     -ESC_X, 0, /* X Y */
167     -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, /* Z [ */
168 ph10 391 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, /* backslash ] */
169 ph10 392 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, /* ^ _ */
170 ph10 391 CHAR_GRAVE_ACCENT, 7, /* ` a */
171 ph10 392 -ESC_b, 0, /* b c */
172     -ESC_d, ESC_e, /* d e */
173 ph10 391 ESC_f, 0, /* f g */
174     -ESC_h, 0, /* h i */
175 ph10 392 0, -ESC_k, /* j k */
176 ph10 391 0, 0, /* l m */
177     ESC_n, 0, /* n o */
178 ph10 392 -ESC_p, 0, /* p q */
179     ESC_r, -ESC_s, /* r s */
180 ph10 391 ESC_tee, 0, /* t u */
181 ph10 392 -ESC_v, -ESC_w, /* v w */
182     0, 0, /* x y */
183 ph10 391 -ESC_z /* z */
184 nigel 77 };
185    
186 ph10 392 #else
187 ph10 391
188     /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
189    
190 nigel 77 static const short int escapes[] = {
191     /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
192     /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
193     /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
194     /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
195     /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
196     /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
197     /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
198     /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
199 ph10 178 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
200 nigel 93 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
201 nigel 77 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
202 ph10 178 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
203 nigel 77 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
204     /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
205     /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
206     /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
207 ph10 178 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
208 ph10 514 /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
209 nigel 93 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
210 ph10 178 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
211 nigel 77 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
212     /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
213     /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
214     };
215     #endif
216    
217    
218 ph10 243 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
219     searched linearly. Put all the names into a single string, in order to reduce
220 ph10 392 the number of relocations when a shared library is dynamically linked. The
221     string is built from string macros so that it works in UTF-8 mode on EBCDIC
222 ph10 391 platforms. */
223 ph10 210
/* One entry of the verb table: the length of the verb's name and the opcodes
to use without and with an argument. */
224     typedef struct verbitem {
225 ph10 510 int len; /* Length of verb name */
226     int op; /* Op when no arg, or -1 if arg mandatory */
227     int op_arg; /* Op when arg present, or -1 if not allowed */
228 ph10 211 } verbitem;
229 ph10 210
/* The verb names, concatenated in the same order as the entries in verbs[]
below. The STRING_xxx0 macros are defined elsewhere; presumably each is the
name followed by a null character - confirm against their definitions. */
230 ph10 240 static const char verbnames[] =
231 ph10 510 "\0" /* Empty name is a shorthand for MARK */
232 ph10 512 STRING_MARK0
233 ph10 391 STRING_ACCEPT0
234     STRING_COMMIT0
235     STRING_F0
236     STRING_FAIL0
237     STRING_PRUNE0
238     STRING_SKIP0
239     STRING_THEN;
240 ph10 240
/* One entry per name in verbnames, in the same order; the first field of each
entry is the length of the corresponding name. */
241 ph10 327 static const verbitem verbs[] = {
242 ph10 510 { 0, -1, OP_MARK }, /* (empty name): argument mandatory */
243 ph10 512 { 4, -1, OP_MARK }, /* MARK: argument mandatory */
244 ph10 510 { 6, OP_ACCEPT, -1 }, /* ACCEPT: no argument allowed */
245     { 6, OP_COMMIT, -1 }, /* COMMIT: no argument allowed */
246     { 1, OP_FAIL, -1 }, /* F: no argument allowed */
247     { 4, OP_FAIL, -1 }, /* FAIL: no argument allowed */
248     { 5, OP_PRUNE, OP_PRUNE_ARG }, /* PRUNE: argument optional */
249     { 4, OP_SKIP, OP_SKIP_ARG }, /* SKIP: argument optional */
250     { 4, OP_THEN, OP_THEN_ARG } /* THEN: argument optional */
251 ph10 210 };
252    
/* Number of entries in verbs[]. */
253 ph10 327 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
254 ph10 210
255    
256 ph10 243 /* Tables of names of POSIX character classes and their lengths. The names are
257     now all in a single string, to reduce the number of relocations when a shared
258 ph10 240 library is dynamically loaded. The list of lengths is terminated by a zero
259     length entry. The first three must be alpha, lower, upper, as this is assumed
260     for handling case independence. */
261 nigel 77
262 ph10 240 static const char posix_names[] =
263 ph10 392 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
264     STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
265 ph10 391 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
266     STRING_word0 STRING_xdigit;
267 nigel 77
268 ph10 836 static const pcre_uint8 posix_name_lengths[] = {
269 nigel 77 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
270    
271 nigel 87 /* Table of class bit maps for each POSIX class. Each class is formed from a
272     base map, with an optional addition or removal of another map. Then, for some
273     classes, there is some additional tweaking: for [:blank:] the vertical space
274     characters are removed, and for [:alpha:] and [:alnum:] the underscore
275     character is removed. The triples in the table consist of the base map offset,
276     second map offset or -1 if no second map, and a non-negative value for map
277     addition or a negative value for map subtraction (if there are two maps). The
278     absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
279     remove vertical space characters, 2 => remove underscore. */
280 nigel 77
281     static const int posix_class_maps[] = {
282 nigel 87 cbit_word, cbit_digit, -2, /* alpha */
283     cbit_lower, -1, 0, /* lower */
284     cbit_upper, -1, 0, /* upper */
285     cbit_word, -1, 2, /* alnum - word without underscore */
286     cbit_print, cbit_cntrl, 0, /* ascii */
287     cbit_space, -1, 1, /* blank - a GNU extension */
288     cbit_cntrl, -1, 0, /* cntrl */
289     cbit_digit, -1, 0, /* digit */
290     cbit_graph, -1, 0, /* graph */
291     cbit_print, -1, 0, /* print */
292     cbit_punct, -1, 0, /* punct */
293     cbit_space, -1, 0, /* space */
294     cbit_word, -1, 0, /* word - a Perl extension */
295     cbit_xdigit,-1, 0 /* xdigit */
296 nigel 77 };
297    
298 ph10 535 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
299     substitutes must be in the order of the names, defined above, and there are
300 ph10 518 both positive and negative cases. NULL means no substitute. */
301 nigel 77
302 ph10 518 #ifdef SUPPORT_UCP
303 ph10 836 static const pcre_uchar string_PNd[] = {
304     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
305     CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306     static const pcre_uchar string_pNd[] = {
307     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308     CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309     static const pcre_uchar string_PXsp[] = {
310     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
311     CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
312     static const pcre_uchar string_pXsp[] = {
313     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
314     CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
315     static const pcre_uchar string_PXwd[] = {
316     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
317     CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
318     static const pcre_uchar string_pXwd[] = {
319     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
320     CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
321    
322     static const pcre_uchar *substitutes[] = {
323     string_PNd, /* \D */
324     string_pNd, /* \d */
325     string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
326     string_pXsp, /* \s */
327     string_PXwd, /* \W */
328     string_pXwd /* \w */
329 ph10 518 };
330 ph10 535
331 ph10 836 static const pcre_uchar string_pL[] = {
332     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
333     CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
334     static const pcre_uchar string_pLl[] = {
335     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
336     CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
337     static const pcre_uchar string_pLu[] = {
338     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339     CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
340     static const pcre_uchar string_pXan[] = {
341     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
342     CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
343     static const pcre_uchar string_h[] = {
344     CHAR_BACKSLASH, CHAR_h, '\0' };
345     static const pcre_uchar string_pXps[] = {
346     CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
347     CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
348     static const pcre_uchar string_PL[] = {
349     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
350     CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
351     static const pcre_uchar string_PLl[] = {
352     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
353     CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
354     static const pcre_uchar string_PLu[] = {
355     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
356     CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
357     static const pcre_uchar string_PXan[] = {
358     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
359     CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
360     static const pcre_uchar string_H[] = {
361     CHAR_BACKSLASH, CHAR_H, '\0' };
362     static const pcre_uchar string_PXps[] = {
363     CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
364     CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
365    
366     static const pcre_uchar *posix_substitutes[] = {
367     string_pL, /* alpha */
368     string_pLl, /* lower */
369     string_pLu, /* upper */
370     string_pXan, /* alnum */
371     NULL, /* ascii */
372     string_h, /* blank */
373     NULL, /* cntrl */
374     string_pNd, /* digit */
375     NULL, /* graph */
376     NULL, /* print */
377     NULL, /* punct */
378     string_pXps, /* space */ /* NOTE: Xps is POSIX space */
379     string_pXwd, /* word */
380     NULL, /* xdigit */
381 ph10 518 /* Negated cases */
382 ph10 836 string_PL, /* ^alpha */
383     string_PLl, /* ^lower */
384     string_PLu, /* ^upper */
385     string_PXan, /* ^alnum */
386     NULL, /* ^ascii */
387     string_H, /* ^blank */
388     NULL, /* ^cntrl */
389     string_PNd, /* ^digit */
390     NULL, /* ^graph */
391     NULL, /* ^print */
392     NULL, /* ^punct */
393     string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
394     string_PXwd, /* ^word */
395     NULL /* ^xdigit */
396 ph10 518 };
397 ph10 836 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
398 ph10 535 #endif
399 ph10 518
400 nigel 93 #define STRING(a) # a
401     #define XSTRING(s) STRING(s)
402    
403 nigel 77 /* The texts of compile-time error messages. These are "char *" because they
404 nigel 93 are passed to the outside world. Do not ever re-use any error number, because
405     they are documented. Always add a new error instead. Messages marked DEAD below
406 ph10 243 are no longer used. This used to be a table of strings, but in order to reduce
407     the number of relocations needed when a shared library is loaded dynamically,
408     it is now one long string. We cannot use a table of offsets, because the
409     lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
410     simply count through to the one we want - this isn't a performance issue
411 ph10 507 because these strings are used only when there is a compilation error.
412 nigel 77
413 ph10 507 Each substring ends with \0 to insert a null character. This includes the final
414     substring, so that the whole string ends with \0\0, which can be detected when
415 ph10 499 counting through. */
416    
417 ph10 240 static const char error_texts[] =
418     "no error\0"
419     "\\ at end of pattern\0"
420     "\\c at end of pattern\0"
421     "unrecognized character follows \\\0"
422     "numbers out of order in {} quantifier\0"
423 nigel 77 /* 5 */
424 ph10 240 "number too big in {} quantifier\0"
425     "missing terminating ] for character class\0"
426     "invalid escape sequence in character class\0"
427     "range out of order in character class\0"
428     "nothing to repeat\0"
429 nigel 77 /* 10 */
430 ph10 240 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
431     "internal error: unexpected repeat\0"
432 ph10 269 "unrecognized character after (? or (?-\0"
433 ph10 240 "POSIX named classes are supported only within a class\0"
434     "missing )\0"
435 nigel 77 /* 15 */
436 ph10 240 "reference to non-existent subpattern\0"
437     "erroffset passed as NULL\0"
438     "unknown option bit(s) set\0"
439     "missing ) after comment\0"
440     "parentheses nested too deeply\0" /** DEAD **/
441 nigel 77 /* 20 */
442 ph10 240 "regular expression is too large\0"
443     "failed to get memory\0"
444     "unmatched parentheses\0"
445     "internal error: code overflow\0"
446     "unrecognized character after (?<\0"
447 nigel 77 /* 25 */
448 ph10 240 "lookbehind assertion is not fixed length\0"
449     "malformed number or name after (?(\0"
450     "conditional group contains more than two branches\0"
451     "assertion expected after (?(\0"
452     "(?R or (?[+-]digits must be followed by )\0"
453 nigel 77 /* 30 */
454 ph10 240 "unknown POSIX class name\0"
455     "POSIX collating elements are not supported\0"
456 ph10 848 "this version of PCRE is compiled without UTF support\0"
457 ph10 240 "spare error\0" /** DEAD **/
458     "character value in \\x{...} sequence is too large\0"
459 nigel 77 /* 35 */
460 ph10 240 "invalid condition (?(0)\0"
461     "\\C not allowed in lookbehind assertion\0"
462 ph10 514 "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
463 ph10 240 "number after (?C is > 255\0"
464     "closing ) for (?C expected\0"
465 nigel 77 /* 40 */
466 ph10 240 "recursive call could loop indefinitely\0"
467     "unrecognized character after (?P\0"
468     "syntax error in subpattern name (missing terminator)\0"
469     "two named subpatterns have the same name\0"
470     "invalid UTF-8 string\0"
471 nigel 77 /* 45 */
472 ph10 240 "support for \\P, \\p, and \\X has not been compiled\0"
473     "malformed \\P or \\p sequence\0"
474     "unknown property name after \\P or \\p\0"
475     "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
476     "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
477 nigel 91 /* 50 */
478 ph10 240 "repeated subpattern is too long\0" /** DEAD **/
479 ph10 848 "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
480 ph10 240 "internal error: overran compiling workspace\0"
481     "internal error: previously-checked referenced subpattern not found\0"
482     "DEFINE group contains more than one branch\0"
483 nigel 93 /* 55 */
484 ph10 637 "repeating a DEFINE group is not allowed\0" /** DEAD **/
485 ph10 240 "inconsistent NEWLINE options\0"
486 ph10 333 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
487     "a numbered reference must not be zero\0"
488 ph10 510 "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
489 ph10 211 /* 60 */
490 ph10 1313 "(*VERB) not recognized or malformed\0"
491 ph10 268 "number is too big\0"
492 ph10 272 "subpattern name expected\0"
493 ph10 336 "digit expected after (?+\0"
494 ph10 457 "] is an invalid data character in JavaScript compatibility mode\0"
495     /* 65 */
496 ph10 510 "different names for subpatterns of the same number are not allowed\0"
497 ph10 512 "(*MARK) must have an argument\0"
498 ph10 848 "this version of PCRE is not compiled with Unicode property support\0"
499 ph10 579 "\\c must be followed by an ASCII character\0"
500 ph10 654 "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
501 ph10 747 /* 70 */
502     "internal error: unknown opcode in find_fixedlength()\0"
503 ph10 836 "\\N is not supported in a class\0"
504     "too many forward references\0"
505 ph10 848 "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
506 ph10 903 "invalid UTF-16 string\0"
507 ph10 964 /* 75 */
508 ph10 975 "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
509 ph10 978 "character value in \\u.... sequence is too large\0"
510 chpe 1055 "invalid UTF-32 string\0"
511 ph10 1309 "setting UTF is disabled by the application\0"
512 ph10 510 ;
513 nigel 77
514     /* Table to identify digits and hex digits. This is used when compiling
515     patterns. Note that the tables in chartables are dependent on the locale, and
516     may mark arbitrary characters as digits - but the PCRE compiling code expects
517     to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
518     a private table here. It costs 256 bytes, but it is a lot faster than doing
519     character value tests (at least in some simple cases I timed), and in some
520     applications one wants PCRE to compile efficiently as well as match
521     efficiently.
522    
523     For convenience, we use the same bit definitions as in chartables:
524    
525     0x04 decimal digit
526     0x08 hexadecimal digit
527    
528     Then we can use ctype_digit and ctype_xdigit in the code. */
529    
530 ph10 836 /* Using a simple comparison for decimal numbers rather than a memory read
531     is much faster, and the resulting code is simpler (the compiler turns it
532     into a subtraction and unsigned comparison). */
533    
534     #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
535    
536 ph10 392 #ifndef EBCDIC
537 ph10 391
538 ph10 392 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
539 ph10 391 UTF-8 mode. */
540    
541 ph10 836 static const pcre_uint8 digitab[] =
542 nigel 77 {
543     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
544     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
545     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
546     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
547     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
548     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
549     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
550     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
551     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
552     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
553     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
554     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
555     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
556     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
557     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
558     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
559     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
560     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
561     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
562     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
563     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
564     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
565     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
566     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
567     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
568     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
569     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
570     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
571     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
572     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
573     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
574     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
575    
576 ph10 392 #else
577 ph10 391
578     /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
579    
580 ph10 836 static const pcre_uint8 digitab[] =
581 nigel 77 {
582     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
583     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
584     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
585     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
586     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
587     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
588     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
589     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
590     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
591     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
592     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
593 ph10 97 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
594 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
595     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
596     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
597     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
598     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
599     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
600     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
601     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
602     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
603     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
604     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
605     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
606     0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
607     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
608     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
609     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
610     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
611     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
612     0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
613     0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
614    
615 ph10 836 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
616 nigel 77 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
617     0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
618     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
619     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
620     0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
621     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
622     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
623     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
624     0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
625     0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
626     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
627 ph10 97 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
628 nigel 77 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
629     0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
630     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
631     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
632     0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
633     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
634     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
635     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
636     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
637     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
638     0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
639     0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
640     0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
641     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
642     0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
643     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
644     0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
645     0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
646     0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
647     0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
648     #endif
649    
650    
651    
652     /*************************************************
653 ph10 240 * Find an error text *
654     *************************************************/
655    
656 ph10 243 /* The error texts are now all in one long string, to save on relocations. As
657     some of the text is of unknown length, we can't use a table of offsets.
658     Instead, just count through the strings. This is not a performance issue
659 ph10 240 because it happens only when there has been a compilation error.
660    
661     Argument: the error number
662     Returns: pointer to the error string
663     */
664    
665     static const char *
666     find_error_text(int n)
667     {
668     const char *s = error_texts;
669 ph10 507 for (; n > 0; n--)
670 ph10 499 {
671 ph10 1186 while (*s++ != CHAR_NULL) {};
672     if (*s == CHAR_NULL) return "Error text not found (please report)";
673 ph10 507 }
674 ph10 240 return s;
675     }
676    
677    
678     /*************************************************
679 ph10 836 * Expand the workspace *
680     *************************************************/
681    
682     /* This function is called during the second compiling phase, if the number of
683     forward references fills the existing workspace, which is originally a block on
684     the stack. A larger block is obtained from malloc() unless the ultimate limit
685     has been reached or the increase will be rather small.
686    
687     Argument: pointer to the compile data block
688     Returns: 0 if all went well, else an error number
689     */
690    
691     static int
692     expand_workspace(compile_data *cd)
693     {
694     pcre_uchar *newspace;
695     int newsize = cd->workspace_size * 2;
696    
697     if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
698     if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
699     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
700     return ERR72;
701    
702     newspace = (PUBL(malloc))(IN_UCHARS(newsize));
703     if (newspace == NULL) return ERR21;
704     memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
705     cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
706     if (cd->workspace_size > COMPILE_WORK_SIZE)
707     (PUBL(free))((void *)cd->start_workspace);
708     cd->start_workspace = newspace;
709     cd->workspace_size = newsize;
710     return 0;
711     }
712    
713    
714    
715     /*************************************************
716 ph10 640 * Check for counted repeat *
717     *************************************************/
718    
719     /* This function is called when a '{' is encountered in a place where it might
720     start a quantifier. It looks ahead to see if it really is a quantifier or not.
721     It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
722     where the ddds are digits.
723    
724     Arguments:
725     p pointer to the first char after '{'
726    
727     Returns: TRUE or FALSE
728     */
729    
730     static BOOL
731 ph10 836 is_counted_repeat(const pcre_uchar *p)
732 ph10 640 {
733 ph10 836 if (!IS_DIGIT(*p)) return FALSE;
734     p++;
735     while (IS_DIGIT(*p)) p++;
736 ph10 640 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
737    
738     if (*p++ != CHAR_COMMA) return FALSE;
739     if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
740    
741 ph10 836 if (!IS_DIGIT(*p)) return FALSE;
742     p++;
743     while (IS_DIGIT(*p)) p++;
744 ph10 640
745     return (*p == CHAR_RIGHT_CURLY_BRACKET);
746     }
747    
748    
749    
750     /*************************************************
751 nigel 77 * Handle escapes *
752     *************************************************/
753    
754     /* This function is called when a \ has been encountered. It either returns a
755 chpe 1059 positive value for a simple escape such as \n, or 0 for a data character
756 chpe 1061 which will be placed in chptr. A backreference to group n is returned as
757     negative n. When UTF-8 is enabled, a positive value greater than 255 may
758     be returned in chptr.
759 chpe 1059 On entry, ptr is pointing at the \. On exit, it is on the final character of the
760     escape sequence.
761 nigel 77
762     Arguments:
763     ptrptr points to the pattern position pointer
764 chpe 1059 chptr points to the data character
765 nigel 77 errorcodeptr points to the errorcode variable
766     bracount number of previous extracting brackets
767     options the options bits
768     isclass TRUE if inside a character class
769    
770 chpe 1059 Returns: zero => a data character
771     positive => a special escape sequence
772 chpe 1061 negative => a back reference
773 ph10 213 on error, errorcodeptr is set
774 nigel 77 */
775    
776     static int
777 ph10 1221 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
778 chpe 1059 int bracount, int options, BOOL isclass)
779 nigel 77 { /* Decode the escape sequence that starts at the backslash at *ptrptr */
780 ph10 836 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
781     BOOL utf = (options & PCRE_UTF8) != 0;
782     const pcre_uchar *ptr = *ptrptr + 1; /* First character after the backslash */
783 chpe 1060 pcre_uint32 c;
784 chpe 1059 int escape = 0; /* Value to return; 0 => data character in c */
785 ph10 836 int i; /* Escape-table lookup result; later reused as a digit counter */
786 nigel 77
787 nigel 87 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
788     ptr--; /* Set pointer back to the last byte */
789    
790 nigel 77 /* If backslash is at the end of the pattern, it's an error. */
791    
792 ph10 1186 if (c == CHAR_NULL) *errorcodeptr = ERR1;
793 nigel 77
794 ph10 274 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
795     in a table. A non-zero result is something that can be returned immediately.
796 nigel 77 Otherwise further processing may be required. */
797    
798 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
799 ph10 836 /* Not alphanumeric */
800     else if (c < CHAR_0 || c > CHAR_z) {}
801 ph10 1313 else if ((i = escapes[c - CHAR_0]) != 0)
802 ph10 1253 { if (i > 0) c = (pcre_uint32)i; else escape = -i; } /* +ve: data character value; -ve: negated escape code */
803 nigel 77
804 ph10 97 #else /* EBCDIC coding */
805 ph10 836 /* Not alphanumeric */
806 ph10 1033 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
807 chpe 1062 else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
808 nigel 77 #endif
809    
810     /* Escapes that need further processing, or are illegal. */
811    
812     else /* The table entry was zero, so i is 0 on entry to this branch */
813     {
814 ph10 836 const pcre_uchar *oldptr;
815 chpe 1062 BOOL braced, negated, overflow;
816     int s;
817 nigel 93
818 nigel 77 switch (c)
819     {
820     /* A number of Perl escapes are not handled by PCRE. We give an explicit
821     error. */
822    
823 ph10 391 case CHAR_l:
824     case CHAR_L:
825 zherczeg 744 *errorcodeptr = ERR37;
826     break;
827    
828 ph10 391 case CHAR_u:
829 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
830     {
831     /* In JavaScript, \u must be followed by four hexadecimal numbers.
832     Otherwise it is a lowercase u letter. */
833 ph10 836 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
834     && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
835     && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
836     && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
837 zherczeg 744 {
838     c = 0;
839     for (i = 0; i < 4; ++i)
840     {
841 chpe 1060 register pcre_uint32 cc = *(++ptr);
842 zherczeg 744 #ifndef EBCDIC /* ASCII/UTF-8 coding */
843     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
844     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
845     #else /* EBCDIC coding */
846     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
847     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
848     #endif
849     }
850 ph10 982
851 chpe 1055 #if defined COMPILE_PCRE8
852 ph10 1233 if (c > (utf ? 0x10ffffU : 0xffU))
853 chpe 1055 #elif defined COMPILE_PCRE16
854 ph10 1233 if (c > (utf ? 0x10ffffU : 0xffffU))
855 chpe 1055 #elif defined COMPILE_PCRE32
856 ph10 1233 if (utf && c > 0x10ffffU)
857 ph10 978 #endif
858     {
859 ph10 982 *errorcodeptr = ERR76;
860 ph10 978 }
861     else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; /* Surrogate range is rejected in UTF mode */
862 zherczeg 744 }
863     }
864     else
865     *errorcodeptr = ERR37;
866     break;
867    
868 ph10 391 case CHAR_U:
869 zherczeg 744 /* In JavaScript, \U is an uppercase U letter. */
870     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
871 nigel 77 break;
872    
873 ph10 654 /* In a character class, \g is just a literal "g". Outside a character
874 ph10 640 class, \g must be followed by one of a number of specific things:
875 ph10 345
876 ph10 333 (1) A number, either plain or braced. If positive, it is an absolute
877     backreference. If negative, it is a relative backreference. This is a Perl
878     5.10 feature.
879 ph10 345
880 ph10 333 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
881     is part of Perl's movement towards a unified syntax for back references. As
882     this is synonymous with \k{name}, we fudge it up by pretending it really
883     was \k.
884 ph10 345
885     (3) For Oniguruma compatibility we also support \g followed by a name or a
886     number either in angle brackets or in single quotes. However, these are
887     (possibly recursive) subroutine calls, _not_ backreferences. Just return
888 chpe 1059 the ESC_g code (cf \k). */
889 nigel 93
890 ph10 391 case CHAR_g:
891 ph10 640 if (isclass) break; /* Literal "g" inside a class */
892 ph10 391 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
893 ph10 333 {
894 chpe 1059 escape = ESC_g;
895 ph10 345 break;
896     }
897 ph10 333
898     /* Handle the Perl-compatible cases */
899 ph10 345
900 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
901 nigel 93 {
902 ph10 836 const pcre_uchar *p;
903 ph10 1186 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
904 ph10 836 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
905 ph10 1186 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) /* Braces hold something other than digits and minus: a name */
906 ph10 171 {
907 chpe 1059 escape = ESC_k;
908 ph10 171 break;
909 ph10 172 }
910 nigel 93 braced = TRUE;
911     ptr++;
912     }
913     else braced = FALSE;
914    
915 ph10 391 if (ptr[1] == CHAR_MINUS)
916 nigel 93 {
917     negated = TRUE;
918     ptr++;
919     }
920     else negated = FALSE;
921    
922 ph10 836 /* The integer range is limited by the machine's int representation. */
923 chpe 1062 s = 0;
924     overflow = FALSE;
925 ph10 836 while (IS_DIGIT(ptr[1]))
926     {
927 chpe 1062 if (s > INT_MAX / 10 - 1) /* Integer overflow */
928 ph10 836 {
929 chpe 1062 overflow = TRUE;
930 ph10 836 break;
931     }
932 chpe 1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
933 ph10 836 }
934 chpe 1062 if (overflow) /* Integer overflow */
935 ph10 213 {
936 ph10 836 while (IS_DIGIT(ptr[1])) /* Skip the rest of the number */
937     ptr++;
938 ph10 213 *errorcodeptr = ERR61;
939     break;
940 ph10 220 }
941 ph10 345
942 ph10 391 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
943 nigel 93 {
944     *errorcodeptr = ERR57;
945 ph10 213 break;
946 nigel 93 }
947 ph10 345
948 chpe 1062 if (s == 0) /* Zero, or no digits at all, is not a valid reference */
949 ph10 333 {
950     *errorcodeptr = ERR58;
951     break;
952 ph10 345 }
953 nigel 93
954     if (negated)
955     {
956 chpe 1062 if (s > bracount) /* Relative reference reaches back before the first group */
957 nigel 93 {
958     *errorcodeptr = ERR15;
959 ph10 213 break;
960 nigel 93 }
961 chpe 1062 s = bracount - (s - 1); /* Convert relative to absolute group number */
962 nigel 93 }
963    
964 chpe 1062 escape = -s; /* Back references are returned as negative numbers */
965 nigel 93 break;
966    
967 nigel 77 /* The handling of escape sequences consisting of a string of digits
968     starting with one that is not zero is not straightforward. By experiment,
969     the way Perl works seems to be as follows:
970    
971     Outside a character class, the digits are read as a decimal number. If the
972     number is less than 10, or if there are that many previous extracting
973     left brackets, then it is a back reference. Otherwise, up to three octal
974     digits are read to form an escaped byte. Thus \123 is likely to be octal
975     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
976     value is greater than 377, the least significant 8 bits are taken. Inside a
977     character class, \ followed by a digit is always an octal number. */
978    
979 ph10 391 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
980     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
981 nigel 77
982     if (!isclass)
983     {
984     oldptr = ptr;
985 ph10 836 /* The integer range is limited by the machine's int representation. */
986 chpe 1062 s = (int)(c -CHAR_0);
987     overflow = FALSE;
988 ph10 836 while (IS_DIGIT(ptr[1]))
989     {
990 chpe 1062 if (s > INT_MAX / 10 - 1) /* Integer overflow */
991 ph10 836 {
992 chpe 1062 overflow = TRUE;
993 ph10 836 break;
994     }
995 chpe 1062 s = s * 10 + (int)(*(++ptr) - CHAR_0);
996 ph10 836 }
997 chpe 1062 if (overflow) /* Integer overflow */
998 ph10 213 {
999 ph10 836 while (IS_DIGIT(ptr[1])) /* Skip the rest of the number */
1000     ptr++;
1001 ph10 213 *errorcodeptr = ERR61;
1002 ph10 220 break;
1003     }
1004 chpe 1062 if (s < 10 || s <= bracount) /* Treat as a back reference */
1005 nigel 77 {
1006 chpe 1062 escape = -s;
1007 nigel 77 break;
1008     }
1009     ptr = oldptr; /* Put the pointer back and fall through */
1010     }
1011    
1012     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1013     generates a binary zero byte and treats the digit as a following literal.
1014     Thus we have to pull back the pointer by one. */
1015    
1016 ph10 391 if ((c = *ptr) >= CHAR_8)
1017 nigel 77 {
1018     ptr--;
1019     c = 0;
1020     break;
1021     }
1022    
1023     /* \0 always starts an octal number, but we may drop through to here with a
1024 nigel 91 larger first octal digit. The original code used just to take the least
1025     significant 8 bits of octal numbers (I think this is what early Perls used
1026 ph10 849 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1027     but no more than 3 octal digits. */
1028 nigel 77
1029 ph10 391 case CHAR_0:
1030     c -= CHAR_0;
1031     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) /* i is 0 on arrival, so at most two more digits */
1032     c = c * 8 + *(++ptr) - CHAR_0;
1033 ph10 903 #ifdef COMPILE_PCRE8
1034 ph10 836 if (!utf && c > 0xff) *errorcodeptr = ERR51;
1035 ph10 903 #endif
1036 nigel 77 break;
1037    
1038 nigel 87 /* \x is complicated. \x{ddd} is a character number which can be greater
1039 ph10 836 than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1040     If not, { is treated as a data character. */
1041 nigel 77
1042 ph10 391 case CHAR_x:
1043 zherczeg 744 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1044     {
1045     /* In JavaScript, \x must be followed by two hexadecimal numbers.
1046     Otherwise it is a lowercase x letter. */
1047 ph10 836 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1048     && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1049 zherczeg 744 {
1050     c = 0;
1051     for (i = 0; i < 2; ++i)
1052     {
1053 chpe 1060 register pcre_uint32 cc = *(++ptr);
1054 zherczeg 744 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1055     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1056     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1057     #else /* EBCDIC coding */
1058     if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1059     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1060     #endif
1061     }
1062     }
1063     break; /* If the test above failed, c is unchanged: a literal x */
1064     }
1065    
1066 ph10 391 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1067 nigel 77 {
1068 ph10 836 const pcre_uchar *pt = ptr + 2;
1069 nigel 87
1070 nigel 77 c = 0;
1071 chpe 1060 overflow = FALSE;
1072 ph10 836 while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1073 nigel 77 {
1074 chpe 1060 register pcre_uint32 cc = *pt++;
1075 ph10 391 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
1076 nigel 87
1077 chpe 1060 #ifdef COMPILE_PCRE32
1078     if (c >= 0x10000000l) { overflow = TRUE; break; }
1079     #endif
1080    
1081 ph10 391 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1082     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1083     c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1084 ph10 97 #else /* EBCDIC coding */
1085 ph10 391 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
1086     c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1087 nigel 77 #endif
1088 ph10 836
1089 chpe 1055 #if defined COMPILE_PCRE8
1090 ph10 1233 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1091 chpe 1055 #elif defined COMPILE_PCRE16
1092 ph10 1233 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1093 chpe 1055 #elif defined COMPILE_PCRE32
1094 ph10 1233 if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1095 ph10 836 #endif
1096 nigel 77 }
1097 nigel 87
1098 chpe 1060 if (overflow)
1099 ph10 836 {
1100     while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++; /* Skip the remaining hex digits */
1101     *errorcodeptr = ERR34;
1102     }
1103    
1104 ph10 391 if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1105 nigel 77 {
1106 ph10 836 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1107 nigel 77 ptr = pt; /* Leave ptr on the closing '}' */
1108     break;
1109     }
1110 nigel 87
1111 nigel 77 /* If the sequence of hex digits does not end with '}', then we don't
1112     recognize this construct; fall through to the normal \x handling. */
1113     }
1114    
1115 nigel 87 /* Read just a single-byte hex-defined char */
1116 nigel 77
1117     c = 0; /* i is still 0 here, so the loop below takes at most two hex digits */
1118 ph10 836 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1119 nigel 77 {
1120 chpe 1060 pcre_uint32 cc; /* Some compilers don't like */
1121 ph10 391 cc = *(++ptr); /* ++ in initializers */
1122     #ifndef EBCDIC /* ASCII/UTF-8 coding */
1123     if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
1124     c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1125 ph10 97 #else /* EBCDIC coding */
1126 ph10 391 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
1127     c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1128 nigel 77 #endif
1129     }
1130     break;
1131    
1132 nigel 93 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1133 ph10 574 An error is given if the byte following \c is not an ASCII character. This
1134     coding is ASCII-specific, but then the whole concept of \cx is
1135 nigel 93 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1136 nigel 77
1137 ph10 391 case CHAR_c:
1138 nigel 77 c = *(++ptr);
1139 ph10 1186 if (c == CHAR_NULL) /* \c at the very end of the pattern */
1140 nigel 77 {
1141     *errorcodeptr = ERR2;
1142 ph10 213 break;
1143 nigel 77 }
1144 ph10 574 #ifndef EBCDIC /* ASCII/UTF-8 coding */
1145     if (c > 127) /* Excludes all non-ASCII in either mode */
1146     {
1147     *errorcodeptr = ERR68;
1148 ph10 579 break;
1149     }
1150 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1151 nigel 77 c ^= 0x40;
1152 ph10 574 #else /* EBCDIC coding */
1153 ph10 391 if (c >= CHAR_a && c <= CHAR_z) c += 64;
1154 nigel 77 c ^= 0xC0;
1155     #endif
1156     break;
1157    
1158     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1159 ph10 274 other alphanumeric following \ is an error if PCRE_EXTRA was set;
1160     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1161     odd, but there used to be some cases other than the default, and there may
1162     be again in future, so I haven't "optimized" it. */
1163 nigel 77
1164     default:
1165     if ((options & PCRE_EXTRA) != 0) switch(c)
1166     {
1167     default:
1168     *errorcodeptr = ERR3;
1169     break;
1170     }
1171     break;
1172     }
1173     }
1174 ph10 518
1175     /* Perl supports \N{name} for character names, as well as plain \N for "not
1176 ph10 654 newline". PCRE does not support \N{name}. However, it does support
1177 ph10 640 quantification such as \N{2,3}. */
1178 nigel 77
1179 chpe 1059 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1180 ph10 640 !is_counted_repeat(ptr+2))
1181 ph10 518 *errorcodeptr = ERR37;
1182 ph10 514
1183 ph10 518 /* If PCRE_UCP is set, we change the values for \d etc. */
1184    
1185 chpe 1059 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1186     escape += (ESC_DU - ESC_D);
1187 ph10 518
1188     /* Set the pointer to the final character before returning. */
1189    
1190 nigel 77 *ptrptr = ptr;
1191 chpe 1059 *chptr = c; /* Written on every path, including the error paths */
1192     return escape;
1193 nigel 77 }
1194    
1195     #ifdef SUPPORT_UCP
1196     /*************************************************
1197     * Handle \P and \p *
1198     *************************************************/
1199    
1200     /* This function is called after \P or \p has been encountered, provided that
1201     PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1202     pointing at the P or p. On exit, it is pointing at the final character of the
1203     escape sequence.
1204    
1205     Argument:
1206     ptrptr points to the pattern position pointer
1207     negptr points to a boolean that is set TRUE for negation else FALSE
1208 chpe 1129 ptypeptr points to an unsigned int that is set to the type value
1209     pdataptr points to an unsigned int that is set to the detailed property value
1210 nigel 77 errorcodeptr points to the error code variable
1211    
1212 chpe 1129 Returns: TRUE if the type value was found, or FALSE for an invalid type
1213 nigel 77 */
1214    
1215 chpe 1129 static BOOL
1216 ph10 1221 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1217 chpe 1129 unsigned int *pdataptr, int *errorcodeptr)
1218 nigel 77 {
1219 chpe 1065 pcre_uchar c;
1220     int i, bot, top;
1221 ph10 836 const pcre_uchar *ptr = *ptrptr;
1222     pcre_uchar name[32];
1223 nigel 77
1224     c = *(++ptr);
1225 ph10 1186 if (c == CHAR_NULL) goto ERROR_RETURN;
1226 nigel 77
1227     *negptr = FALSE;
1228    
1229 nigel 87 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1230     negation. */
1231 nigel 77
1232 ph10 391 if (c == CHAR_LEFT_CURLY_BRACKET)
1233 nigel 77 {
1234 ph10 391 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1235 nigel 77 {
1236     *negptr = TRUE;
1237     ptr++;
1238     }
1239 ph10 836 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1240 nigel 77 {
1241     c = *(++ptr);
1242 ph10 1186 if (c == CHAR_NULL) goto ERROR_RETURN;
1243 ph10 391 if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1244 nigel 77 name[i] = c;
1245     }
1246 ph10 391 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1247 nigel 77 name[i] = 0;
1248     }
1249    
1250     /* Otherwise there is just one following character */
1251    
1252     else
1253     {
1254     name[0] = c;
1255     name[1] = 0;
1256     }
1257    
1258     *ptrptr = ptr;
1259    
1260     /* Search for a recognized property name using binary chop */
1261    
1262     bot = 0;
1263 ph10 836 top = PRIV(utt_size);
1264 nigel 77
1265     while (bot < top)
1266     {
1267 chpe 1065 int r;
1268 nigel 87 i = (bot + top) >> 1;
1269 chpe 1065 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1270     if (r == 0)
1271 nigel 87 {
1272 chpe 1129 *ptypeptr = PRIV(utt)[i].type;
1273     *pdataptr = PRIV(utt)[i].value;
1274     return TRUE;
1275 nigel 87 }
1276 chpe 1065 if (r > 0) bot = i + 1; else top = i;
1277 nigel 77 }
1278    
1279     *errorcodeptr = ERR47;
1280     *ptrptr = ptr;
1281 chpe 1129 return FALSE;
1282 nigel 77
1283     ERROR_RETURN:
1284     *errorcodeptr = ERR46;
1285     *ptrptr = ptr;
1286 chpe 1129 return FALSE;
1287 nigel 77 }
1288     #endif
1289    
1290    
1291    
1292    
1293     /*************************************************
1294     * Read repeat counts *
1295     *************************************************/
1296    
1297     /* Read an item of the form {n,m} and return the values. This is called only
1298     after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1299     so the syntax is guaranteed to be correct, but we need to check the values.
1300    
1301     Arguments:
1302     p pointer to first char after '{'
1303     minp pointer to int for min
1304     maxp pointer to int for max
1305     returned as -1 if no max
1306     errorcodeptr points to error code variable
1307    
1308     Returns: pointer to '}' on success;
1309     current ptr on error, with errorcodeptr set non-zero
1310     */
1311    
1312 ph10 836 static const pcre_uchar *
1313     read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1314 nigel 77 {
1315     int min = 0;
1316     int max = -1;
1317    
1318 nigel 81 /* Read the minimum value and do a paranoid check: a negative value indicates
1319     an integer overflow. */
1320    
1321 chpe 1066 while (IS_DIGIT(*p)) min = min * 10 + (int)(*p++ - CHAR_0);
1322 nigel 81 if (min < 0 || min > 65535)
1323     {
1324     *errorcodeptr = ERR5;
1325     return p;
1326     }
1327 nigel 77
1328 nigel 81 /* Read the maximum value if there is one, and again do a paranoid on its size.
1329     Also, max must not be less than min. */
1330    
1331 ph10 391 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1332 nigel 77 {
1333 ph10 391 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1334 nigel 77 {
1335     max = 0;
1336 chpe 1066 while(IS_DIGIT(*p)) max = max * 10 + (int)(*p++ - CHAR_0);
1337 nigel 81 if (max < 0 || max > 65535)
1338     {
1339     *errorcodeptr = ERR5;
1340     return p;
1341     }
1342 nigel 77 if (max < min)
1343     {
1344     *errorcodeptr = ERR4;
1345     return p;
1346     }
1347     }
1348     }
1349    
1350 nigel 81 /* Fill in the required variables, and pass back the pointer to the terminating
1351     '}'. */
1352 nigel 77
1353 nigel 81 *minp = min;
1354     *maxp = max;
1355 nigel 77 return p;
1356     }
1357    
1358    
1359    
1360     /*************************************************
1361 ph10 408 * Subroutine for finding forward reference *
1362 nigel 91 *************************************************/
1363    
1364 ph10 408 /* This recursive function is called only from find_parens() below. The
1365     top-level call starts at the beginning of the pattern. All other calls must
1366     start at a parenthesis. It scans along a pattern's text looking for capturing
1367 nigel 93 subpatterns, and counting them. If it finds a named pattern that matches the
1368     name it is given, it returns its number. Alternatively, if the name is NULL, it
1369 ph10 578 returns when it reaches a given numbered subpattern. Recursion is used to keep
1370     track of subpatterns that reset the capturing group numbers - the (?| feature.
1371 nigel 91
1372 ph10 578 This function was originally called only from the second pass, in which we know
1373     that if (?< or (?' or (?P< is encountered, the name will be correctly
1374     terminated because that is checked in the first pass. There is now one call to
1375     this function in the first pass, to check for a recursive back reference by
1376     name (so that we can make the whole group atomic). In this case, we need check
1377 ph10 579 only up to the current position in the pattern, and that is still OK because
1378     any previous occurrences will have been checked. To make this work, the test
1379     for "end of pattern" is a check against cd->end_pattern in the main loop,
1380 ph10 578 instead of looking for a binary zero. This means that the special first-pass
1381 ph10 579 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1382     processing items within the loop are OK, because afterwards the main loop will
1383 ph10 578 terminate.)
1384    
1385 nigel 91 Arguments:
1386 ph10 408 ptrptr address of the current character pointer (updated)
1387 ph10 345 cd compile background data
1388 nigel 93 name name to seek, or NULL if seeking a numbered subpattern
1389     lorn name length, or subpattern number if name is NULL
1390     xmode TRUE if we are in /x mode
1391 chpe 1055 utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1392 ph10 411 count pointer to the current capturing subpattern number (updated)
1393 nigel 91
1394     Returns: the number of the named subpattern, or -1 if not found
1395     */
1396    
1397     static int
1398 ph10 836 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1399     BOOL xmode, BOOL utf, int *count)
1400 nigel 91 {
1401 ph10 836 pcre_uchar *ptr = *ptrptr;
1402 ph10 408 int start_count = *count;
1403     int hwm_count = start_count;
1404     BOOL dup_parens = FALSE;
1405 nigel 93
1406 ph10 411 /* If the first character is a parenthesis, check on the type of group we are
1407 ph10 408 dealing with. The very first call may not start with a parenthesis. */
1408    
1409     if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1410     {
1411 ph10 544 /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1412 ph10 545
1413 ph10 1313 if (ptr[1] == CHAR_ASTERISK)
1414 ph10 1305 {
1415     ptr += 2;
1416     while (ptr < cd->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1417 ph10 1313 }
1418 ph10 545
1419 ph10 544 /* Handle a normal, unnamed capturing parenthesis. */
1420 ph10 408
1421 ph10 544 else if (ptr[1] != CHAR_QUESTION_MARK)
1422 ph10 408 {
1423     *count += 1;
1424     if (name == NULL && *count == lorn) return *count;
1425 ph10 411 ptr++;
1426 ph10 408 }
1427    
1428 ph10 544 /* All cases now have (? at the start. Remember when we are in a group
1429     where the parenthesis numbers are duplicated. */
1430    
1431     else if (ptr[2] == CHAR_VERTICAL_LINE)
1432     {
1433     ptr += 3;
1434     dup_parens = TRUE;
1435     }
1436 ph10 545
1437 ph10 544 /* Handle comments; all characters are allowed until a ket is reached. */
1438    
1439     else if (ptr[2] == CHAR_NUMBER_SIGN)
1440     {
1441 ph10 1221 for (ptr += 3; *ptr != CHAR_NULL; ptr++)
1442 ph10 1186 if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1443 ph10 544 goto FAIL_EXIT;
1444 ph10 545 }
1445 ph10 544
1446 ph10 408 /* Handle a condition. If it is an assertion, just carry on so that it
1447     is processed as normal. If not, skip to the closing parenthesis of the
1448 ph10 544 condition (there can't be any nested parens). */
1449 ph10 411
1450 ph10 408 else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1451     {
1452 ph10 411 ptr += 2;
1453 ph10 408 if (ptr[1] != CHAR_QUESTION_MARK)
1454     {
1455 ph10 1186 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1456     if (*ptr != CHAR_NULL) ptr++;
1457 ph10 408 }
1458 ph10 411 }
1459    
1460 ph10 544 /* Start with (? but not a condition. */
1461 ph10 408
1462     else
1463 ph10 411 {
1464 ph10 408 ptr += 2;
1465     if (*ptr == CHAR_P) ptr++; /* Allow optional P */
1466    
1467     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1468 ph10 411
1469 ph10 408 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1470     ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1471     {
1472 chpe 1067 pcre_uchar term;
1473 ph10 836 const pcre_uchar *thisname;
1474 ph10 408 *count += 1;
1475     if (name == NULL && *count == lorn) return *count;
1476     term = *ptr++;
1477     if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1478     thisname = ptr;
1479     while (*ptr != term) ptr++;
1480 chpe 1067 if (name != NULL && lorn == (int)(ptr - thisname) &&
1481     STRNCMP_UC_UC(name, thisname, (unsigned int)lorn) == 0)
1482 ph10 408 return *count;
1483 ph10 461 term++;
1484 ph10 411 }
1485 ph10 408 }
1486 ph10 411 }
1487 ph10 408
1488 ph10 411 /* Past any initial parenthesis handling, scan for parentheses or vertical
1489 ph10 579 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1490     first-pass call when this value is temporarily adjusted to stop at the current
1491 ph10 578 position. So DO NOT change this to a test for binary zero. */
1492 ph10 408
1493 ph10 578 for (; ptr < cd->end_pattern; ptr++)
1494 nigel 91 {
1495 nigel 93 /* Skip over backslashed characters and also entire \Q...\E */
1496    
1497 ph10 391 if (*ptr == CHAR_BACKSLASH)
1498 nigel 93 {
1499 ph10 1186 if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;
1500 ph10 391 if (*ptr == CHAR_Q) for (;;)
1501 nigel 93 {
1502 ph10 1186 while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};
1503     if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1504 ph10 391 if (*(++ptr) == CHAR_E) break;
1505 nigel 93 }
1506     continue;
1507     }
1508    
1509 ph10 340 /* Skip over character classes; this logic must be similar to the way they
1510     are handled for real. If the first character is '^', skip it. Also, if the
1511     first few characters (either before or after ^) are \Q\E or \E we skip them
1512 ph10 392 too. This makes for compatibility with Perl. Note the use of STR macros to
1513 ph10 391 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1514 nigel 93
1515 ph10 391 if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1516 nigel 93 {
1517 ph10 340 BOOL negate_class = FALSE;
1518     for (;;)
1519     {
1520 ph10 438 if (ptr[1] == CHAR_BACKSLASH)
1521 ph10 340 {
1522 ph10 438 if (ptr[2] == CHAR_E)
1523     ptr+= 2;
1524 ph10 836 else if (STRNCMP_UC_C8(ptr + 2,
1525 ph10 392 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1526 ph10 438 ptr += 4;
1527 ph10 392 else
1528 ph10 391 break;
1529 ph10 340 }
1530 ph10 438 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1531 ph10 461 {
1532 ph10 340 negate_class = TRUE;
1533 ph10 438 ptr++;
1534 ph10 461 }
1535 ph10 340 else break;
1536     }
1537    
1538     /* If the next character is ']', it is a data character that must be
1539 ph10 341 skipped, except in JavaScript compatibility mode. */
1540 ph10 345
1541 ph10 392 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1542 ph10 391 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1543 ph10 345 ptr++;
1544    
1545 ph10 391 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1546 nigel 93 {
1547 ph10 1186 if (*ptr == CHAR_NULL) return -1;
1548 ph10 391 if (*ptr == CHAR_BACKSLASH)
1549 nigel 93 {
1550 ph10 1186 if (*(++ptr) == CHAR_NULL) goto FAIL_EXIT;
1551 ph10 391 if (*ptr == CHAR_Q) for (;;)
1552 nigel 93 {
1553 ph10 1186 while (*(++ptr) != CHAR_NULL && *ptr != CHAR_BACKSLASH) {};
1554     if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1555 ph10 391 if (*(++ptr) == CHAR_E) break;
1556 nigel 93 }
1557     continue;
1558     }
1559     }
1560     continue;
1561     }
1562    
1563     /* Skip comments in /x mode */
1564    
1565 ph10 391 if (xmode && *ptr == CHAR_NUMBER_SIGN)
1566 nigel 93 {
1567 ph10 579 ptr++;
1568 ph10 1186 while (*ptr != CHAR_NULL)
1569 ph10 556 {
1570     if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1571     ptr++;
1572 ph10 836 #ifdef SUPPORT_UTF
1573     if (utf) FORWARDCHAR(ptr);
1574 ph10 556 #endif
1575     }
1576 ph10 1186 if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1577 nigel 93 continue;
1578     }
1579    
1580 ph10 408 /* Check for the special metacharacters */
1581 ph10 411
1582 ph10 408 if (*ptr == CHAR_LEFT_PARENTHESIS)
1583 nigel 93 {
1584 ph10 836 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1585 ph10 408 if (rc > 0) return rc;
1586 ph10 1186 if (*ptr == CHAR_NULL) goto FAIL_EXIT;
1587 nigel 93 }
1588 ph10 411
1589 ph10 408 else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1590     {
1591     if (dup_parens && *count < hwm_count) *count = hwm_count;
1592 ph10 545 goto FAIL_EXIT;
1593 ph10 408 }
1594 ph10 411
1595     else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1596 ph10 408 {
1597     if (*count > hwm_count) hwm_count = *count;
1598     *count = start_count;
1599 ph10 411 }
1600 ph10 408 }
1601 nigel 93
1602 ph10 408 FAIL_EXIT:
1603     *ptrptr = ptr;
1604     return -1;
1605     }
1606 nigel 93
1607    
1608    
1609    
1610 ph10 408 /*************************************************
1611     * Find forward referenced subpattern *
1612     *************************************************/
1613 nigel 93
1614 ph10 408 /* This function scans along a pattern's text looking for capturing
1615     subpatterns, and counting them. If it finds a named pattern that matches the
1616     name it is given, it returns its number. Alternatively, if the name is NULL, it
1617     returns when it reaches a given numbered subpattern. This is used for forward
1618     references to subpatterns. We used to be able to start this scan from the
1619     current compiling point, using the current count value from cd->bracount, and
1620     do it all in a single loop, but the addition of the possibility of duplicate
1621     subpattern numbers means that we have to scan from the very start, in order to
1622     take account of such duplicates, and to use a recursive function to keep track
1623     of the different types of group.
1624    
1625     Arguments:
1626     cd compile background data
1627     name name to seek, or NULL if seeking a numbered subpattern
1628     lorn name length, or subpattern number if name is NULL
1629     xmode TRUE if we are in /x mode
1630 chpe 1055 utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode
1631 ph10 408
1632     Returns: the number of the found subpattern, or -1 if not found
1633     */
1634    
1635     static int
1636 ph10 836 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1637     BOOL utf)
1638 ph10 408 {
1639 ph10 836 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1640 ph10 408 int count = 0;
1641     int rc;
1642    
1643     /* If the pattern does not start with an opening parenthesis, the first call
1644     to find_parens_sub() will scan right to the end (if necessary). However, if it
1645     does start with a parenthesis, find_parens_sub() will return when it hits the
1646     matching closing parens. That is why we have to have a loop. */
1647    
1648 ph10 411 for (;;)
1649     {
1650 ph10 836 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1651 ph10 1186 if (rc > 0 || *ptr++ == CHAR_NULL) break;
1652 ph10 411 }
1653    
1654 ph10 408 return rc;
1655 nigel 91 }
1656    
1657    
1658    
1659 ph10 408
1660 nigel 91 /*************************************************
1661 nigel 77 * Find first significant op code *
1662     *************************************************/
1663    
1664     /* This is called by several functions that scan a compiled expression looking
1665     for a fixed first character, or an anchoring op code etc. It skips over things
1666 ph10 602 that do not influence this. For some calls, it makes sense to skip negative
1667     forward and all backward assertions, and also the \b assertion; for others it
1668     does not.
1669 nigel 77
1670     Arguments:
1671     code pointer to the start of the group
1672     skipassert TRUE if certain assertions are to be skipped
1673    
1674     Returns: pointer to the first significant opcode
1675     */
1676    
1677 ph10 836 static const pcre_uchar*
1678     first_significant_code(const pcre_uchar *code, BOOL skipassert)
1679 nigel 77 {
1680     for (;;)
1681     {
1682     switch ((int)*code)
1683     {
1684     case OP_ASSERT_NOT:
1685     case OP_ASSERTBACK:
1686     case OP_ASSERTBACK_NOT:
1687     if (!skipassert) return code;
1688     do code += GET(code, 1); while (*code == OP_ALT);
1689 ph10 836 code += PRIV(OP_lengths)[*code];
1690 nigel 77 break;
1691    
1692     case OP_WORD_BOUNDARY:
1693     case OP_NOT_WORD_BOUNDARY:
1694     if (!skipassert) return code;
1695     /* Fall through */
1696    
1697     case OP_CALLOUT:
1698     case OP_CREF:
1699 ph10 459 case OP_NCREF:
1700 nigel 93 case OP_RREF:
1701 ph10 459 case OP_NRREF:
1702 nigel 93 case OP_DEF:
1703 ph10 836 code += PRIV(OP_lengths)[*code];
1704 nigel 77 break;
1705    
1706     default:
1707     return code;
1708     }
1709     }
1710     /* Control never reaches here */
1711     }
1712    
1713    
1714    
1715    
1716     /*************************************************
1717 ph10 454 * Find the fixed length of a branch *
1718 nigel 77 *************************************************/
1719    
1720 ph10 454 /* Scan a branch and compute the fixed length of subject that will match it,
1721 nigel 77 if the length is fixed. This is needed for dealing with backward assertions.
1722 ph10 461 In UTF8 mode, the result is in characters rather than bytes. The branch is
1723 ph10 454 temporarily terminated with OP_END when this function is called.
1724 nigel 77
1725 ph10 461 This function is called when a backward assertion is encountered, so that if it
1726     fails, the error message can point to the correct place in the pattern.
1727 ph10 454 However, we cannot do this when the assertion contains subroutine calls,
1728 ph10 461 because they can be forward references. We solve this by remembering this case
1729 ph10 454 and doing the check at the end; a flag specifies which mode we are running in.
1730    
1731 nigel 77 Arguments:
1732     code points to the start of the pattern (the bracket)
1733 chpe 1055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
1734 ph10 461 atend TRUE if called when the pattern is complete
1735     cd the "compile data" structure
1736 nigel 77
1737 ph10 461 Returns: the fixed length,
1738 ph10 454 or -1 if there is no fixed length,
1739 ph10 754 or -2 if \C was encountered (in UTF-8 mode only)
1740 ph10 454 or -3 if an OP_RECURSE item was encountered and atend is FALSE
1741 ph10 747 or -4 if an unknown opcode was encountered (internal error)
1742 nigel 77 */
1743    
1744     static int
1745 ph10 836 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1746 nigel 77 {
1747     int length = -1;
1748    
1749     register int branchlength = 0;
1750 ph10 836 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1751 nigel 77
1752     /* Scan along the opcodes for this branch. If we get to the end of the
1753     branch, check the length against that of the other branches. */
1754    
1755     for (;;)
1756     {
1757     int d;
1758 ph10 836 pcre_uchar *ce, *cs;
1759 chpe 1068 register pcre_uchar op = *cc;
1760 ph10 842
1761 nigel 77 switch (op)
1762     {
1763 ph10 604 /* We only need to continue for OP_CBRA (normal capturing bracket) and
1764     OP_BRA (normal non-capturing bracket) because the other variants of these
1765     opcodes are all concerned with unlimited repeated groups, which of course
1766 ph10 747 are not of fixed length. */
1767 ph10 604
1768 nigel 93 case OP_CBRA:
1769 nigel 77 case OP_BRA:
1770     case OP_ONCE:
1771 ph10 733 case OP_ONCE_NC:
1772 nigel 77 case OP_COND:
1773 ph10 836 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1774 nigel 77 if (d < 0) return d;
1775     branchlength += d;
1776     do cc += GET(cc, 1); while (*cc == OP_ALT);
1777     cc += 1 + LINK_SIZE;
1778     break;
1779    
1780 ph10 747 /* Reached end of a branch; if it's a ket it is the end of a nested call.
1781     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1782     an ALT. If it is END it's the end of the outer call. All can be handled by
1783     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1784     because they all imply an unlimited repeat. */
1785 nigel 77
1786     case OP_ALT:
1787     case OP_KET:
1788     case OP_END:
1789 ph10 747 case OP_ACCEPT:
1790     case OP_ASSERT_ACCEPT:
1791 nigel 77 if (length < 0) length = branchlength;
1792     else if (length != branchlength) return -1;
1793     if (*cc != OP_ALT) return length;
1794     cc += 1 + LINK_SIZE;
1795     branchlength = 0;
1796     break;
1797 ph10 461
1798 ph10 454 /* A true recursion implies not fixed length, but a subroutine call may
1799     be OK. If the subroutine is a forward reference, we can't deal with
1800     it until the end of the pattern, so return -3. */
1801 ph10 461
1802 ph10 454 case OP_RECURSE:
1803     if (!atend) return -3;
1804 ph10 836 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
1805     do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
1806     if (cc > cs && cc < ce) return -1; /* Recursion */
1807     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1808 ph10 461 if (d < 0) return d;
1809 ph10 454 branchlength += d;
1810     cc += 1 + LINK_SIZE;
1811 ph10 461 break;
1812 nigel 77
1813     /* Skip over assertive subpatterns */
1814    
1815     case OP_ASSERT:
1816     case OP_ASSERT_NOT:
1817     case OP_ASSERTBACK:
1818     case OP_ASSERTBACK_NOT:
1819     do cc += GET(cc, 1); while (*cc == OP_ALT);
1820 ph10 836 cc += PRIV(OP_lengths)[*cc];
1821 ph10 842 break;
1822 nigel 77
1823     /* Skip over things that don't match chars */
1824    
1825 ph10 747 case OP_MARK:
1826     case OP_PRUNE_ARG:
1827     case OP_SKIP_ARG:
1828     case OP_THEN_ARG:
1829 ph10 836 cc += cc[1] + PRIV(OP_lengths)[*cc];
1830 ph10 747 break;
1831    
1832 nigel 77 case OP_CALLOUT:
1833     case OP_CIRC:
1834 ph10 602 case OP_CIRCM:
1835 ph10 747 case OP_CLOSE:
1836     case OP_COMMIT:
1837     case OP_CREF:
1838     case OP_DEF:
1839 nigel 77 case OP_DOLL:
1840 ph10 602 case OP_DOLLM:
1841 ph10 747 case OP_EOD:
1842     case OP_EODN:
1843     case OP_FAIL:
1844     case OP_NCREF:
1845     case OP_NRREF:
1846 nigel 77 case OP_NOT_WORD_BOUNDARY:
1847 ph10 747 case OP_PRUNE:
1848     case OP_REVERSE:
1849     case OP_RREF:
1850     case OP_SET_SOM:
1851     case OP_SKIP:
1852     case OP_SOD:
1853     case OP_SOM:
1854     case OP_THEN:
1855 nigel 77 case OP_WORD_BOUNDARY:
1856 ph10 836 cc += PRIV(OP_lengths)[*cc];
1857 nigel 77 break;
1858    
1859     /* Handle literal characters */
1860    
1861     case OP_CHAR:
1862 ph10 602 case OP_CHARI:
1863 nigel 91 case OP_NOT:
1864 ph10 604 case OP_NOTI:
1865 nigel 77 branchlength++;
1866     cc += 2;
1867 chpe 1114 #ifdef SUPPORT_UTF
1868 ph10 836 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1869 nigel 77 #endif
1870     break;
1871    
1872     /* Handle exact repetitions. The count is already in characters, but we
1873     need to skip over a multibyte character in UTF8 mode. */
1874    
1875     case OP_EXACT:
1876 ph10 747 case OP_EXACTI:
1877     case OP_NOTEXACT:
1878     case OP_NOTEXACTI:
1879 chpe 1068 branchlength += (int)GET2(cc,1);
1880 ph10 836 cc += 2 + IMM2_SIZE;
1881 chpe 1114 #ifdef SUPPORT_UTF
1882 ph10 836 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1883 nigel 77 #endif
1884     break;
1885    
1886     case OP_TYPEEXACT:
1887     branchlength += GET2(cc,1);
1888 ph10 1221 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1889 ph10 1046 cc += 2;
1890 ph10 836 cc += 1 + IMM2_SIZE + 1;
1891 nigel 77 break;
1892    
1893     /* Handle single-char matchers */
1894    
1895     case OP_PROP:
1896     case OP_NOTPROP:
1897 nigel 87 cc += 2;
1898 nigel 77 /* Fall through */
1899    
1900 ph10 747 case OP_HSPACE:
1901     case OP_VSPACE:
1902     case OP_NOT_HSPACE:
1903     case OP_NOT_VSPACE:
1904 nigel 77 case OP_NOT_DIGIT:
1905     case OP_DIGIT:
1906     case OP_NOT_WHITESPACE:
1907     case OP_WHITESPACE:
1908     case OP_NOT_WORDCHAR:
1909     case OP_WORDCHAR:
1910     case OP_ANY:
1911 ph10 342 case OP_ALLANY:
1912 nigel 77 branchlength++;
1913     cc++;
1914     break;
1915    
1916 ph10 836 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1917 ph10 754 otherwise \C is coded as OP_ALLANY. */
1918 nigel 77
1919     case OP_ANYBYTE:
1920     return -2;
1921    
1922     /* Check a class for variable quantification */
1923    
1924     case OP_CLASS:
1925     case OP_NCLASS:
1926 chpe 1147 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1927 zherczeg 1148 case OP_XCLASS:
1928     /* The original code caused an unsigned overflow in 64 bit systems,
1929     so now we use a conditional statement. */
1930     if (op == OP_XCLASS)
1931 chpe 1147 cc += GET(cc, 1);
1932 zherczeg 1148 else
1933     cc += PRIV(OP_lengths)[OP_CLASS];
1934     #else
1935     cc += PRIV(OP_lengths)[OP_CLASS];
1936 chpe 1147 #endif
1937    
1938 nigel 77 switch (*cc)
1939     {
1940 ph10 747 case OP_CRPLUS:
1941     case OP_CRMINPLUS:
1942 nigel 77 case OP_CRSTAR:
1943     case OP_CRMINSTAR:
1944     case OP_CRQUERY:
1945     case OP_CRMINQUERY:
1946     return -1;
1947    
1948     case OP_CRRANGE:
1949     case OP_CRMINRANGE:
1950 ph10 836 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1951 chpe 1068 branchlength += (int)GET2(cc,1);
1952 ph10 836 cc += 1 + 2 * IMM2_SIZE;
1953 nigel 77 break;
1954    
1955     default:
1956     branchlength++;
1957     }
1958     break;
1959    
1960     /* Anything else is variable length */
1961    
1962 ph10 747 case OP_ANYNL:
1963     case OP_BRAMINZERO:
1964     case OP_BRAPOS:
1965     case OP_BRAPOSZERO:
1966     case OP_BRAZERO:
1967     case OP_CBRAPOS:
1968     case OP_EXTUNI:
1969     case OP_KETRMAX:
1970     case OP_KETRMIN:
1971     case OP_KETRPOS:
1972     case OP_MINPLUS:
1973     case OP_MINPLUSI:
1974     case OP_MINQUERY:
1975     case OP_MINQUERYI:
1976     case OP_MINSTAR:
1977     case OP_MINSTARI:
1978     case OP_MINUPTO:
1979     case OP_MINUPTOI:
1980     case OP_NOTMINPLUS:
1981     case OP_NOTMINPLUSI:
1982     case OP_NOTMINQUERY:
1983     case OP_NOTMINQUERYI:
1984     case OP_NOTMINSTAR:
1985     case OP_NOTMINSTARI:
1986     case OP_NOTMINUPTO:
1987     case OP_NOTMINUPTOI:
1988     case OP_NOTPLUS:
1989     case OP_NOTPLUSI:
1990     case OP_NOTPOSPLUS:
1991     case OP_NOTPOSPLUSI:
1992     case OP_NOTPOSQUERY:
1993     case OP_NOTPOSQUERYI:
1994     case OP_NOTPOSSTAR:
1995     case OP_NOTPOSSTARI:
1996     case OP_NOTPOSUPTO:
1997     case OP_NOTPOSUPTOI:
1998     case OP_NOTQUERY:
1999     case OP_NOTQUERYI:
2000     case OP_NOTSTAR:
2001     case OP_NOTSTARI:
2002     case OP_NOTUPTO:
2003     case OP_NOTUPTOI:
2004     case OP_PLUS:
2005     case OP_PLUSI:
2006     case OP_POSPLUS:
2007     case OP_POSPLUSI:
2008     case OP_POSQUERY:
2009     case OP_POSQUERYI:
2010     case OP_POSSTAR:
2011     case OP_POSSTARI:
2012     case OP_POSUPTO:
2013     case OP_POSUPTOI:
2014     case OP_QUERY:
2015     case OP_QUERYI:
2016     case OP_REF:
2017     case OP_REFI:
2018     case OP_SBRA:
2019     case OP_SBRAPOS:
2020     case OP_SCBRA:
2021     case OP_SCBRAPOS:
2022     case OP_SCOND:
2023     case OP_SKIPZERO:
2024     case OP_STAR:
2025     case OP_STARI:
2026     case OP_TYPEMINPLUS:
2027     case OP_TYPEMINQUERY:
2028     case OP_TYPEMINSTAR:
2029     case OP_TYPEMINUPTO:
2030     case OP_TYPEPLUS:
2031     case OP_TYPEPOSPLUS:
2032     case OP_TYPEPOSQUERY:
2033     case OP_TYPEPOSSTAR:
2034     case OP_TYPEPOSUPTO:
2035     case OP_TYPEQUERY:
2036     case OP_TYPESTAR:
2037     case OP_TYPEUPTO:
2038     case OP_UPTO:
2039     case OP_UPTOI:
2040     return -1;
2041    
2042     /* Catch unrecognized opcodes so that when new ones are added they
2043     are not forgotten, as has happened in the past. */
2044    
2045 nigel 77 default:
2046 ph10 747 return -4;
2047 nigel 77 }
2048     }
2049     /* Control never gets here */
2050     }
2051    
2052    
2053    
2054    
2055     /*************************************************
2056 ph10 454 * Scan compiled regex for specific bracket *
2057 nigel 77 *************************************************/
2058    
2059     /* This little function scans through a compiled pattern until it finds a
2060 ph10 454 capturing bracket with the given number, or, if the number is negative, an
2061 ph10 461 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2062     so that it can be called from pcre_study() when finding the minimum matching
2063 ph10 455 length.
2064 nigel 77
2065     Arguments:
2066     code points to start of expression
2067 chpe 1055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2068 ph10 454 number the required bracket number or negative to find a lookbehind
2069 nigel 77
2070     Returns: pointer to the opcode for the bracket, or NULL if not found
2071     */
2072    
2073 ph10 836 const pcre_uchar *
2074     PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2075 nigel 77 {
2076     for (;;)
2077     {
2078 chpe 1069 register pcre_uchar c = *code;
2079 ph10 618
2080 nigel 77 if (c == OP_END) return NULL;
2081 nigel 91
2082     /* XCLASS is used for classes that cannot be represented just by a bit
2083     map. This includes negated single high-valued characters. The length in
2084     the table is zero; the actual length is stored in the compiled code. */
2085    
2086     if (c == OP_XCLASS) code += GET(code, 1);
2087 ph10 461
2088 ph10 454 /* Handle recursion */
2089 ph10 461
2090 ph10 454 else if (c == OP_REVERSE)
2091     {
2092 ph10 836 if (number < 0) return (pcre_uchar *)code;
2093     code += PRIV(OP_lengths)[c];
2094 ph10 454 }
2095 nigel 91
2096 nigel 93 /* Handle capturing bracket */
2097 nigel 91
2098 ph10 604 else if (c == OP_CBRA || c == OP_SCBRA ||
2099     c == OP_CBRAPOS || c == OP_SCBRAPOS)
2100 nigel 77 {
2101 chpe 1069 int n = (int)GET2(code, 1+LINK_SIZE);
2102 ph10 836 if (n == number) return (pcre_uchar *)code;
2103     code += PRIV(OP_lengths)[c];
2104 nigel 77 }
2105 nigel 91
2106 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2107     repeated character types, we have to test for \p and \P, which have an extra
2108 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2109 ph10 510 must add in its length. */
2110 nigel 91
2111 nigel 77 else
2112     {
2113 ph10 218 switch(c)
2114     {
2115     case OP_TYPESTAR:
2116     case OP_TYPEMINSTAR:
2117     case OP_TYPEPLUS:
2118     case OP_TYPEMINPLUS:
2119     case OP_TYPEQUERY:
2120     case OP_TYPEMINQUERY:
2121     case OP_TYPEPOSSTAR:
2122     case OP_TYPEPOSPLUS:
2123     case OP_TYPEPOSQUERY:
2124     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2125 ph10 220 break;
2126 ph10 221
2127     case OP_TYPEUPTO:
2128     case OP_TYPEMINUPTO:
2129     case OP_TYPEEXACT:
2130     case OP_TYPEPOSUPTO:
2131 ph10 1221 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2132 ph10 1046 code += 2;
2133 ph10 221 break;
2134 ph10 512
2135 ph10 510 case OP_MARK:
2136     case OP_PRUNE_ARG:
2137     case OP_SKIP_ARG:
2138 ph10 550 case OP_THEN_ARG:
2139 ph10 716 code += code[1];
2140 ph10 550 break;
2141 ph10 220 }
2142    
2143 ph10 218 /* Add in the fixed length from the table */
2144 ph10 220
2145 ph10 836 code += PRIV(OP_lengths)[c];
2146 ph10 220
2147 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2148     a multi-byte character. The length in the table is a minimum, so we have to
2149     arrange to skip the extra bytes. */
2150 ph10 220
2151 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2152 ph10 836 if (utf) switch(c)
2153 nigel 77 {
2154     case OP_CHAR:
2155 ph10 602 case OP_CHARI:
2156 nigel 77 case OP_EXACT:
2157 ph10 602 case OP_EXACTI:
2158 nigel 77 case OP_UPTO:
2159 ph10 602 case OP_UPTOI:
2160 nigel 77 case OP_MINUPTO:
2161 ph10 602 case OP_MINUPTOI:
2162 nigel 93 case OP_POSUPTO:
2163 ph10 602 case OP_POSUPTOI:
2164 nigel 77 case OP_STAR:
2165 ph10 602 case OP_STARI:
2166 nigel 77 case OP_MINSTAR:
2167 ph10 602 case OP_MINSTARI:
2168 nigel 93 case OP_POSSTAR:
2169 ph10 602 case OP_POSSTARI:
2170 nigel 77 case OP_PLUS:
2171 ph10 602 case OP_PLUSI:
2172 nigel 77 case OP_MINPLUS:
2173 ph10 602 case OP_MINPLUSI:
2174 nigel 93 case OP_POSPLUS:
2175 ph10 602 case OP_POSPLUSI:
2176 nigel 77 case OP_QUERY:
2177 ph10 602 case OP_QUERYI:
2178 nigel 77 case OP_MINQUERY:
2179 ph10 602 case OP_MINQUERYI:
2180 nigel 93 case OP_POSQUERY:
2181 ph10 602 case OP_POSQUERYI:
2182 ph10 836 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2183 nigel 77 break;
2184     }
2185 ph10 369 #else
2186 ph10 836 (void)(utf); /* Keep compiler happy by referencing function argument */
2187 ph10 111 #endif
2188 nigel 77 }
2189     }
2190     }
2191    
2192    
2193    
2194     /*************************************************
2195     * Scan compiled regex for recursion reference *
2196     *************************************************/
2197    
2198     /* This little function scans through a compiled pattern until it finds an
2199     instance of OP_RECURSE.
2200    
2201     Arguments:
2202     code points to start of expression
2203 chpe 1055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2204 nigel 77
2205     Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
2206     */
2207    
2208 ph10 836 static const pcre_uchar *
2209     find_recurse(const pcre_uchar *code, BOOL utf)
2210 nigel 77 {
2211     for (;;)
2212     {
2213 chpe 1070 register pcre_uchar c = *code;
2214 nigel 77 if (c == OP_END) return NULL;
2215 nigel 91 if (c == OP_RECURSE) return code;
2216 ph10 220
2217 nigel 91 /* XCLASS is used for classes that cannot be represented just by a bit
2218     map. This includes negated single high-valued characters. The length in
2219     the table is zero; the actual length is stored in the compiled code. */
2220    
2221     if (c == OP_XCLASS) code += GET(code, 1);
2222    
2223 ph10 220 /* Otherwise, we can get the item's length from the table, except that for
2224     repeated character types, we have to test for \p and \P, which have an extra
2225 ph10 512 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2226 ph10 510 must add in its length. */
2227 nigel 91
2228 nigel 77 else
2229     {
2230 ph10 218 switch(c)
2231     {
2232     case OP_TYPESTAR:
2233     case OP_TYPEMINSTAR:
2234     case OP_TYPEPLUS:
2235     case OP_TYPEMINPLUS:
2236     case OP_TYPEQUERY:
2237     case OP_TYPEMINQUERY:
2238     case OP_TYPEPOSSTAR:
2239     case OP_TYPEPOSPLUS:
2240     case OP_TYPEPOSQUERY:
2241     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2242 ph10 220 break;
2243 ph10 221
2244     case OP_TYPEPOSUPTO:
2245     case OP_TYPEUPTO:
2246     case OP_TYPEMINUPTO:
2247     case OP_TYPEEXACT:
2248 ph10 1221 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2249 ph10 1046 code += 2;
2250 ph10 221 break;
2251 ph10 512
2252 ph10 510 case OP_MARK:
2253     case OP_PRUNE_ARG:
2254     case OP_SKIP_ARG:
2255 ph10 550 case OP_THEN_ARG:
2256 ph10 716 code += code[1];
2257 ph10 550 break;
2258 ph10 220 }
2259    
2260 ph10 218 /* Add in the fixed length from the table */
2261    
2262 ph10 836 code += PRIV(OP_lengths)[c];
2263 ph10 220
2264 ph10 218 /* In UTF-8 mode, opcodes that are followed by a character may be followed
2265     by a multi-byte character. The length in the table is a minimum, so we have
2266     to arrange to skip the extra bytes. */
2267 ph10 220
2268 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2269 ph10 836 if (utf) switch(c)
2270 nigel 77 {
2271     case OP_CHAR:
2272 ph10 602 case OP_CHARI:
2273 zherczeg 924 case OP_NOT:
2274     case OP_NOTI:
2275 nigel 77 case OP_EXACT:
2276 ph10 602 case OP_EXACTI:
2277 zherczeg 924 case OP_NOTEXACT:
2278     case OP_NOTEXACTI:
2279 nigel 77 case OP_UPTO:
2280 ph10 602 case OP_UPTOI:
2281 zherczeg 924 case OP_NOTUPTO:
2282     case OP_NOTUPTOI:
2283 nigel 77 case OP_MINUPTO:
2284 ph10 602 case OP_MINUPTOI:
2285 zherczeg 924 case OP_NOTMINUPTO:
2286     case OP_NOTMINUPTOI:
2287 nigel 93 case OP_POSUPTO:
2288 ph10 602 case OP_POSUPTOI:
2289 zherczeg 924 case OP_NOTPOSUPTO:
2290     case OP_NOTPOSUPTOI:
2291 nigel 77 case OP_STAR:
2292 ph10 602 case OP_STARI:
2293 zherczeg 924 case OP_NOTSTAR:
2294     case OP_NOTSTARI:
2295 nigel 77 case OP_MINSTAR:
2296 ph10 602 case OP_MINSTARI:
2297 zherczeg 924 case OP_NOTMINSTAR:
2298     case OP_NOTMINSTARI:
2299 nigel 93 case OP_POSSTAR:
2300 ph10 602 case OP_POSSTARI:
2301 zherczeg 924 case OP_NOTPOSSTAR:
2302     case OP_NOTPOSSTARI:
2303 nigel 77 case OP_PLUS:
2304 ph10 602 case OP_PLUSI:
2305 zherczeg 924 case OP_NOTPLUS:
2306     case OP_NOTPLUSI:
2307 nigel 77 case OP_MINPLUS:
2308 ph10 602 case OP_MINPLUSI:
2309 zherczeg 924 case OP_NOTMINPLUS:
2310     case OP_NOTMINPLUSI:
2311 nigel 93 case OP_POSPLUS:
2312 ph10 602 case OP_POSPLUSI:
2313 zherczeg 924 case OP_NOTPOSPLUS:
2314     case OP_NOTPOSPLUSI:
2315 nigel 77 case OP_QUERY:
2316 ph10 602 case OP_QUERYI:
2317 zherczeg 924 case OP_NOTQUERY:
2318     case OP_NOTQUERYI:
2319 nigel 77 case OP_MINQUERY:
2320 ph10 602 case OP_MINQUERYI:
2321 zherczeg 924 case OP_NOTMINQUERY:
2322     case OP_NOTMINQUERYI:
2323 nigel 93 case OP_POSQUERY:
2324 ph10 602 case OP_POSQUERYI:
2325 zherczeg 924 case OP_NOTPOSQUERY:
2326     case OP_NOTPOSQUERYI:
2327 ph10 836 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2328 nigel 77 break;
2329     }
2330 ph10 369 #else
2331 ph10 836 (void)(utf); /* Keep compiler happy by referencing function argument */
2332 ph10 111 #endif
2333 nigel 77 }
2334     }
2335     }
2336    
2337    
2338    
2339     /*************************************************
2340     * Scan compiled branch for non-emptiness *
2341     *************************************************/
2342    
2343     /* This function scans through a branch of a compiled pattern to see whether it
2344 nigel 93 can match the empty string or not. It is called from could_be_empty()
2345     below and from compile_branch() when checking for an unlimited repeat of a
2346     group that can match nothing. Note that first_significant_code() skips over
2347 ph10 282 backward and negative forward assertions when its final argument is TRUE. If we
2348     hit an unclosed bracket, we return "empty" - this means we've struck an inner
2349     bracket whose current branch will already have been scanned.
2350 nigel 77
2351     Arguments:
2352     code points to start of search
2353     endcode points to where to stop
2354 chpe 1055 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2355 ph10 503 cd contains pointers to tables etc.
2356 nigel 77
2357     Returns: TRUE if what is matched could be empty
2358     */
2359    
2360     static BOOL
2361 ph10 836 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2362     BOOL utf, compile_data *cd)
2363 nigel 77 {
2364 chpe 1071 register pcre_uchar c;
2365 ph10 836 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2366 nigel 77 code < endcode;
2367 ph10 836 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2368 nigel 77 {
2369 ph10 836 const pcre_uchar *ccode;
2370 nigel 77
2371     c = *code;
2372 ph10 507
2373 ph10 286 /* Skip over forward assertions; the other assertions are skipped by
2374 ph10 282 first_significant_code() with a TRUE final argument. */
2375 ph10 286
2376 ph10 282 if (c == OP_ASSERT)
2377 ph10 286 {
2378 ph10 282 do code += GET(code, 1); while (*code == OP_ALT);
2379     c = *code;
2380     continue;
2381 ph10 286 }
2382 ph10 172
2383 ph10 503 /* For a recursion/subroutine call, if its end has been reached, which
2384 ph10 624 implies a backward reference subroutine call, we can scan it. If it's a
2385     forward reference subroutine call, we can't. To detect forward reference
2386 ph10 654 we have to scan up the list that is kept in the workspace. This function is
2387     called only when doing the real compile, not during the pre-compile that
2388 ph10 624 measures the size of the compiled pattern. */
2389 ph10 507
2390 ph10 503 if (c == OP_RECURSE)
2391     {
2392 ph10 836 const pcre_uchar *scode;
2393 ph10 624 BOOL empty_branch;
2394 ph10 654
2395 ph10 624 /* Test for forward reference */
2396 ph10 654
2397 ph10 624 for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2398 chpe 1071 if ((int)GET(scode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2399 ph10 624
2400     /* Not a forward reference, test for completed backward reference */
2401 ph10 654
2402 ph10 624 empty_branch = FALSE;
2403     scode = cd->start_code + GET(code, 1);
2404 ph10 503 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
2405 ph10 654
2406 ph10 624 /* Completed backwards reference */
2407 ph10 654
2408 ph10 503 do
2409     {
2410 ph10 836 if (could_be_empty_branch(scode, endcode, utf, cd))
2411 ph10 504 {
2412     empty_branch = TRUE;
2413 ph10 507 break;
2414     }
2415 ph10 503 scode += GET(scode, 1);
2416     }
2417     while (*scode == OP_ALT);
2418 ph10 654
2419 ph10 504 if (!empty_branch) return FALSE; /* All branches are non-empty */
2420 ph10 503 continue;
2421 ph10 507 }
2422 ph10 170
2423 ph10 604 /* Groups with zero repeats can of course be empty; skip them. */
2424    
2425     if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2426     c == OP_BRAPOSZERO)
2427     {
2428 ph10 836 code += PRIV(OP_lengths)[c];
2429 ph10 604 do code += GET(code, 1); while (*code == OP_ALT);
2430     c = *code;
2431     continue;
2432     }
2433    
2434     /* A nested group that is already marked as "could be empty" can just be
2435     skipped. */
2436    
2437     if (c == OP_SBRA || c == OP_SBRAPOS ||
2438     c == OP_SCBRA || c == OP_SCBRAPOS)
2439     {
2440     do code += GET(code, 1); while (*code == OP_ALT);
2441     c = *code;
2442     continue;
2443     }
2444    
2445 ph10 170 /* For other groups, scan the branches. */
2446 ph10 172
2447 ph10 604 if (c == OP_BRA || c == OP_BRAPOS ||
2448     c == OP_CBRA || c == OP_CBRAPOS ||
2449 ph10 723 c == OP_ONCE || c == OP_ONCE_NC ||
2450     c == OP_COND)
2451 nigel 77 {
2452     BOOL empty_branch;
2453     if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
2454 ph10 406
2455     /* If a conditional group has only one branch, there is a second, implied,
2456 ph10 395 empty branch, so just skip over the conditional, because it could be empty.
2457     Otherwise, scan the individual branches of the group. */
2458 ph10 406
2459 ph10 395 if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2460 nigel 77 code += GET(code, 1);
2461 ph10 395 else
2462 ph10 406 {
2463 ph10 395 empty_branch = FALSE;
2464     do
2465     {
2466 ph10 836 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2467 ph10 395 empty_branch = TRUE;
2468     code += GET(code, 1);
2469     }
2470     while (*code == OP_ALT);
2471     if (!empty_branch) return FALSE; /* All branches are non-empty */
2472 nigel 77 }
2473 ph10 406
2474 ph10 172 c = *code;
2475 nigel 93 continue;
2476 nigel 77 }
2477    
2478 nigel 93 /* Handle the other opcodes */
2479    
2480     switch (c)
2481 nigel 77 {
2482 ph10 216 /* Check for quantifiers after a class. XCLASS is used for classes that
2483     cannot be represented just by a bit map. This includes negated single
2484 ph10 836 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2485 ph10 220 actual length is stored in the compiled code, so we must update "code"
2486 ph10 216 here. */
2487 nigel 77
2488 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2489 nigel 77 case OP_XCLASS:
2490 ph10 216 ccode = code += GET(code, 1);
2491 nigel 77 goto CHECK_CLASS_REPEAT;
2492     #endif
2493    
2494     case OP_CLASS:
2495     case OP_NCLASS:
2496 ph10 836 ccode = code + PRIV(OP_lengths)[OP_CLASS];
2497 nigel 77
2498 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2499 nigel 77 CHECK_CLASS_REPEAT:
2500     #endif
2501    
2502     switch (*ccode)
2503     {
2504     case OP_CRSTAR: /* These could be empty; continue */
2505     case OP_CRMINSTAR:
2506     case OP_CRQUERY:
2507     case OP_CRMINQUERY:
2508     break;
2509    
2510     default: /* Non-repeat => class must match */
2511     case OP_CRPLUS: /* These repeats aren't empty */
2512     case OP_CRMINPLUS:
2513     return FALSE;
2514    
2515     case OP_CRRANGE:
2516     case OP_CRMINRANGE:
2517     if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
2518     break;
2519     }
2520     break;
2521    
2522     /* Opcodes that must match a character */
2523    
2524     case OP_PROP:
2525     case OP_NOTPROP:
2526     case OP_EXTUNI:
2527     case OP_NOT_DIGIT:
2528     case OP_DIGIT:
2529     case OP_NOT_WHITESPACE:
2530     case OP_WHITESPACE:
2531     case OP_NOT_WORDCHAR:
2532     case OP_WORDCHAR:
2533     case OP_ANY:
2534 ph10 345 case OP_ALLANY:
2535 nigel 77 case OP_ANYBYTE:
2536     case OP_CHAR:
2537 ph10 602 case OP_CHARI:
2538 nigel 77 case OP_NOT:
2539 ph10 602 case OP_NOTI:
2540 nigel 77 case OP_PLUS:
2541     case OP_MINPLUS:
2542 nigel 93 case OP_POSPLUS:
2543 nigel 77 case OP_EXACT:
2544     case OP_NOTPLUS:
2545     case OP_NOTMINPLUS:
2546 nigel 93 case OP_NOTPOSPLUS:
2547 nigel 77 case OP_NOTEXACT:
2548     case OP_TYPEPLUS:
2549     case OP_TYPEMINPLUS:
2550 nigel 93 case OP_TYPEPOSPLUS:
2551 nigel 77 case OP_TYPEEXACT:
2552     return FALSE;
2553 ph10 227
2554     /* These are going to continue, as they may be empty, but we have to
2555     fudge the length for the \p and \P cases. */
2556    
2557 ph10 224 case OP_TYPESTAR:
2558     case OP_TYPEMINSTAR:
2559     case OP_TYPEPOSSTAR:
2560     case OP_TYPEQUERY:
2561     case OP_TYPEMINQUERY:
2562     case OP_TYPEPOSQUERY:
2563     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2564 ph10 227 break;
2565    
2566 ph10 224 /* Same for these */
2567 ph10 227
2568 ph10 224 case OP_TYPEUPTO:
2569     case OP_TYPEMINUPTO:
2570     case OP_TYPEPOSUPTO:
2571 ph10 1221 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2572 ph10 1046 code += 2;
2573 ph10 224 break;
2574 nigel 77
2575     /* End of branch */
2576    
2577     case OP_KET:
2578     case OP_KETRMAX:
2579     case OP_KETRMIN:
2580 ph10 604 case OP_KETRPOS:
2581 nigel 77 case OP_ALT:
2582     return TRUE;
2583    
2584 nigel 93 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2585     MINUPTO, and POSUPTO may be followed by a multibyte character */
2586 nigel 77
2587 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2588 nigel 77 case OP_STAR:
2589 ph10 602 case OP_STARI:
2590 nigel 77 case OP_MINSTAR:
2591 ph10 602 case OP_MINSTARI:
2592 nigel 93 case OP_POSSTAR:
2593 ph10 602 case OP_POSSTARI:
2594 nigel 77 case OP_QUERY:
2595 ph10 602 case OP_QUERYI:
2596 nigel 77 case OP_MINQUERY:
2597 ph10 602 case OP_MINQUERYI:
2598 nigel 93 case OP_POSQUERY:
2599 ph10 602 case OP_POSQUERYI:
2600 ph10 836 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2601 ph10 426 break;
2602 ph10 461
2603 nigel 77 case OP_UPTO:
2604 ph10 602 case OP_UPTOI:
2605 nigel 77 case OP_MINUPTO:
2606 ph10 602 case OP_MINUPTOI:
2607 nigel 93 case OP_POSUPTO:
2608 ph10 602 case OP_POSUPTOI:
2609 ph10 836 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2610 nigel 77 break;
2611     #endif
2612 ph10 503
2613 ph10 510 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2614     string. */
2615    
2616     case OP_MARK:
2617     case OP_PRUNE_ARG:
2618     case OP_SKIP_ARG:
2619 ph10 550 case OP_THEN_ARG:
2620 ph10 716 code += code[1];
2621 ph10 550 break;
2622    
2623 ph10 503 /* None of the remaining opcodes are required to match a character. */
2624 ph10 507
2625 ph10 503 default:
2626 ph10 507 break;
2627 nigel 77 }
2628     }
2629    
2630     return TRUE;
2631     }
2632    
2633    
2634    
2635     /*************************************************
2636     * Scan compiled regex for non-emptiness *
2637     *************************************************/
2638    
2639     /* This function is called to check for left recursive calls. We want to check
2640     the current branch of the current pattern to see if it could match the empty
2641     string. If it could, we must look outwards for branches at other levels,
2642     stopping when we pass beyond the bracket which is the subject of the recursion.
2643 ph10 654 This function is called only during the real compile, not during the
2644 ph10 624 pre-compile.
2645 nigel 77
2646     Arguments:
2647     code points to start of the recursion
2648     endcode points to where to stop (current RECURSE item)
2649     bcptr points to the chain of current (unclosed) branch starts
2650 chpe 1055 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2651 ph10 507 cd pointers to tables etc
2652 nigel 77
2653     Returns: TRUE if what is matched could be empty
2654     */
2655    
2656     static BOOL
2657 ph10 836 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2658     branch_chain *bcptr, BOOL utf, compile_data *cd)
2659 nigel 77 {
2660 ph10 475 while (bcptr != NULL && bcptr->current_branch >= code)
2661 nigel 77 {
2662 ph10 836 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2663 ph10 475 return FALSE;
2664 nigel 77 bcptr = bcptr->outer;
2665     }
2666     return TRUE;
2667     }
2668    
2669    
2670    
2671     /*************************************************
2672     * Check for POSIX class syntax *
2673     *************************************************/
2674    
2675     /* This function is called when the sequence "[:" or "[." or "[=" is
2676 ph10 295 encountered in a character class. It checks whether this is followed by a
2677 ph10 298 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2678 ph10 295 reach an unescaped ']' without the special preceding character, return FALSE.
2679 nigel 77
2680 ph10 298 Originally, this function only recognized a sequence of letters between the
2681     terminators, but it seems that Perl recognizes any sequence of characters,
2682     though of course unknown POSIX names are subsequently rejected. Perl gives an
2683     "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2684     didn't consider this to be a POSIX class. Likewise for [:1234:].
2685 ph10 295
2686 ph10 298 The problem in trying to be exactly like Perl is in the handling of escapes. We
2687     have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2688     class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2689     below handles the special case of \], but does not try to do any other escape
2690     processing. This makes it different from Perl for cases such as [:l\ower:]
2691 ph10 295 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2692 ph10 298 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2693 ph10 295 I think.
2694    
2695 ph10 640 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2696     It seems that the appearance of a nested POSIX class supersedes an apparent
2697     external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2698 ph10 691 a digit.
2699 ph10 640
2700 ph10 661 In Perl, unescaped square brackets may also appear as part of class names. For
2701     example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2702     [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2703 ph10 691 seem right at all. PCRE does not allow closing square brackets in POSIX class
2704 ph10 661 names.
2705    
2706 ph10 295 Arguments:
2707 nigel 77 ptr pointer to the initial [
2708     endptr where to return the end pointer
2709    
2710     Returns: TRUE or FALSE
2711     */
2712    
2713     static BOOL
2714 ph10 836 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2715 nigel 77 {
2716 chpe 1072 pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */
2717 nigel 77 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2718 ph10 1186 for (++ptr; *ptr != CHAR_NULL; ptr++)
2719 nigel 77 {
2720 ph10 654 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2721     ptr++;
2722 ph10 691 else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2723 ph10 640 else
2724 ph10 298 {
2725 ph10 391 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2726 ph10 295 {
2727     *endptr = ptr;
2728     return TRUE;
2729 ph10 298 }
2730 ph10 640 if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2731     (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2732     ptr[1] == CHAR_EQUALS_SIGN) &&
2733     check_posix_syntax(ptr, endptr))
2734 ph10 654 return FALSE;
2735 ph10 298 }
2736     }
2737 nigel 77 return FALSE;
2738     }
2739    
2740    
2741    
2742    
2743     /*************************************************
2744     * Check POSIX class name *
2745     *************************************************/
2746    
2747     /* This function is called to check the name given in a POSIX-style class entry
2748     such as [:alnum:].
2749    
2750     Arguments:
2751     ptr points to the first letter
2752     len the length of the name
2753    
2754     Returns: a value representing the name, or -1 if unknown
2755     */
2756    
2757     static int
2758 ph10 836 check_posix_name(const pcre_uchar *ptr, int len)
2759 nigel 77 {
2760 ph10 240 const char *pn = posix_names;
2761 nigel 77 register int yield = 0;
2762     while (posix_name_lengths[yield] != 0)
2763     {
2764     if (len == posix_name_lengths[yield] &&
2765 chpe 1074 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
2766 ph10 243 pn += posix_name_lengths[yield] + 1;
2767 nigel 77 yield++;
2768     }
2769     return -1;
2770     }
2771    
2772    
2773     /*************************************************
2774     * Adjust OP_RECURSE items in repeated group *
2775     *************************************************/
2776    
2777     /* OP_RECURSE items contain an offset from the start of the regex to the group
2778     that is referenced. This means that groups can be replicated for fixed
2779     repetition simply by copying (because the recursion is allowed to refer to
2780     earlier groups that are outside the current group). However, when a group is
2781 ph10 335 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2782     inserted before it, after it has been compiled. This means that any OP_RECURSE
2783     items within it that refer to the group itself or any contained groups have to
2784     have their offsets adjusted. That is one of the jobs of this function. Before it
2785     is called, the partially compiled regex must be temporarily terminated with
2786     OP_END.
2787 nigel 77
2788 nigel 93 This function has been extended with the possibility of forward references for
2789     recursions and subroutine calls. It must also check the list of such references
2790     for the group we are dealing with. If it finds that one of the recursions in
2791     the current group is on this list, it adjusts the offset in the list, not the
2792     value in the reference (which is a group number).
2793    
2794 nigel 77 Arguments:
2795     group points to the start of the group
2796     adjust the amount by which the group is to be moved
2797 chpe 1055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
2798 nigel 77 cd contains pointers to tables etc.
2799 nigel 93 save_hwm the hwm forward reference pointer at the start of the group
2800 nigel 77
2801     Returns: nothing
2802     */
2803    
2804     static void
2805 ph10 836 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2806     pcre_uchar *save_hwm)
2807 nigel 77 {
2808 ph10 836 pcre_uchar *ptr = group;
2809 ph10 224
2810 ph10 836 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2811 nigel 77 {
2812 nigel 93 int offset;
2813 ph10 836 pcre_uchar *hc;
2814 nigel 93
2815     /* See if this recursion is on the forward reference list. If so, adjust the
2816     reference. */
2817 ph10 345
2818 nigel 93 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2819     {
2820 chpe 1073 offset = (int)GET(hc, 0);
2821 nigel 93 if (cd->start_code + offset == ptr + 1)
2822     {
2823     PUT(hc, 0, offset + adjust);
2824     break;
2825     }
2826     }
2827    
2828     /* Otherwise, adjust the recursion offset if it's after the start of this
2829     group. */
2830    
2831     if (hc >= cd->hwm)
2832     {
2833 chpe 1073 offset = (int)GET(ptr, 1);
2834 nigel 93 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2835     }
2836    
2837 nigel 77 ptr += 1 + LINK_SIZE;
2838     }
2839     }
2840    
2841    
2842    
2843     /*************************************************
2844     * Insert an automatic callout point *
2845     *************************************************/
2846    
2847     /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2848     callout points before each pattern item.
2849    
2850     Arguments:
2851     code current code pointer
2852     ptr current pattern pointer
2853     cd pointers to tables etc
2854    
2855     Returns: new code pointer
2856     */
2857    
2858 ph10 836 static pcre_uchar *
2859     auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2860 nigel 77 {
2861     *code++ = OP_CALLOUT;
2862     *code++ = 255;
2863 ph10 530 PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */
2864     PUT(code, LINK_SIZE, 0); /* Default length */
2865 ph10 836 return code + 2 * LINK_SIZE;
2866 nigel 77 }
2867    
2868    
2869    
2870     /*************************************************
2871     * Complete a callout item *
2872     *************************************************/
2873    
2874     /* A callout item contains the length of the next item in the pattern, which
2875     we can't fill in till after we have reached the relevant point. This is used
2876     for both automatic and manual callouts.
2877    
2878     Arguments:
2879     previous_callout points to previous callout item
2880     ptr current pattern pointer
2881     cd pointers to tables etc
2882    
2883     Returns: nothing
2884     */
2885    
2886     static void
2887 ph10 836 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2888 nigel 77 {
2889 ph10 530 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2890 nigel 77 PUT(previous_callout, 2 + LINK_SIZE, length);
2891     }
2892    
2893    
2894    
2895     #ifdef SUPPORT_UCP
2896     /*************************************************
2897     * Get othercase range *
2898     *************************************************/
2899    
2900     /* This function is passed the start and end of a class range, in UTF-8 mode
2901 ph10 1045 with UCP support. It searches up the characters, looking for ranges of
2902 nigel 77 characters in the "other" case. Each call returns the next one, updating the
2903 ph10 1221 start address. A character with multiple other cases is returned on its own
2904 ph10 1045 with a special return value.
2905 nigel 77
2906     Arguments:
2907     cptr points to starting character value; updated
2908     d end value
2909     ocptr where to put start of othercase range
2910     odptr where to put end of othercase range
2911    
2912 ph10 1045 Yield: -1 when no more
2913     0 when a range is returned
2914     >0 the CASESET offset for char with multiple other cases
2915 ph10 1221 in this case, ocptr contains the original
2916 nigel 77 */
2917    
2918 ph10 1045 static int
2919 chpe 1064 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
2920     pcre_uint32 *odptr)
2921 nigel 77 {
2922 chpe 1064 pcre_uint32 c, othercase, next;
2923 chpe 1131 unsigned int co;
2924 nigel 77
2925 ph10 1221 /* Find the first character that has an other case. If it has multiple other
2926 ph10 1045 cases, return its case offset value. */
2927    
2928 nigel 77 for (c = *cptr; c <= d; c++)
2929 ph10 1221 {
2930 ph10 1045 if ((co = UCD_CASESET(c)) != 0)
2931     {
2932     *ocptr = c++; /* Character that has the set */
2933     *cptr = c; /* Rest of input range */
2934 chpe 1131 return (int)co;
2935 ph10 1221 }
2936     if ((othercase = UCD_OTHERCASE(c)) != c) break;
2937 ph10 1045 }
2938 nigel 77
2939 ph10 1045 if (c > d) return -1; /* Reached end of range */
2940 nigel 77
2941     *ocptr = othercase;
2942     next = othercase + 1;
2943    
2944     for (++c; c <= d; c++)
2945     {
2946 ph10 349 if (UCD_OTHERCASE(c) != next) break;
2947 nigel 77 next++;
2948     }
2949    
2950 ph10 1045 *odptr = next - 1; /* End of othercase range */
2951     *cptr = c; /* Rest of input range */
2952     return 0;
2953 nigel 77 }
2954 ph10 532
2955    
2956    
2957     /*************************************************
2958     * Check a character and a property *
2959     *************************************************/
2960    
2961     /* This function is called by check_auto_possessive() when a property item
2962     is adjacent to a fixed character.
2963    
2964     Arguments:
2965     c the character
2966     ptype the property type
2967     pdata the data for the type
2968     negated TRUE if it's a negated property (\P or \p{^)
2969 ph10 535
2970 ph10 532 Returns: TRUE if auto-possessifying is OK
2971 ph10 535 */
2972 ph10 532
2973     static BOOL
2974 chpe 1130 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, BOOL negated)
2975 ph10 532 {
2976 ph10 1046 #ifdef SUPPORT_UCP
2977     const pcre_uint32 *p;
2978     #endif
2979    
2980 ph10 532 const ucd_record *prop = GET_UCD(c);
2981 ph10 1046
2982 ph10 532 switch(ptype)
2983     {
2984     case PT_LAMP:
2985     return (prop->chartype == ucp_Lu ||
2986     prop->chartype == ucp_Ll ||
2987     prop->chartype == ucp_Lt) == negated;
2988    
2989     case PT_GC:
2990 ph10 836 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2991 ph10 532
2992     case PT_PC:
2993     return (pdata == prop->chartype) == negated;
2994    
2995     case PT_SC:
2996     return (pdata == prop->script) == negated;
2997    
2998     /* These are specials */
2999    
3000     case PT_ALNUM:
3001 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
3002     PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
3003 ph10 532
3004     case PT_SPACE: /* Perl space */
3005 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
3006 ph10 532 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
3007     == negated;
3008    
3009     case PT_PXSPACE: /* POSIX space */
3010 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
3011 ph10 532 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
3012     c == CHAR_FF || c == CHAR_CR)
3013     == negated;
3014    
3015     case PT_WORD:
3016 ph10 836 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
3017     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
3018 ph10 532 c == CHAR_UNDERSCORE) == negated;
3019 ph10 1221
3020 zherczeg 1047 #ifdef SUPPORT_UCP
3021 ph10 1046 case PT_CLIST:
3022     p = PRIV(ucd_caseless_sets) + prop->caseset;
3023     for (;;)
3024     {
3025 chpe 1205 if (c < *p) return !negated;
3026     if (c == *p++) return negated;
3027 zherczeg 1047 }
3028 ph10 1046 break; /* Control never reaches here */
3029 zherczeg 1047 #endif
3030 ph10 532 }
3031 zherczeg 1047
3032 ph10 535 return FALSE;
3033 ph10 532 }
3034 nigel 77 #endif /* SUPPORT_UCP */
3035    
3036    
3037 nigel 93
3038 nigel 77 /*************************************************
3039 nigel 93 * Check if auto-possessifying is possible *
3040     *************************************************/
3041    
3042     /* This function is called for unlimited repeats of certain items, to see
3043     whether the next thing could possibly match the repeated item. If not, it makes
3044     sense to automatically possessify the repeated item.
3045    
3046     Arguments:
3047 ph10 532 previous pointer to the repeated opcode
3048 chpe 1055 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
3049 nigel 93 ptr next character in pattern
3050     options options bits
3051     cd contains pointers to tables etc.
3052    
3053     Returns: TRUE if possessifying is wanted
3054     */
3055    
3056     static BOOL
3057 ph10 836 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
3058     const pcre_uchar *ptr, int options, compile_data *cd)
3059 nigel 93 {
3060 chpe 1063 pcre_uint32 c = NOTACHAR;
3061     pcre_uint32 next;
3062 chpe 1059 int escape;
3063 chpe 1076 pcre_uchar op_code = *previous++;
3064 nigel 93
3065     /* Skip whitespace and comments in extended mode */
3066    
3067     if ((options & PCRE_EXTENDED) != 0)
3068     {
3069     for (;;)
3070     {
3071 ph10 836 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3072 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
3073 nigel 93 {
3074 ph10 579 ptr++;
3075 ph10 1186 while (*ptr != CHAR_NULL)
3076 ph10 556 {
3077 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3078 ph10 556 ptr++;
3079 ph10 836 #ifdef SUPPORT_UTF
3080     if (utf) FORWARDCHAR(ptr);
3081 ph10 556 #endif
3082     }
3083 nigel 93 }
3084     else break;
3085     }
3086     }
3087    
3088     /* If the next item is one that we can handle, get its value. A non-negative
3089     value is a character, a negative value is an escape value. */
3090    
3091 ph10 391 if (*ptr == CHAR_BACKSLASH)
3092 nigel 93 {
3093     int temperrorcode = 0;
3094 ph10 1313 escape = check_escape(&ptr, &next, &temperrorcode, cd->bracount, options,
3095 ph10 1253 FALSE);
3096 nigel 93 if (temperrorcode != 0) return FALSE;
3097     ptr++; /* Point after the escape sequence */
3098     }
3099 ph10 836 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3100 nigel 93 {
3101 chpe 1059 escape = 0;
3102 ph10 836 #ifdef SUPPORT_UTF
3103     if (utf) { GETCHARINC(next, ptr); } else
3104 nigel 93 #endif
3105     next = *ptr++;
3106     }
3107     else return FALSE;
3108    
3109     /* Skip whitespace and comments in extended mode */
3110    
3111     if ((options & PCRE_EXTENDED) != 0)
3112     {
3113     for (;;)
3114     {
3115 ph10 836 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3116 ph10 391 if (*ptr == CHAR_NUMBER_SIGN)
3117 nigel 93 {
3118 ph10 579 ptr++;
3119 ph10 1186 while (*ptr != CHAR_NULL)
3120 ph10 556 {
3121 nigel 93 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3122 ph10 556 ptr++;
3123 ph10 836 #ifdef SUPPORT_UTF
3124     if (utf) FORWARDCHAR(ptr);
3125 ph10 556 #endif
3126     }
3127 nigel 93 }
3128     else break;
3129     }
3130     }
3131    
3132     /* If the next thing is itself optional, we have to give up. */
3133    
3134 ph10 392 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3135 ph10 836 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3136 ph10 391 return FALSE;
3137 zherczeg 1047
3138 ph10 1046 /* If the previous item is a character, get its value. */
3139 nigel 93
3140 ph10 1221 if (op_code == OP_CHAR || op_code == OP_CHARI ||
3141 ph10 1046 op_code == OP_NOT || op_code == OP_NOTI)
3142 zherczeg 1047 {
3143 ph10 836 #ifdef SUPPORT_UTF
3144 ph10 532 GETCHARTEST(c, previous);
3145 ph10 369 #else
3146 ph10 532 c = *previous;
3147 ph10 535 #endif
3148 ph10 1046 }
3149 nigel 93
3150 ph10 1046 /* Now compare the next item with the previous opcode. First, handle cases when
3151 ph10 1048 the next item is a character. */
3152 nigel 93
3153 chpe 1059 if (escape == 0)
3154 ph10 1046 {
3155 ph10 1048 /* For a caseless UTF match, the next character may have more than one other
3156     case, which maps to the special PT_CLIST property. Check this first. */
3157 ph10 1221
3158 ph10 1046 #ifdef SUPPORT_UCP
3159 chpe 1063 if (utf && c != NOTACHAR && (options & PCRE_CASELESS) != 0)
3160 ph10 1046 {
3161 chpe 1131 unsigned int ocs = UCD_CASESET(next);
3162 ph10 1048 if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT);
3163 zherczeg 1047 }
3164 ph10 535 #endif
3165 ph10 1046
3166     switch(op_code)
3167     {
3168     case OP_CHAR:
3169     return c != next;
3170 zherczeg 1047
3171 ph10 1046 /* For CHARI (caseless character) we must check the other case. If we have
3172     Unicode property support, we can use it to test the other case of
3173 ph10 1221 high-valued characters. We know that next can have only one other case,
3174 ph10 1046 because multi-other-case characters are dealt with above. */
3175 zherczeg 1047
3176 ph10 1046 case OP_CHARI:
3177     if (c == next) return FALSE;
3178 ph10 836 #ifdef SUPPORT_UTF
3179 ph10 1046 if (utf)
3180     {
3181 chpe 1063 pcre_uint32 othercase;
3182 ph10 1046 if (next < 128) othercase = cd->fcc[next]; else
3183 nigel 93 #ifdef SUPPORT_UCP
3184 chpe 1063 othercase = UCD_OTHERCASE(next);
3185 nigel 93 #else
3186 ph10 1046 othercase = NOTACHAR;
3187 nigel 93 #endif
3188 chpe 1063 return c != othercase;
3189 ph10 1046 }
3190     else
3191 ph10 836 #endif /* SUPPORT_UTF */
3192 chpe 1063 return (c != TABLE_GET(next, cd->fcc, next)); /* Not UTF */
3193 ph10 1221
3194 ph10 1046 case OP_NOT:
3195     return c == next;
3196 ph10 1221
3197 ph10 1046 case OP_NOTI:
3198     if (c == next) return TRUE;
3199 zherczeg 924 #ifdef SUPPORT_UTF
3200 ph10 1046 if (utf)
3201     {
3202 chpe 1063 pcre_uint32 othercase;
3203 ph10 1046 if (next < 128) othercase = cd->fcc[next]; else
3204     #ifdef SUPPORT_UCP
3205 chpe 1063 othercase = UCD_OTHERCASE(next);
3206 zherczeg 924 #else
3207 ph10 1046 othercase = NOTACHAR;
3208 zherczeg 924 #endif
3209 chpe 1063 return c == othercase;
3210 ph10 1046 }
3211     else
3212     #endif /* SUPPORT_UTF */
3213 chpe 1063 return (c == TABLE_GET(next, cd->fcc, next)); /* Not UTF */
3214 zherczeg 1047
3215 ph10 1046 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3216     When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3217 zherczeg 1047
3218 ph10 1046 case OP_DIGIT:
3219     return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3220 zherczeg 1047
3221 ph10 1046 case OP_NOT_DIGIT:
3222     return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3223 zherczeg 1047
3224 ph10 1046 case OP_WHITESPACE:
3225     return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3226 zherczeg 1047
3227 ph10 1046 case OP_NOT_WHITESPACE:
3228     return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3229 zherczeg 1047
3230 ph10 1046 case OP_WORDCHAR:
3231     return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3232 zherczeg 1047
3233 ph10 1046 case OP_NOT_WORDCHAR:
3234     return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3235 zherczeg 1047
3236 ph10 1046 case OP_HSPACE:
3237     case OP_NOT_HSPACE:
3238     switch(next)
3239     {
3240 ph10 1221 HSPACE_CASES:
3241 ph10 1046 return op_code == OP_NOT_HSPACE;
3242 zherczeg 1047
3243 ph10 1046 default:
3244     return op_code != OP_NOT_HSPACE;
3245     }
3246 zherczeg 1047
3247 ph10 1046 case OP_ANYNL:
3248     case OP_VSPACE:
3249     case OP_NOT_VSPACE:
3250     switch(next)
3251     {
3252 ph10 1221 VSPACE_CASES:
3253 ph10 1046 return op_code == OP_NOT_VSPACE;
3254 zherczeg 1047
3255 ph10 1046 default:
3256     return op_code != OP_NOT_VSPACE;
3257     }
3258 zherczeg 1047
3259 nigel 93 #ifdef SUPPORT_UCP
3260 ph10 1046 case OP_PROP:
3261 chpe 1130 return check_char_prop(next, previous[0], previous[1], FALSE);
3262 zherczeg 1047
3263 ph10 1046 case OP_NOTPROP:
3264 chpe 1130 return check_char_prop(next, previous[0], previous[1], TRUE);
3265 nigel 93 #endif
3266 zherczeg 1047
3267 ph10 180 default:
3268 ph10 1046 return FALSE;
3269 ph10 180 }
3270 zherczeg 1047 }
3271 ph10 180
3272 ph10 535 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3273     is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3274     generated only when PCRE_UCP is *not* set, that is, when only ASCII
3275     characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3276 ph10 532 replaced by OP_PROP codes when PCRE_UCP is set. */
3277 nigel 93
3278     switch(op_code)
3279     {
3280     case OP_CHAR:
3281 ph10 602 case OP_CHARI:
3282 chpe 1059 switch(escape)
3283 nigel 93 {
3284     case ESC_d:
3285 ph10 962 return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3286 nigel 93
3287     case ESC_D:
3288 ph10 962 return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3289 nigel 93
3290     case ESC_s:
3291 ph10 962 return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3292 nigel 93
3293     case ESC_S:
3294 ph10 962 return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3295 nigel 93
3296     case ESC_w:
3297 ph10 962 return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3298 nigel 93
3299     case ESC_W:
3300 ph10 962 return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3301 ph10 182
3302 ph10 180 case ESC_h:
3303     case ESC_H:
3304 ph10 532 switch(c)
3305 ph10 180 {
3306 ph10 1221 HSPACE_CASES:
3307 chpe 1059 return escape != ESC_h;
3308 ph10 1221
3309 ph10 180 default:
3310 chpe 1059 return escape == ESC_h;
3311 ph10 182 }
3312    
3313 ph10 180 case ESC_v:
3314     case ESC_V:
3315 ph10 532 switch(c)
3316 ph10 180 {
3317 ph10 1221 VSPACE_CASES:
3318 chpe 1059 return escape != ESC_v;
3319 zherczeg 1047
3320 ph10 180 default:
3321 chpe 1059 return escape == ESC_v;
3322 ph10 182 }
3323 ph10 535
3324     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3325     their substitutions and process them. The result will always be either
3326 chpe 1059 ESC_p or ESC_P. Then fall through to process those values. */
3327 ph10 535
3328 ph10 532 #ifdef SUPPORT_UCP
3329     case ESC_du:
3330     case ESC_DU:
3331     case ESC_wu:
3332     case ESC_WU:
3333     case ESC_su:
3334     case ESC_SU:
3335     {
3336     int temperrorcode = 0;
3337 chpe 1059 ptr = substitutes[escape - ESC_DU];
3338     escape = check_escape(&ptr, &next, &temperrorcode, 0, options, FALSE);
3339 ph10 532 if (temperrorcode != 0) return FALSE;
3340     ptr++; /* For compatibility */
3341     }
3342 ph10 535 /* Fall through */
3343 nigel 93
3344 ph10 532 case ESC_p:
3345     case ESC_P:
3346     {
3347 chpe 1129 unsigned int ptype = 0, pdata = 0;
3348     int errorcodeptr;
3349 ph10 535 BOOL negated;
3350    
3351 ph10 532 ptr--; /* Make ptr point at the p or P */
3352 chpe 1129 if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcodeptr))
3353     return FALSE;
3354 ph10 532 ptr++; /* Point past the final curly ket */
3355 ph10 535
3356 ph10 532 /* If the property item is optional, we have to give up. (When generated
3357     from \d etc by PCRE_UCP, this test will have been applied much earlier,
3358     to the original \d etc. At this point, ptr will point to a zero byte. */
3359 ph10 535
3360 ph10 532 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3361 ph10 836 STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3362 ph10 532 return FALSE;
3363 ph10 535
3364 ph10 532 /* Do the property check. */
3365 ph10 535
3366 chpe 1059 return check_char_prop(c, ptype, pdata, (escape == ESC_P) != negated);
3367 ph10 535 }
3368 ph10 532 #endif
3369    
3370 nigel 93 default:
3371     return FALSE;
3372     }
3373    
3374 ph10 535 /* In principle, support for Unicode properties should be integrated here as
3375     well. It means re-organizing the above code so as to get hold of the property
3376     values before switching on the op-code. However, I wonder how many patterns
3377     combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3378     these op-codes are never generated.) */
3379    
3380 nigel 93 case OP_DIGIT:
3381 chpe 1059 return escape == ESC_D || escape == ESC_s || escape == ESC_W ||
3382     escape == ESC_h || escape == ESC_v || escape == ESC_R;
3383 nigel 93
3384     case OP_NOT_DIGIT:
3385 chpe 1059 return escape == ESC_d;
3386 nigel 93
3387     case OP_WHITESPACE:
3388 chpe 1059 return escape == ESC_S || escape == ESC_d || escape == ESC_w;
3389 nigel 93
3390     case OP_NOT_WHITESPACE:
3391 chpe 1059 return escape == ESC_s || escape == ESC_h || escape == ESC_v || escape == ESC_R;
3392 nigel 93
3393 ph10 180 case OP_HSPACE:
3394 chpe 1059 return escape == ESC_S || escape == ESC_H || escape == ESC_d ||
3395     escape == ESC_w || escape == ESC_v || escape == ESC_R;
3396 ph10 180
3397     case OP_NOT_HSPACE:
3398 chpe 1059 return escape == ESC_h;
3399 ph10 182
3400 ph10 180 /* Can't have \S in here because VT matches \S (Perl anomaly) */
3401 ph10 535 case OP_ANYNL:
3402 ph10 182 case OP_VSPACE:
3403 chpe 1059 return escape == ESC_V || escape == ESC_d || escape == ESC_w;
3404 ph10 180
3405     case OP_NOT_VSPACE:
3406 chpe 1059 return escape == ESC_v || escape == ESC_R;
3407 ph10 180
3408 nigel 93 case OP_WORDCHAR:
3409 chpe 1059 return escape == ESC_W || escape == ESC_s || escape == ESC_h ||
3410     escape == ESC_v || escape == ESC_R;
3411 nigel 93
3412     case OP_NOT_WORDCHAR:
3413 chpe 1059 return escape == ESC_w || escape == ESC_d;
3414 ph10 182
3415 nigel 93 default:
3416     return FALSE;
3417     }
3418    
3419     /* Control does not reach here */
3420     }
3421    
3422    
3423    
3424     /*************************************************
3425 ph10 1045 * Add a character or range to a class *
3426     *************************************************/
3427    
3428     /* This function packages up the logic of adding a character or range of
3429 ph10 1221 characters to a class. The character values in the arguments will be within the
3430     valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
3431 ph10 1045 mutually recursive with the function immediately below.
3432    
3433     Arguments:
3434     classbits the bit map for characters < 256
3435     uchardptr points to the pointer for extra data
3436     options the options word
3437 ph10 1221 cd contains pointers to tables etc.
3438 ph10 1045 start start of range character
3439     end end of range character
3440 ph10 1221
3441 ph10 1045 Returns: the number of < 256 characters added
3442     the pointer to extra data is updated
3443     */
3444    
3445     static int
3446     add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3447 chpe 1056 compile_data *cd, pcre_uint32 start, pcre_uint32 end)
3448 ph10 1045 {
3449 chpe 1056 pcre_uint32 c;
3450 ph10 1045 int n8 = 0;
3451    
3452 ph10 1221 /* If caseless matching is required, scan the range and process alternate
3453     cases. In Unicode, there are 8-bit characters that have alternate cases that
3454     are greater than 255 and vice-versa. Sometimes we can just extend the original
3455 ph10 1045 range. */
3456    
3457     if ((options & PCRE_CASELESS) != 0)
3458     {
3459     #ifdef SUPPORT_UCP
3460     if ((options & PCRE_UTF8) != 0)
3461 ph10 1221 {
3462     int rc;
3463 chpe 1056 pcre_uint32 oc, od;
3464 ph10 1221
3465 ph10 1045 options &= ~PCRE_CASELESS; /* Remove for recursive calls */
3466     c = start;
3467 ph10 1221
3468 ph10 1045 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
3469     {
3470     /* Handle a single character that has more than one other case. */
3471 ph10 1221
3472 ph10 1045 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
3473     PRIV(ucd_caseless_sets) + rc, oc);
3474 ph10 1221
3475 ph10 1045 /* Do nothing if the other case range is within the original range. */
3476 ph10 1221
3477 ph10 1045 else if (oc >= start && od <= end) continue;
3478 ph10 1221
3479 ph10 1045 /* Extend the original range if there is overlap, noting that if oc < c, we
3480     can't have od > end because a subrange is always shorter than the basic
3481     range. Otherwise, use a recursive call to add the additional range. */
3482 ph10 1221
3483 ph10 1045 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
3484     else if (od > end && oc <= end + 1) end = od; /* Extend upwards */
3485     else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
3486     }
3487     }
3488     else
3489     #endif /* SUPPORT_UCP */
3490    
3491     /* Not UTF-mode, or no UCP */
3492 ph10 1221
3493     for (c = start; c <= end && c < 256; c++)
3494     {
3495 ph10 1045 SETBIT(classbits, cd->fcc[c]);
3496 ph10 1221 n8++;
3497     }
3498     }
3499    
3500 ph10 1045 /* Now handle the original range. Adjust the final value according to the bit
3501     length - this means that the same lists of (e.g.) horizontal spaces can be used
3502     in all cases. */
3503    
3504 chpe 1055 #if defined COMPILE_PCRE8
3505 ph10 1045 #ifdef SUPPORT_UTF
3506     if ((options & PCRE_UTF8) == 0)
3507     #endif
3508     if (end > 0xff) end = 0xff;
3509    
3510 chpe 1055 #elif defined COMPILE_PCRE16
3511 ph10 1045 #ifdef SUPPORT_UTF
3512     if ((options & PCRE_UTF16) == 0)
3513     #endif
3514     if (end > 0xffff) end = 0xffff;
3515 chpe 1055
3516 chpe 1056 #endif /* COMPILE_PCRE[8|16] */
3517 ph10 1045
3518     /* If all characters are less than 256, use the bit map. Otherwise use extra
3519     data. */
3520    
3521     if (end < 0x100)
3522     {
3523     for (c = start; c <= end; c++)
3524     {
3525 ph10 1221 n8++;
3526 ph10 1045 SETBIT(classbits, c);
3527 ph10 1221 }
3528 ph10 1045 }
3529 ph10 1221
3530 ph10 1045 else
3531 ph10 1221 {
3532 ph10 1045 pcre_uchar *uchardata = *uchardptr;
3533 ph10 1221
3534 ph10 1045 #ifdef SUPPORT_UTF
3535     if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */
3536     {
3537     if (start < end)
3538     {
3539     *uchardata++ = XCL_RANGE;
3540 ph10 1221 uchardata += PRIV(ord2utf)(start, uchardata);
3541     uchardata += PRIV(ord2utf)(end, uchardata);
3542 ph10 1045 }
3543     else if (start == end)
3544     {
3545     *uchardata++ = XCL_SINGLE;
3546 ph10 1221 uchardata += PRIV(ord2utf)(start, uchardata);
3547 ph10 1045 }
3548     }
3549     else
3550 ph10 1221 #endif /* SUPPORT_UTF */
3551    
3552 ph10 1045 /* Without UTF support, character values are constrained by the bit length,
3553     and can only be > 256 for 16-bit and 32-bit libraries. */
3554 ph10 1221
3555 ph10 1045 #ifdef COMPILE_PCRE8
3556     {}
3557 ph10 1221 #else
3558 ph10 1045 if (start < end)
3559     {
3560     *uchardata++ = XCL_RANGE;
3561     *uchardata++ = start;
3562     *uchardata++ = end;
3563     }
3564     else if (start == end)
3565     {
3566     *uchardata++ = XCL_SINGLE;
3567     *uchardata++ = start;
3568 ph10 1221 }
3569 ph10 1045 #endif
3570    
3571     *uchardptr = uchardata; /* Updata extra data pointer */
3572 ph10 1221 }
3573 ph10 1045
3574     return n8; /* Number of 8-bit characters */
3575 ph10 1221 }
3576 ph10 1045
3577    
3578 ph10 1221
3579    
3580 ph10 1045 /*************************************************
3581     * Add a list of characters to a class *
3582     *************************************************/
3583    
3584 ph10 1221 /* This function is used for adding a list of case-equivalent characters to a
3585 ph10 1045 class, and also for adding a list of horizontal or vertical whitespace. If the
3586     list is in order (which it should be), ranges of characters are detected and
3587     handled appropriately. This function is mutually recursive with the function
3588     above.
3589    
3590     Arguments:
3591     classbits the bit map for characters < 256
3592     uchardptr points to the pointer for extra data
3593     options the options word
3594 ph10 1221 cd contains pointers to tables etc.
3595     p points to row of 32-bit values, terminated by NOTACHAR
3596 ph10 1045 except character to omit; this is used when adding lists of
3597     case-equivalent characters to avoid including the one we
3598 ph10 1221 already know about
3599    
3600 ph10 1045 Returns: the number of < 256 characters added
3601     the pointer to extra data is updated
3602     */
3603    
3604     static int
3605     add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
3606     compile_data *cd, const pcre_uint32 *p, unsigned int except)
3607     {
3608     int n8 = 0;
3609     while (p[0] < NOTACHAR)
3610     {
3611     int n = 0;
3612     if (p[0] != except)
3613 ph10 1221 {
3614 ph10 1045 while(p[n+1] == p[0] + n + 1) n++;
3615     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
3616 ph10 1221 }
3617     p += n + 1;
3618     }
3619 ph10 1045 return n8;
3620 ph10 1221 }
3621 ph10 1045
3622    
3623    
3624     /*************************************************
3625     * Add characters not in a list to a class *
3626     *************************************************/
3627    
3628     /* This function is used for adding the complement of a list of horizontal or
3629     vertical whitespace to a class. The list must be in order.
3630    
3631     Arguments:
3632     classbits the bit map for characters < 256
3633     uchardptr points to the pointer for extra data
3634     options the options word
3635 ph10 1221 cd contains pointers to tables etc.
3636     p points to row of 32-bit values, terminated by NOTACHAR
3637    
3638 ph10 1045 Returns: the number of < 256 characters added
3639     the pointer to extra data is updated
3640     */
3641    
3642     static int
3643 ph10 1221 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
3644 ph10 1045 int options, compile_data *cd, const pcre_uint32 *p)
3645     {
3646 chpe 1056 BOOL utf = (options & PCRE_UTF8) != 0;
3647 ph10 1045 int n8 = 0;
3648     if (p[0] > 0)
3649     n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
3650     while (p[0] < NOTACHAR)
3651     {
3652     while (p[1] == p[0] + 1) p++;
3653 chpe 1056 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
3654     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3655 ph10 1221 p++;
3656     }
3657 ph10 1045 return n8;
3658 ph10 1221 }
3659 ph10 1045
3660    
3661    
3662     /*************************************************
3663 nigel 77 * Compile one branch *
3664     *************************************************/
3665    
3666 nigel 93 /* Scan the pattern, compiling it into a vector. If the options are
3667 nigel 77 changed during the branch, the pointer is used to change the external options
3668 nigel 93 bits. This function is used during the pre-compile phase when we are trying
3669     to find out the amount of memory needed, as well as during the real compile
3670     phase. The value of lengthptr distinguishes the two phases.
3671 nigel 77
3672     Arguments:
3673     optionsptr pointer to the option bits
3674     codeptr points to the pointer to the current code point
3675     ptrptr points to the current pattern pointer
3676     errorcodeptr points to error code variable
3677 chpe 1078 firstcharptr place to put the first required character
3678     firstcharflagsptr place to put the first character flags, or a negative number
3679     reqcharptr place to put the last required character
3680     reqcharflagsptr place to put the last required character flags, or a negative number
3681 nigel 77 bcptr points to current branch chain
3682 ph10 654 cond_depth conditional nesting depth
3683 nigel 77 cd contains pointers to tables etc.
3684 nigel 93 lengthptr NULL during the real compile phase
3685     points to length accumulator during pre-compile phase
3686 nigel 77
3687     Returns: TRUE on success
3688     FALSE, with *errorcodeptr set non-zero on error
3689     */
3690    
3691     static BOOL
3692 ph10 836 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3693 chpe 1078 const pcre_uchar **ptrptr, int *errorcodeptr,
3694     pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
3695     pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
3696     branch_chain *bcptr, int cond_depth,
3697 ph10 836 compile_data *cd, int *lengthptr)
3698 nigel 77 {
3699     int repeat_type, op_type;
3700     int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
3701     int bravalue = 0;
3702     int greedy_default, greedy_non_default;
3703 chpe 1078 pcre_uint32 firstchar, reqchar;
3704     pcre_int32 firstcharflags, reqcharflags;
3705     pcre_uint32 zeroreqchar, zerofirstchar;
3706     pcre_int32 zeroreqcharflags, zerofirstcharflags;
3707 ph10 836 pcre_int32 req_caseopt, reqvary, tempreqvary;
3708 ph10 635 int options = *optionsptr; /* May change dynamically */
3709 nigel 77 int after_manual_callout = 0;
3710 nigel 93 int length_prevgroup = 0;
3711 chpe 1064 register pcre_uint32 c;
3712 chpe 1059 int escape;
3713 ph10 836 register pcre_uchar *code = *codeptr;
3714     pcre_uchar *last_code = code;
3715     pcre_uchar *orig_code = code;
3716     pcre_uchar *tempcode;
3717 nigel 77 BOOL inescq = FALSE;
3718 ph10 836 BOOL groupsetfirstchar = FALSE;
3719     const pcre_uchar *ptr = *ptrptr;
3720     const pcre_uchar *tempptr;
3721     const pcre_uchar *nestptr = NULL;
3722     pcre_uchar *previous = NULL;
3723     pcre_uchar *previous_callout = NULL;
3724     pcre_uchar *save_hwm = NULL;
3725     pcre_uint8 classbits[32];
3726 nigel 77
3727 ph10 635 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3728 ph10 654 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3729 ph10 635 dynamically as we process the pattern. */
3730    
3731 ph10 836 #ifdef SUPPORT_UTF
3732 chpe 1059 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
3733 ph10 836 BOOL utf = (options & PCRE_UTF8) != 0;
3734 chpe 1120 #ifndef COMPILE_PCRE32
3735 ph10 836 pcre_uchar utf_chars[6];
3736 chpe 1120 #endif
3737 nigel 77 #else
3738 ph10 836 BOOL utf = FALSE;
3739 nigel 77 #endif
3740    
3741 ph10 1046 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3742 ph10 1221 class_uchardata always so that it can be passed to add_to_class() always,
3743     though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3744 ph10 1046 alternative calls for the different cases. */
3745 ph10 836
3746 ph10 1046 pcre_uchar *class_uchardata;
3747 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3748     BOOL xclass;
3749     pcre_uchar *class_uchardata_base;
3750     #endif
3751    
3752 ph10 475 #ifdef PCRE_DEBUG
3753 nigel 93 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3754     #endif
3755    
3756 nigel 77 /* Set up the default and non-default settings for greediness */
3757    
3758     greedy_default = ((options & PCRE_UNGREEDY) != 0);
3759     greedy_non_default = greedy_default ^ 1;
3760    
3761     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3762     matching encountered yet". It gets changed to REQ_NONE if we hit something that
3763 ph10 836 matches a non-fixed char first char; reqchar just remains unset if we never
3764 nigel 77 find one.
3765    
3766     When we hit a repeat whose minimum is zero, we may have to adjust these values
3767     to take the zero repeat into account. This is implemented by setting them to
3768 ph10 836 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3769 nigel 77 item types that can be repeated set these backoff variables appropriately. */
3770    
3771 chpe 1078 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
3772     firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
3773 nigel 77
3774 ph10 836 /* The variable req_caseopt contains either the REQ_CASELESS value
3775     or zero, according to the current setting of the caseless flag. The
3776     REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3777     firstchar or reqchar variables to record the case status of the
3778     value. This is used only for ASCII characters. */
3779 nigel 77
3780 ph10 836 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3781 nigel 77
3782     /* Switch on next character until the end of the branch */
3783    
3784