/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 618 - (hide annotations) (download)
Sat Jul 16 17:24:16 2011 UTC (22 months, 1 week ago) by ph10
File MIME type: text/plain
File size: 195257 byte(s)
Re-do atomic group processing to fix backtrack capture bugs. Recursion is also 
re-worked.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141     the length passed is zero. Note that in caseless UTF-8 mode, the number of
142     subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
189     bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193     the latter. It is important, therefore, to check the length along the
194     reference, not along the subject (earlier code did this wrong). */
195    
196     USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213     if (eptr + length > md->end_subject) return -1;
214     while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216     }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 597 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226     }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 618 RM61, RM62, RM63, RM64, RM65, RM66 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
/* Heap version of RMATCH. It obtains a new heapframe with pcre_stack_malloc()
and gives up with PCRE_ERROR_NOMEMORY (via RRETURN) if that fails. The resume
number rw is stored in the *current* frame's Xwhere before the switch, and the
label L_<rw> is defined at the end of the macro as the resume point; the jump
back to it is made by code outside this macro. The new frame receives ra
(eptr), rb (ecode), rc (offset_top) and re (eptrb), together with the current
mstart and markptr values; its depth is one more than the current frame's, and
it is linked back to the current frame through Xprevframe. Control then goes
to HEAP_RECURSE with "frame" pointing at the new frame. The rd argument is not
used. */

316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
/* Heap version of RRETURN. It pops the current frame: "frame" is reset to the
previous frame and the old one is released with pcre_stack_free(). If a
previous frame exists, the result ra is passed back in rrc and control goes to
HEAP_RETURN; otherwise the frame just freed was the top-level one and ra is
returned from the function itself. */

336     #define RRETURN(ra)\
337     {\
338 ph10 527 heapframe *oldframe = frame;\
339     frame = oldframe->Xprevframe;\
340     (pcre_stack_free)(oldframe);\
341 nigel 77 if (frame != NULL)\
342     {\
343 ph10 164 rrc = ra;\
344     goto HEAP_RETURN;\
345 nigel 77 }\
346     return ra;\
347     }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
/* One heapframe holds the state of one simulated call of match() when
NO_RECURSE is defined. Frames are chained through Xprevframe. Inside match()
each field is reached through a macro with the same name minus the leading X
(for example, eptr is defined as frame->Xeptr). */

352     typedef struct heapframe {
/* Link to the calling frame; set to NULL in the top-level frame. */
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 168 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387     int Xprop_category;
388     int Xprop_chartype;
389 nigel 87 int Xprop_script;
390 ph10 123 int Xoclength;
391     uschar Xocchars[8];
392 nigel 77 #endif
393    
394 ph10 403 int Xcodelink;
395 nigel 77 int Xctype;
396 nigel 93 unsigned int Xfc;
397 nigel 77 int Xfi;
398     int Xlength;
399     int Xmax;
400     int Xmin;
401     int Xnumber;
402     int Xoffset;
403     int Xop;
404     int Xsave_capture_last;
405     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Fixed-size save area of REC_STACK_SAVE_MAX ints held inside the frame. */
406     int Xstacksave[REC_STACK_SAVE_MAX];
407    
408     eptrblock Xnewptrb;
409    
410 ph10 164 /* Where to jump back to: the RMn number stored by RMATCH just before it
switches to a new frame. NOTE(review): presumably read at HEAP_RETURN to
select the matching L_RMn label - confirm against the rest of the file. */
411 nigel 77
412 ph10 164 int Xwhere;
413 ph10 165
414 nigel 77 } heapframe;
415    
416     #endif
417    
418    
419     /***************************************************************************
420     ***************************************************************************/
421    
422    
423    
424     /*************************************************
425     * Match from current position *
426     *************************************************/
427    
428 nigel 93 /* This function is called recursively in many circumstances. Whenever it
429 nigel 77 returns a negative (error) response, the outer incarnation must also return the
430 ph10 426 same response. */
431 nigel 77
432 ph10 426 /* These macros pack up tests that are used for partial matching, and which
433     appear several times in the code. We set the "hit end" flag if the pointer is
434     at the end of the subject and also past the start of the subject (i.e.
435 ph10 427 something has been matched). For hard partial matching, we then return
436     immediately. The second one is used when we already know we are past the end of
437     the subject. */
438 ph10 426
439     #define CHECK_PARTIAL()\
440 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
441     eptr > md->start_used_ptr) \
442     { \
443     md->hitend = TRUE; \
444     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
445 ph10 427 }
446 ph10 426
447     #define SCHECK_PARTIAL()\
448 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
449     { \
450     md->hitend = TRUE; \
451     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
452 ph10 427 }
453 ph10 426
454 ph10 427
455 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
456     the md structure (e.g. utf8, end_subject) into individual variables to improve
457 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
458     made performance worse.
459    
460     Arguments:
461 nigel 93 eptr pointer to current character in subject
462     ecode pointer to current position in compiled code
463 ph10 168 mstart pointer to the current match start position (can be modified
464 ph10 172 by encountering \K)
465 ph10 501 markptr pointer to the most recent MARK name, or NULL
466 nigel 77 offset_top current top pointer
467     md pointer to "static" info for the match
468     eptrb pointer to chain of blocks containing eptr at start of
469     brackets - for testing for empty matches
470 nigel 87 rdepth the recursion depth
471 nigel 77
472     Returns: MATCH_MATCH if matched ) these values are >= 0
473     MATCH_NOMATCH if failed to match )
474 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
475 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
476 nigel 87 (e.g. stopped by repeated call or recursion limit)
477 nigel 77 */
478    
479     static int
480 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
481 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
482 ph10 604 unsigned int rdepth)
483 nigel 77 {
484     /* These variables do not need to be preserved over recursion in this function,
485 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
486     "register" because they are used a lot in loops. */
487 nigel 77
488 nigel 91 register int rrc; /* Returns from recursive calls */
489     register int i; /* Used for loops not involving calls to RMATCH() */
490 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
491 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
492 nigel 77
493 nigel 93 BOOL minimize, possessive; /* Quantifier options */
494 ph10 602 BOOL caseless;
495 ph10 403 int condcode;
496 nigel 93
497 nigel 77 /* When recursion is not being used, all "local" variables that have to be
498     preserved over calls to RMATCH() are part of a "frame" which is obtained from
499     heap storage. Set up the top-level frame here; others are obtained from the
500     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
501    
502     #ifdef NO_RECURSE
503 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
504 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
505 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
506    
507     /* Copy in the original argument variables */
508    
509     frame->Xeptr = eptr;
510     frame->Xecode = ecode;
511 ph10 168 frame->Xmstart = mstart;
512 ph10 501 frame->Xmarkptr = markptr;
513 nigel 77 frame->Xoffset_top = offset_top;
514     frame->Xeptrb = eptrb;
515 nigel 87 frame->Xrdepth = rdepth;
516 nigel 77
517     /* This is where control jumps back to to effect "recursion" */
518    
519     HEAP_RECURSE:
520    
521     /* Macros make the argument variables come from the current frame */
522    
523     #define eptr frame->Xeptr
524     #define ecode frame->Xecode
525 ph10 168 #define mstart frame->Xmstart
526 ph10 501 #define markptr frame->Xmarkptr
527 nigel 77 #define offset_top frame->Xoffset_top
528     #define eptrb frame->Xeptrb
529 nigel 87 #define rdepth frame->Xrdepth
530 nigel 77
531     /* Ditto for the local variables */
532    
533     #ifdef SUPPORT_UTF8
534     #define charptr frame->Xcharptr
535     #endif
536     #define callpat frame->Xcallpat
537 ph10 403 #define codelink frame->Xcodelink
538 nigel 77 #define data frame->Xdata
539     #define next frame->Xnext
540     #define pp frame->Xpp
541     #define prev frame->Xprev
542     #define saved_eptr frame->Xsaved_eptr
543    
544     #define new_recursive frame->Xnew_recursive
545    
546     #define cur_is_word frame->Xcur_is_word
547     #define condition frame->Xcondition
548     #define prev_is_word frame->Xprev_is_word
549    
550     #ifdef SUPPORT_UCP
551     #define prop_type frame->Xprop_type
552 nigel 87 #define prop_value frame->Xprop_value
553 nigel 77 #define prop_fail_result frame->Xprop_fail_result
554     #define prop_category frame->Xprop_category
555     #define prop_chartype frame->Xprop_chartype
556 nigel 87 #define prop_script frame->Xprop_script
557 ph10 115 #define oclength frame->Xoclength
558     #define occhars frame->Xocchars
559 nigel 77 #endif
560    
561     #define ctype frame->Xctype
562     #define fc frame->Xfc
563     #define fi frame->Xfi
564     #define length frame->Xlength
565     #define max frame->Xmax
566     #define min frame->Xmin
567     #define number frame->Xnumber
568     #define offset frame->Xoffset
569     #define op frame->Xop
570     #define save_capture_last frame->Xsave_capture_last
571     #define save_offset1 frame->Xsave_offset1
572     #define save_offset2 frame->Xsave_offset2
573     #define save_offset3 frame->Xsave_offset3
574     #define stacksave frame->Xstacksave
575    
576     #define newptrb frame->Xnewptrb
577    
578     /* When recursion is being used, local variables are allocated on the stack and
579     get preserved during recursion in the normal way. In this environment, fi and
580     i, and fc and c, can be the same variables. */
581    
582 nigel 93 #else /* NO_RECURSE not defined */
583 nigel 77 #define fi i
584     #define fc c
585    
586 ph10 604 /* Many of the following variables are used only in small blocks of the code.
587     My normal style of coding would have declared them within each of those blocks.
588     However, in order to accommodate the version of this code that uses an external
589     "stack" implemented on the heap, it is easier to declare them all here, so the
590     declarations can be cut out in a block. The only declarations within blocks
591     below are for variables that do not have to be preserved over a recursive call
592     to RMATCH(). */
593 nigel 77
594 ph10 604 #ifdef SUPPORT_UTF8
595     const uschar *charptr;
596     #endif
597     const uschar *callpat;
598     const uschar *data;
599     const uschar *next;
600     USPTR pp;
601     const uschar *prev;
602     USPTR saved_eptr;
603    
604     recursion_info new_recursive;
605    
606     BOOL cur_is_word;
607 nigel 87 BOOL condition;
608 nigel 77 BOOL prev_is_word;
609    
610     #ifdef SUPPORT_UCP
611     int prop_type;
612 nigel 87 int prop_value;
613 nigel 77 int prop_fail_result;
614     int prop_category;
615     int prop_chartype;
616 nigel 87 int prop_script;
617 ph10 115 int oclength;
618     uschar occhars[8];
619 nigel 77 #endif
620    
621 ph10 399 int codelink;
622 nigel 77 int ctype;
623     int length;
624     int max;
625     int min;
626     int number;
627     int offset;
628     int op;
629     int save_capture_last;
630     int save_offset1, save_offset2, save_offset3;
631     int stacksave[REC_STACK_SAVE_MAX];
632    
633     eptrblock newptrb;
634 nigel 93 #endif /* NO_RECURSE */
635 nigel 77
636 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
637     of the local variables that are used only in localised parts of the code, but
638     still need to be preserved over recursive calls of match(). These macros define
639     the alternative names that are used. */
640    
641     #define allow_zero cur_is_word
642     #define cbegroup condition
643     #define code_offset codelink
644     #define condassert condition
645     #define matched_once prev_is_word
646    
647 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
648     variables. */
649    
650     #ifdef SUPPORT_UCP
651 nigel 87 prop_value = 0;
652 nigel 77 prop_fail_result = 0;
653     #endif
654    
655 nigel 93
656 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
657     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
658     used. Thanks to Ian Taylor for noticing this possibility and sending the
659     original patch. */
660    
661     TAIL_RECURSE:
662    
663 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
664     are specified by the macro RMATCH and RRETURN is used to return. When
665     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
666 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
667 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
668     complicated macro. It has to be used in one particular way. This shouldn't,
669     however, impact performance when true recursion is being used. */
670 nigel 77
671 ph10 164 #ifdef SUPPORT_UTF8
672     utf8 = md->utf8; /* Local copy of the flag */
673     #else
674     utf8 = FALSE;
675     #endif
676    
677 nigel 87 /* First check that we haven't called match() too many times, or that we
678     haven't exceeded the recursive call limit. */
679    
680 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
681 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
682 nigel 77
683 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
684 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
685     done this way to save having to use another function argument, which would take
686     up space on the stack. See also MATCH_CONDASSERT below.
687 nigel 77
688 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
689     such remembered pointers, to be checked when we hit the closing ket, in order
690     to break infinite loops that match no characters. When match() is called in
691     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
692     NOT be used with tail recursion, because the memory block that is used is on
693     the stack, so a new one may be required for each match(). */
694    
695     if (md->match_function_type == MATCH_CBEGROUP)
696 nigel 77 {
697 ph10 197 newptrb.epb_saved_eptr = eptr;
698     newptrb.epb_prev = eptrb;
699     eptrb = &newptrb;
700 ph10 604 md->match_function_type = 0;
701 nigel 77 }
702    
703 nigel 93 /* Now start processing the opcodes. */
704 nigel 77
705     for (;;)
706     {
707 nigel 93 minimize = possessive = FALSE;
708 nigel 77 op = *ecode;
709 ph10 604
710 nigel 93 switch(op)
711     {
712 ph10 510 case OP_MARK:
713     markptr = ecode + 2;
714     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
715 ph10 604 eptrb, RM55);
716 ph10 512
717     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
718     argument, and we must check whether that argument matches this MARK's
719     argument. It is passed back in md->start_match_ptr (an overloading of that
720     variable). If it does match, we reset that variable to the current subject
721     position and return MATCH_SKIP. Otherwise, pass back the return code
722 ph10 510 unaltered. */
723 ph10 512
724     if (rrc == MATCH_SKIP_ARG &&
725 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
726     {
727     md->start_match_ptr = eptr;
728     RRETURN(MATCH_SKIP);
729     }
730    
731 ph10 512 if (md->mark == NULL) md->mark = markptr;
732 ph10 510 RRETURN(rrc);
733    
734 ph10 210 case OP_FAIL:
735 ph10 510 MRRETURN(MATCH_NOMATCH);
736 ph10 211
737 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
738 ph10 553
739 ph10 510 case OP_COMMIT:
740     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
741 ph10 604 eptrb, RM52);
742 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
743 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
744     rrc != MATCH_THEN)
745 ph10 551 RRETURN(rrc);
746 ph10 510 MRRETURN(MATCH_COMMIT);
747    
748 ph10 551 /* PRUNE overrides THEN */
749 ph10 553
750 ph10 210 case OP_PRUNE:
751     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
752 ph10 604 eptrb, RM51);
753 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
754 ph10 510 MRRETURN(MATCH_PRUNE);
755 ph10 211
756 ph10 510 case OP_PRUNE_ARG:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
758 ph10 604 eptrb, RM56);
759 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
760 ph10 510 md->mark = ecode + 2;
761     RRETURN(MATCH_PRUNE);
762 ph10 211
763 ph10 551 /* SKIP overrides PRUNE and THEN */
764 ph10 553
765 ph10 210 case OP_SKIP:
766     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
767 ph10 604 eptrb, RM53);
768 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 ph10 551 RRETURN(rrc);
770 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
771 ph10 510 MRRETURN(MATCH_SKIP);
772 ph10 211
773 ph10 510 case OP_SKIP_ARG:
774     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
775 ph10 604 eptrb, RM57);
776 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
777 ph10 551 RRETURN(rrc);
778 ph10 512
779     /* Pass back the current skip name by overloading md->start_match_ptr and
780     returning the special MATCH_SKIP_ARG return code. This will either be
781     caught by a matching MARK, or get to the top, where it is treated the same
782 ph10 510 as PRUNE. */
783 ph10 512
784 ph10 510 md->start_match_ptr = ecode + 2;
785 ph10 512 RRETURN(MATCH_SKIP_ARG);
786 ph10 553
787 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
788 ph10 553 the alt that is at the start of the current branch. This makes it possible
789     to skip back past alternatives that precede the THEN within the current
790     branch. */
791 ph10 512
792 ph10 210 case OP_THEN:
793     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
794 ph10 604 eptrb, RM54);
795 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
797 ph10 510 MRRETURN(MATCH_THEN);
798    
799     case OP_THEN_ARG:
800 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
801 ph10 604 offset_top, md, eptrb, RM58);
802 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
804     md->mark = ecode + LINK_SIZE + 2;
805 ph10 212 RRETURN(MATCH_THEN);
806 ph10 211
807 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
808     unlimited repeat. If there is space in the offset vector, save the current
809     subject position in the working slot at the top of the vector. We mustn't
810     change the current values of the data slot, because they may be set from a
811     previous iteration of this group, and be referred to by a reference inside
812 ph10 617 the group. A failure to match might occur after the group has succeeded,
813     if something later on doesn't match. For this reason, we need to restore
814     the working value and also the values of the final offsets, in case they
815     were set by a previous iteration of the same bracket.
816 nigel 77
817 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
818     a non-capturing bracket. Don't worry about setting the flag for the error
819     case here; that is handled in the code for KET. */
820 nigel 77
821 nigel 93 case OP_CBRA:
822     case OP_SCBRA:
823     number = GET2(ecode, 1+LINK_SIZE);
824 nigel 77 offset = number << 1;
825 ph10 604
826 ph10 475 #ifdef PCRE_DEBUG
827 nigel 93 printf("start bracket %d\n", number);
828     printf("subject=");
829 nigel 77 pchars(eptr, 16, TRUE, md);
830     printf("\n");
831     #endif
832    
833     if (offset < md->offset_max)
834     {
835     save_offset1 = md->offset_vector[offset];
836     save_offset2 = md->offset_vector[offset+1];
837     save_offset3 = md->offset_vector[md->offset_end - number];
838     save_capture_last = md->capture_last;
839    
840     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
841 ph10 531 md->offset_vector[md->offset_end - number] =
842 ph10 530 (int)(eptr - md->start_subject);
843 nigel 77
844 ph10 604 for (;;)
845 nigel 77 {
846 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
847     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
848     eptrb, RM1);
849 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
850 ph10 550 if (rrc != MATCH_NOMATCH &&
851     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
852     RRETURN(rrc);
853 nigel 77 md->capture_last = save_capture_last;
854     ecode += GET(ecode, 1);
855 ph10 604 if (*ecode != OP_ALT) break;
856 nigel 77 }
857    
858     DPRINTF(("bracket %d failed\n", number));
859     md->offset_vector[offset] = save_offset1;
860     md->offset_vector[offset+1] = save_offset2;
861     md->offset_vector[md->offset_end - number] = save_offset3;
862 ph10 618
863     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
864     MATCH_THEN. */
865 nigel 77
866 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
867 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
868 nigel 77 }
869    
870 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
871     as a non-capturing bracket. */
872 nigel 77
873 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875    
876 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
877 nigel 77
878 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
879     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
880    
881 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
882     repeat. Loop for all the alternatives. When we get to the final alternative
883     within the brackets, we used to return the result of a recursive call to
884     match() whatever happened so it was possible to reduce stack usage by
885     turning this into a tail recursion, except in the case of a possibly empty
886     group. However, now that there is the possibility of (*THEN) occurring in
887     the final alternative, this optimization is no longer possible.
888    
889     MATCH_ONCE is returned when the end of an atomic group is successfully
890     reached, but subsequent matching fails. It passes back up the tree (causing
891     captured values to be reset) until the original atomic group level is
892     reached. This is tested by comparing md->once_target with the start of the
893     group. At this point, the return is converted into MATCH_NOMATCH so that
894     previous backup points can be taken. */
895 nigel 77
896 ph10 618 case OP_ONCE:
897 nigel 93 case OP_BRA:
898     case OP_SBRA:
899     DPRINTF(("start non-capturing bracket\n"));
900 ph10 618
901 nigel 91 for (;;)
902 nigel 77 {
903 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
904 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
905 ph10 604 RM2);
906 ph10 550 if (rrc != MATCH_NOMATCH &&
907     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
908 ph10 618 {
909     if (rrc == MATCH_ONCE)
910     {
911     const uschar *scode = ecode;
912     if (*scode != OP_ONCE) /* If not at start, find it */
913     {
914     while (*scode == OP_ALT) scode += GET(scode, 1);
915     scode -= GET(scode, 1);
916     }
917     if (md->once_target == scode) rrc = MATCH_NOMATCH;
918     }
919 ph10 550 RRETURN(rrc);
920 ph10 618 }
921 nigel 77 ecode += GET(ecode, 1);
922 ph10 609 if (*ecode != OP_ALT) break;
923 nigel 77 }
924 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
925     RRETURN(MATCH_NOMATCH);
926    
927 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
928     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
929     handled similarly to the normal case above. However, the matching is
930     different. The end of these brackets will always be OP_KETRPOS, which
931     returns MATCH_KETRPOS without going further in the pattern. By this means
932     we can handle the group by iteration rather than recursion, thereby
933     reducing the amount of stack needed. */
934    
935     case OP_CBRAPOS:
936     case OP_SCBRAPOS:
937     allow_zero = FALSE;
938    
939     POSSESSIVE_CAPTURE:
940     number = GET2(ecode, 1+LINK_SIZE);
941     offset = number << 1;
942    
943     #ifdef PCRE_DEBUG
944     printf("start possessive bracket %d\n", number);
945     printf("subject=");
946     pchars(eptr, 16, TRUE, md);
947     printf("\n");
948     #endif
949    
950     if (offset < md->offset_max)
951     {
952     matched_once = FALSE;
953     code_offset = ecode - md->start_code;
954    
955     save_offset1 = md->offset_vector[offset];
956     save_offset2 = md->offset_vector[offset+1];
957     save_offset3 = md->offset_vector[md->offset_end - number];
958     save_capture_last = md->capture_last;
959    
960     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
961    
962     /* Each time round the loop, save the current subject position for use
963     when the group matches. For MATCH_MATCH, the group has matched, so we
964     restart it with a new subject starting position, remembering that we had
965     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
966     usual. If we haven't matched any alternatives in any iteration, check to
967     see if a previous iteration matched. If so, the group has matched;
968     continue from afterwards. Otherwise it has failed; restore the previous
969     capture values before returning NOMATCH. */
970    
971     for (;;)
972     {
973     md->offset_vector[md->offset_end - number] =
974     (int)(eptr - md->start_subject);
975     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
976     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
977     eptrb, RM63);
978     if (rrc == MATCH_KETRPOS)
979     {
980     offset_top = md->end_offset_top;
981     eptr = md->end_match_ptr;
982     ecode = md->start_code + code_offset;
983     save_capture_last = md->capture_last;
984     matched_once = TRUE;
985     continue;
986     }
987     if (rrc != MATCH_NOMATCH &&
988     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
989     RRETURN(rrc);
990     md->capture_last = save_capture_last;
991     ecode += GET(ecode, 1);
992     if (*ecode != OP_ALT) break;
993     }
994 ph10 610
995 ph10 604 if (!matched_once)
996     {
997     md->offset_vector[offset] = save_offset1;
998     md->offset_vector[offset+1] = save_offset2;
999     md->offset_vector[md->offset_end - number] = save_offset3;
1000     }
1001    
1002 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1003 ph10 604 if (allow_zero || matched_once)
1004     {
1005     ecode += 1 + LINK_SIZE;
1006     break;
1007     }
1008    
1009     RRETURN(MATCH_NOMATCH);
1010     }
1011    
1012     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1013     as a non-capturing bracket. */
1014    
1015     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1016     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1017    
1018     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1019    
1020     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1021     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1022    
1023     /* Non-capturing possessive bracket with unlimited repeat. We come here
1024     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1025     without the capturing complication. It is written out separately for speed
1026     and cleanliness. */
1027    
1028     case OP_BRAPOS:
1029     case OP_SBRAPOS:
1030     allow_zero = FALSE;
1031    
1032     POSSESSIVE_NON_CAPTURE:
1033     matched_once = FALSE;
1034     code_offset = ecode - md->start_code;
1035    
1036     for (;;)
1037     {
1038     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1039     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1040 ph10 609 eptrb, RM48);
1041 ph10 604 if (rrc == MATCH_KETRPOS)
1042     {
1043 ph10 610 offset_top = md->end_offset_top;
1044 ph10 604 eptr = md->end_match_ptr;
1045     ecode = md->start_code + code_offset;
1046     matched_once = TRUE;
1047     continue;
1048     }
1049     if (rrc != MATCH_NOMATCH &&
1050     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1051     RRETURN(rrc);
1052     ecode += GET(ecode, 1);
1053     if (*ecode != OP_ALT) break;
1054     }
1055 ph10 610
1056 ph10 604 if (matched_once || allow_zero)
1057     {
1058     ecode += 1 + LINK_SIZE;
1059     break;
1060     }
1061     RRETURN(MATCH_NOMATCH);
1062    
1063     /* Control never reaches here. */
1064    
1065 nigel 77 /* Conditional group: compilation checked that there are no more than
1066     two branches. If the condition is false, skipping the first branch takes us
1067     past the end if there is only one branch, but that's OK because that is
1068 ph10 609 exactly what going to the ket would do. */
1069 nigel 77
1070     case OP_COND:
1071 nigel 93 case OP_SCOND:
1072 ph10 604 codelink = GET(ecode, 1);
1073 ph10 406
1074 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1075     inserted between OP_COND and an assertion condition. */
1076 ph10 392
1077 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1078     {
1079     if (pcre_callout != NULL)
1080     {
1081     pcre_callout_block cb;
1082     cb.version = 1; /* Version 1 of the callout block */
1083     cb.callout_number = ecode[LINK_SIZE+2];
1084     cb.offset_vector = md->offset_vector;
1085     cb.subject = (PCRE_SPTR)md->start_subject;
1086 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1087     cb.start_match = (int)(mstart - md->start_subject);
1088     cb.current_position = (int)(eptr - md->start_subject);
1089 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1090     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1091     cb.capture_top = offset_top/2;
1092     cb.capture_last = md->capture_last;
1093     cb.callout_data = md->callout_data;
1094 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1095 ph10 381 if (rrc < 0) RRETURN(rrc);
1096     }
1097     ecode += _pcre_OP_lengths[OP_CALLOUT];
1098     }
1099 ph10 392
1100 ph10 399 condcode = ecode[LINK_SIZE+1];
1101 ph10 406
1102 ph10 381 /* Now see what the actual condition is */
1103 ph10 392
1104 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1105 nigel 77 {
1106 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1107     {
1108 ph10 461 condition = FALSE;
1109     ecode += GET(ecode, 1);
1110     }
1111 ph10 459 else
1112 ph10 461 {
1113 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1114     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1115 ph10 461
1116 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1117     false, but the test was set up by name, scan the table to see if the
1118     name refers to any other numbers, and test them. The condition is true
1119     if any one is set. */
1120 ph10 461
1121 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1122     {
1123     uschar *slotA = md->name_table;
1124     for (i = 0; i < md->name_count; i++)
1125 ph10 461 {
1126     if (GET2(slotA, 0) == recno) break;
1127 ph10 459 slotA += md->name_entry_size;
1128     }
1129 ph10 461
1130 ph10 459 /* Found a name for the number - there can be only one; duplicate
1131     names for different numbers are allowed, but not vice versa. First
1132     scan down for duplicates. */
1133 ph10 461
1134 ph10 459 if (i < md->name_count)
1135 ph10 461 {
1136 ph10 459 uschar *slotB = slotA;
1137     while (slotB > md->name_table)
1138     {
1139     slotB -= md->name_entry_size;
1140     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1141     {
1142     condition = GET2(slotB, 0) == md->recursive->group_num;
1143 ph10 461 if (condition) break;
1144     }
1145 ph10 459 else break;
1146 ph10 461 }
1147    
1148 ph10 459 /* Scan up for duplicates */
1149 ph10 461
1150 ph10 459 if (!condition)
1151 ph10 461 {
1152 ph10 459 slotB = slotA;
1153     for (i++; i < md->name_count; i++)
1154     {
1155     slotB += md->name_entry_size;
1156     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1157     {
1158     condition = GET2(slotB, 0) == md->recursive->group_num;
1159     if (condition) break;
1160 ph10 461 }
1161 ph10 459 else break;
1162 ph10 461 }
1163     }
1164 ph10 459 }
1165 ph10 461 }
1166    
1167 ph10 459 /* Choose branch according to the condition */
1168 ph10 461
1169 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1170     }
1171 ph10 461 }
1172 nigel 93
1173 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1174 nigel 93 {
1175 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1176 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1177 ph10 461
1178 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1179 ph10 461 scan the table to see if the name refers to any other numbers, and test
1180     them. The condition is true if any one is set. This is tediously similar
1181     to the code above, but not close enough to try to amalgamate. */
1182    
1183 ph10 459 if (!condition && condcode == OP_NCREF)
1184     {
1185 ph10 461 int refno = offset >> 1;
1186 ph10 459 uschar *slotA = md->name_table;
1187 ph10 461
1188 ph10 459 for (i = 0; i < md->name_count; i++)
1189 ph10 461 {
1190     if (GET2(slotA, 0) == refno) break;
1191 ph10 459 slotA += md->name_entry_size;
1192     }
1193 ph10 461
1194     /* Found a name for the number - there can be only one; duplicate names
1195     for different numbers are allowed, but not vice versa. First scan down
1196 ph10 459 for duplicates. */
1197 ph10 461
1198 ph10 459 if (i < md->name_count)
1199 ph10 461 {
1200 ph10 459 uschar *slotB = slotA;
1201     while (slotB > md->name_table)
1202     {
1203     slotB -= md->name_entry_size;
1204     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1205     {
1206     offset = GET2(slotB, 0) << 1;
1207 ph10 461 condition = offset < offset_top &&
1208 ph10 459 md->offset_vector[offset] >= 0;
1209 ph10 461 if (condition) break;
1210     }
1211 ph10 459 else break;
1212 ph10 461 }
1213    
1214 ph10 459 /* Scan up for duplicates */
1215 ph10 461
1216 ph10 459 if (!condition)
1217 ph10 461 {
1218 ph10 459 slotB = slotA;
1219     for (i++; i < md->name_count; i++)
1220     {
1221     slotB += md->name_entry_size;
1222     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1223     {
1224     offset = GET2(slotB, 0) << 1;
1225 ph10 461 condition = offset < offset_top &&
1226 ph10 459 md->offset_vector[offset] >= 0;
1227 ph10 461 if (condition) break;
1228     }
1229 ph10 459 else break;
1230 ph10 461 }
1231     }
1232 ph10 459 }
1233 ph10 461 }
1234    
1235 ph10 459 /* Choose branch according to the condition */
1236    
1237 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1238 nigel 77 }
1239    
1240 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1241 nigel 93 {
1242     condition = FALSE;
1243     ecode += GET(ecode, 1);
1244     }
1245    
1246 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1247 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1248     an assertion. */
1249 nigel 77
1250     else
1251     {
1252 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1253     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1254 nigel 77 if (rrc == MATCH_MATCH)
1255     {
1256 nigel 93 condition = TRUE;
1257     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1258 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1259     }
1260 ph10 550 else if (rrc != MATCH_NOMATCH &&
1261     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1262 nigel 77 {
1263     RRETURN(rrc); /* Need braces because of following else */
1264     }
1265 nigel 93 else
1266     {
1267     condition = FALSE;
1268 ph10 399 ecode += codelink;
1269 nigel 93 }
1270     }
1271 nigel 91
1272 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1273 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1274     when there was unlimited repeat of a possibly empty group. However, that
1275     strategy no longer works because of the possibility of (*THEN) being
1276     encountered in the branch. A recursive call to match() is always required,
1277     unless the second alternative doesn't exist, in which case we can just
1278     plough on. */
1279 nigel 91
1280 nigel 93 if (condition || *ecode == OP_ALT)
1281     {
1282 ph10 609 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1283     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1284     if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1285     rrc = MATCH_NOMATCH;
1286     RRETURN(rrc);
1287 nigel 77 }
1288 ph10 395 else /* Condition false & no alternative */
1289 nigel 93 {
1290     ecode += 1 + LINK_SIZE;
1291     }
1292     break;
1293 nigel 77
1294 ph10 461
1295 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1296     to close any currently open capturing brackets. */
1297 ph10 461
1298 ph10 447 case OP_CLOSE:
1299 ph10 461 number = GET2(ecode, 1);
1300 ph10 447 offset = number << 1;
1301 ph10 461
1302 ph10 475 #ifdef PCRE_DEBUG
1303 ph10 447 printf("end bracket %d at *ACCEPT", number);
1304     printf("\n");
1305     #endif
1306 nigel 77
1307 ph10 447 md->capture_last = number;
1308     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1309     {
1310     md->offset_vector[offset] =
1311     md->offset_vector[md->offset_end - number];
1312 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1313 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1314     }
1315     ecode += 3;
1316 ph10 461 break;
1317 ph10 447
1318    
1319 ph10 608 /* End of the pattern, either real or forced. If we are in a recursion, we
1320     should restore the offsets appropriately, and if it's a top-level
1321     recursion, continue from after the call. */
1322 nigel 77
1323 ph10 210 case OP_ACCEPT:
1324 ph10 613 case OP_ASSERT_ACCEPT:
1325 nigel 77 case OP_END:
1326 ph10 618
1327     /*
1328 ph10 608 if (md->recursive != NULL)
1329 nigel 77 {
1330     recursion_info *rec = md->recursive;
1331 ph10 618
1332 nigel 77 md->recursive = rec->prevrec;
1333 ph10 618
1334 ph10 608 memmove(md->offset_vector, rec->offset_save,
1335 nigel 77 rec->saved_max * sizeof(int));
1336 ph10 461 offset_top = rec->save_offset_top;
1337 ph10 608 if (rec->group_num == 0)
1338     {
1339     ecode = rec->after_call;
1340     break;
1341     }
1342 nigel 77 }
1343 ph10 618 */
1344 ph10 613 /* Otherwise, if we have matched an empty string, fail if not in an
1345     assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1346     is set and we have matched at the start of the subject. In both cases,
1347     backtracking will then try other alternatives, if any. */
1348 ph10 443
1349 ph10 618 /* else */ if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1350    
1351     md->recursive == NULL &&
1352    
1353 ph10 442 (md->notempty ||
1354 ph10 443 (md->notempty_atstart &&
1355 ph10 442 mstart == md->start_subject + md->start_offset)))
1356 ph10 510 MRRETURN(MATCH_NOMATCH);
1357 ph10 443
1358 ph10 442 /* Otherwise, we have a match. */
1359 ph10 608
1360 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1361     md->end_offset_top = offset_top; /* and how many extracts were taken */
1362 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1363 nigel 77
1364 ph10 512 /* For some reason, the macros don't work properly if an expression is
1365     given as the argument to MRRETURN when the heap is in use. */
1366    
1367     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1368     MRRETURN(rrc);
1369    
1370 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1371     matching won't pass the KET for an assertion. If any one branch matches,
1372     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1373     start of each branch to move the current point backwards, so the code at
1374 ph10 604 this level is identical to the lookahead case. When the assertion is part
1375     of a condition, we want to return immediately afterwards. The caller of
1376     this incarnation of the match() function will have set MATCH_CONDASSERT in
1377     md->match_function_type, and one of these opcodes will be the first opcode
1378     that is processed. We use a local variable that is preserved over calls to
1379     match() to remember this case. */
1380 nigel 77
1381     case OP_ASSERT:
1382     case OP_ASSERTBACK:
1383 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1384     {
1385     condassert = TRUE;
1386     md->match_function_type = 0;
1387     }
1388     else condassert = FALSE;
1389    
1390 nigel 77 do
1391     {
1392 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1393 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1394 ph10 500 {
1395     mstart = md->start_match_ptr; /* In case \K reset it */
1396     break;
1397 ph10 501 }
1398 ph10 550 if (rrc != MATCH_NOMATCH &&
1399     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1400     RRETURN(rrc);
1401 nigel 77 ecode += GET(ecode, 1);
1402     }
1403     while (*ecode == OP_ALT);
1404 ph10 604
1405 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1406 nigel 77
1407     /* If checking an assertion for a condition, return MATCH_MATCH. */
1408    
1409 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1410 nigel 77
1411     /* Continue from after the assertion, updating the offsets high water
1412     mark, since extracts may have been taken during the assertion. */
1413    
1414     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1415     ecode += 1 + LINK_SIZE;
1416     offset_top = md->end_offset_top;
1417     continue;
1418    
1419 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1420 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1421 ph10 473 branches. */
1422 nigel 77
1423     case OP_ASSERT_NOT:
1424     case OP_ASSERTBACK_NOT:
1425 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1426     {
1427     condassert = TRUE;
1428     md->match_function_type = 0;
1429     }
1430     else condassert = FALSE;
1431    
1432 nigel 77 do
1433     {
1434 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1435 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1436 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1437     {
1438     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1439 ph10 482 break;
1440     }
1441 ph10 550 if (rrc != MATCH_NOMATCH &&
1442     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1443     RRETURN(rrc);
1444 nigel 77 ecode += GET(ecode,1);
1445     }
1446     while (*ecode == OP_ALT);
1447    
1448 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1449    
1450 nigel 77 ecode += 1 + LINK_SIZE;
1451     continue;
1452    
1453     /* Move the subject pointer back. This occurs only at the start of
1454     each branch of a lookbehind assertion. If we are too close to the start to
1455     move back, this match function fails. When working with UTF-8 we move
1456     back a number of characters, not bytes. */
1457    
1458     case OP_REVERSE:
1459     #ifdef SUPPORT_UTF8
1460     if (utf8)
1461     {
1462 nigel 93 i = GET(ecode, 1);
1463     while (i-- > 0)
1464 nigel 77 {
1465     eptr--;
1466 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1467 ph10 207 BACKCHAR(eptr);
1468 nigel 77 }
1469     }
1470     else
1471     #endif
1472    
1473     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1474    
1475     {
1476 nigel 93 eptr -= GET(ecode, 1);
1477 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1478 nigel 77 }
1479    
1480 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1481 nigel 77
1482 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1483 nigel 77 ecode += 1 + LINK_SIZE;
1484     break;
1485    
1486     /* The callout item calls an external function, if one is provided, passing
1487     details of the match so far. This is mainly for debugging, though the
1488     function is able to force a failure. */
1489    
1490     case OP_CALLOUT:
1491     if (pcre_callout != NULL)
1492     {
1493     pcre_callout_block cb;
1494     cb.version = 1; /* Version 1 of the callout block */
1495     cb.callout_number = ecode[1];
1496     cb.offset_vector = md->offset_vector;
1497 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1498 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1499     cb.start_match = (int)(mstart - md->start_subject);
1500     cb.current_position = (int)(eptr - md->start_subject);
1501 nigel 77 cb.pattern_position = GET(ecode, 2);
1502     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1503     cb.capture_top = offset_top/2;
1504     cb.capture_last = md->capture_last;
1505     cb.callout_data = md->callout_data;
1506 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1507 nigel 77 if (rrc < 0) RRETURN(rrc);
1508     }
1509     ecode += 2 + 2*LINK_SIZE;
1510     break;
1511    
1512     /* Recursion either matches the current regex, or some subexpression. The
1513     offset data is the offset to the starting bracket from the start of the
1514     whole pattern. (This is so that it works from duplicated subpatterns.)
1515 ph10 618
1516     The state of the capturing groups is preserved over recursion, and
1517     re-instated afterwards. We don't know how many are started and not yet
1518     finished (offset_top records the completed total) so we just have to save
1519     all the potential data. There may be up to 65535 such values, which is too
1520     large to put on the stack, but using malloc for small numbers seems
1521     expensive. As a compromise, the stack is used when there are no more than
1522     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1523 nigel 77
1524     There are also other values that have to be saved. We use a chained
1525     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1526 ph10 618 for the original version of this logic. It has, however, been hacked around
1527     a lot, so he is not to blame for the current way it works. */
1528 nigel 77
1529     case OP_RECURSE:
1530     {
1531     callpat = md->start_code + GET(ecode, 1);
1532 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1533     GET2(callpat, 1 + LINK_SIZE);
1534 nigel 77
1535     /* Add to "recursing stack" */
1536    
1537     new_recursive.prevrec = md->recursive;
1538     md->recursive = &new_recursive;
1539    
1540 ph10 618 /* Where to continue from afterwards */
1541 nigel 77
1542     ecode += 1 + LINK_SIZE;
1543    
1544 ph10 618 /* Now save the offset data */
1545 nigel 77
1546     new_recursive.saved_max = md->offset_end;
1547     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1548     new_recursive.offset_save = stacksave;
1549     else
1550     {
1551     new_recursive.offset_save =
1552     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1553     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1554     }
1555     memcpy(new_recursive.offset_save, md->offset_vector,
1556     new_recursive.saved_max * sizeof(int));
1557 ph10 608
1558 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1559     restore the offset data. If there were nested recursions, md->recursive
1560     might be changed, so reset it before looping. */
1561 nigel 77
1562     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1563 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1564 nigel 77 do
1565     {
1566 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1567 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1568 ph10 604 md, eptrb, RM6);
1569 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1570     new_recursive.saved_max * sizeof(int));
1571 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1572 nigel 77 {
1573 nigel 87 DPRINTF(("Recursion matched\n"));
1574 nigel 77 md->recursive = new_recursive.prevrec;
1575     if (new_recursive.offset_save != stacksave)
1576     (pcre_free)(new_recursive.offset_save);
1577 ph10 618
1578     /* Set where we got to in the subject, and reset the start in case
1579     it was changed by \K. This *is* propagated back out of a recursion,
1580     for Perl compatibility. */
1581    
1582     eptr = md->end_match_ptr;
1583     mstart = md->start_match_ptr;
1584     goto RECURSION_MATCHED; /* Exit loop; end processing */
1585 nigel 77 }
1586 ph10 550 else if (rrc != MATCH_NOMATCH &&
1587     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1588 nigel 87 {
1589     DPRINTF(("Recursion gave error %d\n", rrc));
1590 ph10 400 if (new_recursive.offset_save != stacksave)
1591     (pcre_free)(new_recursive.offset_save);
1592 nigel 87 RRETURN(rrc);
1593     }
1594 nigel 77
1595     md->recursive = &new_recursive;
1596     callpat += GET(callpat, 1);
1597     }
1598     while (*callpat == OP_ALT);
1599    
1600     DPRINTF(("Recursion didn't match\n"));
1601     md->recursive = new_recursive.prevrec;
1602     if (new_recursive.offset_save != stacksave)
1603     (pcre_free)(new_recursive.offset_save);
1604 ph10 510 MRRETURN(MATCH_NOMATCH);
1605 nigel 77 }
1606 ph10 618
1607     RECURSION_MATCHED:
1608     break;
1609 nigel 77
1610     /* An alternation is the end of a branch; scan along to find the end of the
1611     bracketed group and go to there. */
1612    
1613     case OP_ALT:
1614     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1615     break;
1616    
1617 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1618     indicating that it may occur zero times. It may repeat infinitely, or not
1619     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1620     with fixed upper repeat limits are compiled as a number of copies, with the
1621     optional ones preceded by BRAZERO or BRAMINZERO. */
1622 ph10 604
1623 nigel 77 case OP_BRAZERO:
1624 ph10 604 next = ecode + 1;
1625     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1626     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1627     do next += GET(next, 1); while (*next == OP_ALT);
1628     ecode = next + 1 + LINK_SIZE;
1629 nigel 77 break;
1630 ph10 604
1631 nigel 77 case OP_BRAMINZERO:
1632 ph10 604 next = ecode + 1;
1633     do next += GET(next, 1); while (*next == OP_ALT);
1634     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1635     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1636     ecode++;
1637 nigel 77 break;
1638    
1639 ph10 335 case OP_SKIPZERO:
1640 ph10 604 next = ecode+1;
1641     do next += GET(next,1); while (*next == OP_ALT);
1642     ecode = next + 1 + LINK_SIZE;
1643 ph10 335 break;
1644 ph10 604
1645     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1646     here; just jump to the group, with allow_zero set TRUE. */
1647    
1648     case OP_BRAPOSZERO:
1649     op = *(++ecode);
1650     allow_zero = TRUE;
1651     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1652     goto POSSESSIVE_NON_CAPTURE;
1653 ph10 335
1654 nigel 93 /* End of a group, repeated or non-repeating. */
1655 nigel 77
1656     case OP_KET:
1657     case OP_KETRMIN:
1658     case OP_KETRMAX:
1659 ph10 604 case OP_KETRPOS:
1660 nigel 91 prev = ecode - GET(ecode, 1);
1661 ph10 618
1662 nigel 93 /* If this was a group that remembered the subject start, in order to break
1663     infinite repeats of empty string matches, retrieve the subject start from
1664     the chain. Otherwise, set it NULL. */
1665 nigel 77
1666 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1667 nigel 93 {
1668     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1669     eptrb = eptrb->epb_prev; /* Backup to previous group */
1670     }
1671     else saved_eptr = NULL;
1672 nigel 77
1673 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1674     MATCH_MATCH, but record the current high water mark for use by positive
1675     assertions. We also need to record the match start in case it was changed
1676     by \K. */
1677 nigel 93
1678 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1679 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1680 nigel 91 {
1681     md->end_match_ptr = eptr; /* For ONCE */
1682     md->end_offset_top = offset_top;
1683 ph10 500 md->start_match_ptr = mstart;
1684 ph10 510 MRRETURN(MATCH_MATCH);
1685 nigel 91 }
1686 nigel 77
1687 nigel 93 /* For capturing groups we have to check the group number back at the start
1688     and if necessary complete handling an extraction by setting the offsets and
1689 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1690     into group 0, so it won't be picked up here. Instead, we catch it when the
1691     OP_END is reached. Other recursion is handled here. We just have to record
1692     the current subject position and start match pointer and give a MATCH
1693     return. */
1694 nigel 77
1695 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1696     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1697 nigel 91 {
1698 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1699 nigel 91 offset = number << 1;
1700 ph10 461
1701 ph10 475 #ifdef PCRE_DEBUG
1702 nigel 91 printf("end bracket %d", number);
1703     printf("\n");
1704 nigel 77 #endif
1705    
1706 ph10 618 /* Handle a recursively called group. */
1707    
1708     if (md->recursive != NULL && md->recursive->group_num == number)
1709     {
1710     md->end_match_ptr = eptr;
1711     md->start_match_ptr = mstart;
1712     RRETURN(MATCH_MATCH);
1713     }
1714    
1715     /* Deal with capturing */
1716    
1717 nigel 93 md->capture_last = number;
1718     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1719 nigel 91 {
1720 ph10 615 /* If offset is greater than offset_top, it means that we are
1721     "skipping" a capturing group, and that group's offsets must be marked
1722     unset. In earlier versions of PCRE, all the offsets were unset at the
1723     start of matching, but this doesn't work because atomic groups and
1724     assertions can cause a value to be set that should later be unset.
1725     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1726     part of the atomic group, but this is not on the final matching path,
1727     so must be unset when 2 is set. (If there is no group 2, there is no
1728     problem, because offset_top will then be 2, indicating no capture.) */
1729    
1730     if (offset > offset_top)
1731     {
1732     register int *iptr = md->offset_vector + offset_top;
1733     register int *iend = md->offset_vector + offset;
1734     while (iptr < iend) *iptr++ = -1;
1735     }
1736    
1737     /* Now make the extraction */
1738    
1739 nigel 93 md->offset_vector[offset] =
1740     md->offset_vector[md->offset_end - number];
1741 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1742 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1743     }
1744 nigel 91 }
1745 nigel 77
1746 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1747     also happens for a repeating ket if no characters were matched in the
1748     group. This is the forcible breaking of infinite loops as implemented in
1749     Perl 5.005. For a non-repeating atomic group, establish a backup point by
1750     processing the rest of the pattern at a lower level. If this results in a
1751     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1752     bypassing intermediate backup points, but resetting any captures that
1753     happened along the way. */
1754 nigel 77
1755 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1756     {
1757 ph10 618 if (*prev == OP_ONCE)
1758     {
1759     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1760     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1761     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1762     RRETURN(MATCH_ONCE);
1763     }
1764     ecode += 1 + LINK_SIZE; /* Carry on at this level */
1765 nigel 91 break;
1766     }
1767 ph10 604
1768     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1769     and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1770     at a time from the outer level, thus saving stack. */
1771    
1772     if (*ecode == OP_KETRPOS)
1773     {
1774     md->end_match_ptr = eptr;
1775     md->end_offset_top = offset_top;
1776     RRETURN(MATCH_KETRPOS);
1777     }
1778 nigel 77
1779 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1780     the preceding bracket, in the appropriate order. In the second case, we can
1781     use tail recursion to avoid using another stack frame, unless we have an
1782 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1783     string. */
1784 nigel 77
1785 nigel 91 if (*ecode == OP_KETRMIN)
1786     {
1787 ph10 618 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
1788 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1789 ph10 618 if (*prev == OP_ONCE)
1790     {
1791     RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
1792     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1794     RRETURN(MATCH_ONCE);
1795     }
1796 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1797 ph10 197 {
1798 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1799     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1800 ph10 197 RRETURN(rrc);
1801     }
1802 nigel 91 ecode = prev;
1803     goto TAIL_RECURSE;
1804 nigel 77 }
1805 nigel 91 else /* OP_KETRMAX */
1806     {
1807 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1808     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1809 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1810 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 ph10 618 if (*prev == OP_ONCE)
1812     {
1813     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
1814     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815     md->once_target = prev;
1816     RRETURN(MATCH_ONCE);
1817     }
1818 nigel 91 ecode += 1 + LINK_SIZE;
1819     goto TAIL_RECURSE;
1820     }
1821     /* Control never gets here */
1822 nigel 77
1823 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1824 nigel 77
1825     case OP_CIRC:
1826 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1827 ph10 602
1828 nigel 77 /* Start of subject assertion */
1829    
1830     case OP_SOD:
1831 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1832 nigel 77 ecode++;
1833     break;
1834 ph10 602
1835     /* Multiline mode: start of subject unless notbol, or after any newline. */
1836 nigel 77
1837 ph10 602 case OP_CIRCM:
1838     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1839     if (eptr != md->start_subject &&
1840     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1841     MRRETURN(MATCH_NOMATCH);
1842     ecode++;
1843     break;
1844    
1845 nigel 77 /* Start of match assertion */
1846    
1847     case OP_SOM:
1848 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1849 nigel 77 ecode++;
1850     break;
1851 ph10 172
1852 ph10 168 /* Reset the start of match point */
1853 ph10 172
1854 ph10 168 case OP_SET_SOM:
1855     mstart = eptr;
1856 ph10 172 ecode++;
1857     break;
1858 nigel 77
1859 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1860     unless noteol is set. */
1861 nigel 77
1862 ph10 602 case OP_DOLLM:
1863     if (eptr < md->end_subject)
1864     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1865     else
1866 nigel 77 {
1867 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1868 ph10 602 SCHECK_PARTIAL();
1869 nigel 77 }
1870 ph10 602 ecode++;
1871     break;
1872 ph10 579
1873 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1874     subject unless noteol is set. */
1875    
1876     case OP_DOLL:
1877     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1878     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1879    
1880 nigel 91 /* ... else fall through for endonly */
1881 nigel 77
1882     /* End of subject assertion (\z) */
1883    
1884     case OP_EOD:
1885 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1886 ph10 553 SCHECK_PARTIAL();
1887 nigel 77 ecode++;
1888     break;
1889    
1890     /* End of subject or ending \n assertion (\Z) */
1891    
1892     case OP_EODN:
1893 ph10 553 ASSERT_NL_OR_EOS:
1894     if (eptr < md->end_subject &&
1895 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1896 ph10 510 MRRETURN(MATCH_NOMATCH);
1897 ph10 579
1898 ph10 553 /* Either at end of string or \n before end. */
1899 ph10 579
1900 ph10 553 SCHECK_PARTIAL();
1901 nigel 77 ecode++;
1902     break;
1903    
1904     /* Word boundary assertions */
1905    
1906     case OP_NOT_WORD_BOUNDARY:
1907     case OP_WORD_BOUNDARY:
1908     {
1909    
1910     /* Find out if the previous and current characters are "word" characters.
1911     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1912 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1913 ph10 435 partial matching. */
1914 nigel 77
1915     #ifdef SUPPORT_UTF8
1916     if (utf8)
1917     {
1918 ph10 518 /* Get status of previous character */
1919 ph10 527
1920 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1921     {
1922 ph10 409 USPTR lastptr = eptr - 1;
1923 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1924 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1925 nigel 77 GETCHAR(c, lastptr);
1926 ph10 527 #ifdef SUPPORT_UCP
1927 ph10 518 if (md->use_ucp)
1928     {
1929     if (c == '_') prev_is_word = TRUE; else
1930 ph10 527 {
1931 ph10 518 int cat = UCD_CATEGORY(c);
1932     prev_is_word = (cat == ucp_L || cat == ucp_N);
1933 ph10 527 }
1934     }
1935     else
1936     #endif
1937 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1938     }
1939 ph10 527
1940 ph10 518 /* Get status of next character */
1941 ph10 527
1942 ph10 443 if (eptr >= md->end_subject)
1943 nigel 77 {
1944 ph10 443 SCHECK_PARTIAL();
1945     cur_is_word = FALSE;
1946 ph10 428 }
1947     else
1948     {
1949 nigel 77 GETCHAR(c, eptr);
1950 ph10 527 #ifdef SUPPORT_UCP
1951 ph10 518 if (md->use_ucp)
1952     {
1953     if (c == '_') cur_is_word = TRUE; else
1954 ph10 527 {
1955 ph10 518 int cat = UCD_CATEGORY(c);
1956     cur_is_word = (cat == ucp_L || cat == ucp_N);
1957 ph10 527 }
1958     }
1959     else
1960     #endif
1961 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1962     }
1963     }
1964     else
1965     #endif
1966    
1967 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1968 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1969 nigel 77
1970     {
1971 ph10 518 /* Get status of previous character */
1972 ph10 527
1973 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1974     {
1975 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1976 ph10 527 #ifdef SUPPORT_UCP
1977 ph10 518 if (md->use_ucp)
1978     {
1979 ph10 527 c = eptr[-1];
1980 ph10 518 if (c == '_') prev_is_word = TRUE; else
1981 ph10 527 {
1982 ph10 518 int cat = UCD_CATEGORY(c);
1983     prev_is_word = (cat == ucp_L || cat == ucp_N);
1984 ph10 527 }
1985     }
1986     else
1987     #endif
1988 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1989     }
1990 ph10 527
1991 ph10 518 /* Get status of next character */
1992 ph10 527
1993 ph10 443 if (eptr >= md->end_subject)
1994 ph10 428 {
1995 ph10 443 SCHECK_PARTIAL();
1996     cur_is_word = FALSE;
1997 ph10 428 }
1998 ph10 527 else
1999     #ifdef SUPPORT_UCP
2000 ph10 518 if (md->use_ucp)
2001     {
2002 ph10 527 c = *eptr;
2003 ph10 518 if (c == '_') cur_is_word = TRUE; else
2004 ph10 527 {
2005 ph10 518 int cat = UCD_CATEGORY(c);
2006     cur_is_word = (cat == ucp_L || cat == ucp_N);
2007 ph10 527 }
2008     }
2009     else
2010     #endif
2011 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2012 nigel 77 }
2013    
2014     /* Now see if the situation is what we want */
2015    
2016     if ((*ecode++ == OP_WORD_BOUNDARY)?
2017     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2018 ph10 510 MRRETURN(MATCH_NOMATCH);
2019 nigel 77 }
2020     break;
2021    
2022     /* Match a single character type; inline for speed */
2023    
2024     case OP_ANY:
2025 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2026 ph10 345 /* Fall through */
2027    
2028 ph10 341 case OP_ALLANY:
2029 ph10 443 if (eptr++ >= md->end_subject)
2030 ph10 428 {
2031 ph10 443 SCHECK_PARTIAL();
2032 ph10 510 MRRETURN(MATCH_NOMATCH);
2033 ph10 443 }
2034 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2035 nigel 77 ecode++;
2036     break;
2037    
2038     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2039     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2040    
2041     case OP_ANYBYTE:
2042 ph10 443 if (eptr++ >= md->end_subject)
2043 ph10 428 {
2044 ph10 443 SCHECK_PARTIAL();
2045 ph10 510 MRRETURN(MATCH_NOMATCH);
2046 ph10 443 }
2047 nigel 77 ecode++;
2048     break;
2049    
2050     case OP_NOT_DIGIT:
2051 ph10 443 if (eptr >= md->end_subject)
2052 ph10 428 {
2053 ph10 443 SCHECK_PARTIAL();
2054 ph10 510 MRRETURN(MATCH_NOMATCH);
2055 ph10 443 }
2056 nigel 77 GETCHARINCTEST(c, eptr);
2057     if (
2058     #ifdef SUPPORT_UTF8
2059     c < 256 &&
2060     #endif
2061     (md->ctypes[c] & ctype_digit) != 0
2062     )
2063 ph10 510 MRRETURN(MATCH_NOMATCH);
2064 nigel 77 ecode++;
2065     break;
2066    
2067     case OP_DIGIT:
2068 ph10 443 if (eptr >= md->end_subject)
2069 ph10 428 {
2070 ph10 443 SCHECK_PARTIAL();
2071 ph10 510 MRRETURN(MATCH_NOMATCH);
2072 ph10 443 }
2073 nigel 77 GETCHARINCTEST(c, eptr);
2074     if (
2075     #ifdef SUPPORT_UTF8
2076     c >= 256 ||
2077     #endif
2078     (md->ctypes[c] & ctype_digit) == 0
2079     )
2080 ph10 510 MRRETURN(MATCH_NOMATCH);
2081 nigel 77 ecode++;
2082     break;
2083    
2084     case OP_NOT_WHITESPACE:
2085 ph10 443 if (eptr >= md->end_subject)
2086 ph10 428 {
2087 ph10 443 SCHECK_PARTIAL();
2088 ph10 510 MRRETURN(MATCH_NOMATCH);
2089 ph10 443 }
2090 nigel 77 GETCHARINCTEST(c, eptr);
2091     if (
2092     #ifdef SUPPORT_UTF8
2093     c < 256 &&
2094     #endif
2095     (md->ctypes[c] & ctype_space) != 0
2096     )
2097 ph10 510 MRRETURN(MATCH_NOMATCH);
2098 nigel 77 ecode++;
2099     break;
2100    
2101     case OP_WHITESPACE:
2102 ph10 443 if (eptr >= md->end_subject)
2103 ph10 428 {
2104 ph10 443 SCHECK_PARTIAL();
2105 ph10 510 MRRETURN(MATCH_NOMATCH);
2106 ph10 443 }
2107 nigel 77 GETCHARINCTEST(c, eptr);
2108     if (
2109     #ifdef SUPPORT_UTF8
2110     c >= 256 ||
2111     #endif
2112     (md->ctypes[c] & ctype_space) == 0
2113     )
2114 ph10 510 MRRETURN(MATCH_NOMATCH);
2115 nigel 77 ecode++;
2116     break;
2117    
2118     case OP_NOT_WORDCHAR:
2119 ph10 443 if (eptr >= md->end_subject)
2120 ph10 428 {
2121 ph10 443 SCHECK_PARTIAL();
2122 ph10 510 MRRETURN(MATCH_NOMATCH);
2123 ph10 443 }
2124 nigel 77 GETCHARINCTEST(c, eptr);
2125     if (
2126     #ifdef SUPPORT_UTF8
2127     c < 256 &&
2128     #endif
2129     (md->ctypes[c] & ctype_word) != 0
2130     )
2131 ph10 510 MRRETURN(MATCH_NOMATCH);
2132 nigel 77 ecode++;
2133     break;
2134    
2135     case OP_WORDCHAR:
2136 ph10 443 if (eptr >= md->end_subject)
2137 ph10 428 {
2138 ph10 443 SCHECK_PARTIAL();
2139 ph10 510 MRRETURN(MATCH_NOMATCH);
2140 ph10 443 }
2141 nigel 77 GETCHARINCTEST(c, eptr);
2142     if (
2143     #ifdef SUPPORT_UTF8
2144     c >= 256 ||
2145     #endif
2146     (md->ctypes[c] & ctype_word) == 0
2147     )
2148 ph10 510 MRRETURN(MATCH_NOMATCH);
2149 nigel 77 ecode++;
2150     break;
2151    
2152 nigel 93 case OP_ANYNL:
2153 ph10 443 if (eptr >= md->end_subject)
2154 ph10 428 {
2155 ph10 443 SCHECK_PARTIAL();
2156 ph10 510 MRRETURN(MATCH_NOMATCH);
2157 ph10 443 }
2158 nigel 93 GETCHARINCTEST(c, eptr);
2159     switch(c)
2160     {
2161 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2162 ph10 600
2163 nigel 93 case 0x000d:
2164     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2165     break;
2166 ph10 231
2167 nigel 93 case 0x000a:
2168 ph10 231 break;
2169    
2170 nigel 93 case 0x000b:
2171     case 0x000c:
2172     case 0x0085:
2173     case 0x2028:
2174     case 0x2029:
2175 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2176 nigel 93 break;
2177     }
2178     ecode++;
2179     break;
2180    
2181 ph10 178 case OP_NOT_HSPACE:
2182 ph10 443 if (eptr >= md->end_subject)
2183 ph10 428 {
2184 ph10 443 SCHECK_PARTIAL();
2185 ph10 510 MRRETURN(MATCH_NOMATCH);
2186 ph10 443 }
2187 ph10 178 GETCHARINCTEST(c, eptr);
2188     switch(c)
2189     {
2190     default: break;
2191     case 0x09: /* HT */
2192     case 0x20: /* SPACE */
2193     case 0xa0: /* NBSP */
2194     case 0x1680: /* OGHAM SPACE MARK */
2195     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2196     case 0x2000: /* EN QUAD */
2197     case 0x2001: /* EM QUAD */
2198     case 0x2002: /* EN SPACE */
2199     case 0x2003: /* EM SPACE */
2200     case 0x2004: /* THREE-PER-EM SPACE */
2201     case 0x2005: /* FOUR-PER-EM SPACE */
2202     case 0x2006: /* SIX-PER-EM SPACE */
2203     case 0x2007: /* FIGURE SPACE */
2204     case 0x2008: /* PUNCTUATION SPACE */
2205     case 0x2009: /* THIN SPACE */
2206     case 0x200A: /* HAIR SPACE */
2207     case 0x202f: /* NARROW NO-BREAK SPACE */
2208     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2209     case 0x3000: /* IDEOGRAPHIC SPACE */
2210 ph10 510 MRRETURN(MATCH_NOMATCH);
2211 ph10 178 }
2212     ecode++;
2213     break;
2214    
2215     case OP_HSPACE:
2216 ph10 443 if (eptr >= md->end_subject)
2217 ph10 428 {
2218 ph10 443 SCHECK_PARTIAL();
2219 ph10 510 MRRETURN(MATCH_NOMATCH);
2220 ph10 443 }
2221 ph10 178 GETCHARINCTEST(c, eptr);
2222     switch(c)
2223     {
2224 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2225 ph10 178 case 0x09: /* HT */
2226     case 0x20: /* SPACE */
2227     case 0xa0: /* NBSP */
2228     case 0x1680: /* OGHAM SPACE MARK */
2229     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2230     case 0x2000: /* EN QUAD */
2231     case 0x2001: /* EM QUAD */
2232     case 0x2002: /* EN SPACE */
2233     case 0x2003: /* EM SPACE */
2234     case 0x2004: /* THREE-PER-EM SPACE */
2235     case 0x2005: /* FOUR-PER-EM SPACE */
2236     case 0x2006: /* SIX-PER-EM SPACE */
2237     case 0x2007: /* FIGURE SPACE */
2238     case 0x2008: /* PUNCTUATION SPACE */
2239     case 0x2009: /* THIN SPACE */
2240     case 0x200A: /* HAIR SPACE */
2241     case 0x202f: /* NARROW NO-BREAK SPACE */
2242     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2243     case 0x3000: /* IDEOGRAPHIC SPACE */
2244     break;
2245     }
2246     ecode++;
2247     break;
2248    
2249     case OP_NOT_VSPACE:
2250 ph10 443 if (eptr >= md->end_subject)
2251 ph10 428 {
2252 ph10 443 SCHECK_PARTIAL();
2253 ph10 510 MRRETURN(MATCH_NOMATCH);
2254 ph10 443 }
2255 ph10 178 GETCHARINCTEST(c, eptr);
2256     switch(c)
2257     {
2258     default: break;
2259     case 0x0a: /* LF */
2260     case 0x0b: /* VT */
2261     case 0x0c: /* FF */
2262     case 0x0d: /* CR */
2263     case 0x85: /* NEL */
2264     case 0x2028: /* LINE SEPARATOR */
2265     case 0x2029: /* PARAGRAPH SEPARATOR */
2266 ph10 510 MRRETURN(MATCH_NOMATCH);
2267 ph10 178 }
2268     ecode++;
2269     break;
2270    
2271     case OP_VSPACE:
2272 ph10 443 if (eptr >= md->end_subject)
2273 ph10 428 {
2274 ph10 443 SCHECK_PARTIAL();
2275 ph10 510 MRRETURN(MATCH_NOMATCH);
2276 ph10 443 }
2277 ph10 178 GETCHARINCTEST(c, eptr);
2278     switch(c)
2279     {
2280 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2281 ph10 178 case 0x0a: /* LF */
2282     case 0x0b: /* VT */
2283     case 0x0c: /* FF */
2284     case 0x0d: /* CR */
2285     case 0x85: /* NEL */
2286     case 0x2028: /* LINE SEPARATOR */
2287     case 0x2029: /* PARAGRAPH SEPARATOR */
2288     break;
2289     }
2290     ecode++;
2291     break;
2292    
2293 nigel 77 #ifdef SUPPORT_UCP
2294     /* Check the next character by Unicode property. We will get here only
2295     if the support is in the binary; otherwise a compile-time error occurs. */
2296    
2297     case OP_PROP:
2298     case OP_NOTPROP:
2299 ph10 443 if (eptr >= md->end_subject)
2300 ph10 428 {
2301 ph10 443 SCHECK_PARTIAL();
2302 ph10 510 MRRETURN(MATCH_NOMATCH);
2303 ph10 443 }
2304 nigel 77 GETCHARINCTEST(c, eptr);
2305     {
2306 ph10 384 const ucd_record *prop = GET_UCD(c);
2307 nigel 77
2308 nigel 87 switch(ecode[1])
2309     {
2310     case PT_ANY:
2311 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2312 nigel 87 break;
2313 nigel 77
2314 nigel 87 case PT_LAMP:
2315 ph10 349 if ((prop->chartype == ucp_Lu ||
2316     prop->chartype == ucp_Ll ||
2317     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2318 ph10 510 MRRETURN(MATCH_NOMATCH);
2319 ph10 517 break;
2320 nigel 87
2321     case PT_GC:
2322 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2323 ph10 510 MRRETURN(MATCH_NOMATCH);
2324 nigel 87 break;
2325    
2326     case PT_PC:
2327 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2328 ph10 510 MRRETURN(MATCH_NOMATCH);
2329 nigel 87 break;
2330    
2331     case PT_SC:
2332 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2333 ph10 510 MRRETURN(MATCH_NOMATCH);
2334 nigel 87 break;
2335 ph10 527
2336 ph10 517 /* These are specials */
2337 ph10 527
2338 ph10 517 case PT_ALNUM:
2339     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2340     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2341     MRRETURN(MATCH_NOMATCH);
2342 ph10 527 break;
2343    
2344 ph10 517 case PT_SPACE: /* Perl space */
2345     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2346     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2347     == (op == OP_NOTPROP))
2348     MRRETURN(MATCH_NOMATCH);
2349 ph10 527 break;
2350    
2351 ph10 517 case PT_PXSPACE: /* POSIX space */
2352     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2353 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2354 ph10 517 c == CHAR_FF || c == CHAR_CR)
2355     == (op == OP_NOTPROP))
2356     MRRETURN(MATCH_NOMATCH);
2357 ph10 527 break;
2358 nigel 87
2359 ph10 527 case PT_WORD:
2360 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2361 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2362 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2363     MRRETURN(MATCH_NOMATCH);
2364 ph10 527 break;
2365    
2366 ph10 517 /* This should never occur */
2367    
2368 nigel 87 default:
2369     RRETURN(PCRE_ERROR_INTERNAL);
2370 nigel 77 }
2371 nigel 87
2372     ecode += 3;
2373 nigel 77 }
2374     break;
2375    
2376     /* Match an extended Unicode sequence. We will get here only if the support
2377     is in the binary; otherwise a compile-time error occurs. */
2378    
2379     case OP_EXTUNI:
2380 ph10 443 if (eptr >= md->end_subject)
2381 ph10 428 {
2382 ph10 443 SCHECK_PARTIAL();
2383 ph10 510 MRRETURN(MATCH_NOMATCH);
2384 ph10 443 }
2385 nigel 77 GETCHARINCTEST(c, eptr);
2386     {
2387 ph10 349 int category = UCD_CATEGORY(c);
2388 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2389 nigel 77 while (eptr < md->end_subject)
2390     {
2391     int len = 1;
2392     if (!utf8) c = *eptr; else
2393     {
2394     GETCHARLEN(c, eptr, len);
2395     }
2396 ph10 349 category = UCD_CATEGORY(c);
2397 nigel 77 if (category != ucp_M) break;
2398     eptr += len;
2399     }
2400     }
2401     ecode++;
2402     break;
2403     #endif
2404    
2405    
2406     /* Match a back reference, possibly repeatedly. Look past the end of the
2407     item to see if there is repeat information following. The code is similar
2408     to that for character classes, but repeated for efficiency. Then obey
2409     similar code to character type repeats - written out again for speed.
2410     However, if the referenced string is the empty string, always treat
2411     it as matched, any number of times (otherwise there could be infinite
2412     loops). */
2413    
2414     case OP_REF:
2415 ph10 602 case OP_REFI:
2416     caseless = op == OP_REFI;
2417 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2418     ecode += 3;
2419 ph10 345
2420 ph10 595 /* If the reference is unset, there are two possibilities:
2421 ph10 345
2422 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2423     this ensures that every attempt at a match fails. We can't just fail
2424     here, because of the possibility of quantifiers with zero minima.
2425 ph10 345
2426 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2427     so that the back reference matches an empty string.
2428 ph10 345
2429 ph10 595 Otherwise, set the length to the length of what was matched by the
2430     referenced subpattern. */
2431 ph10 345
2432 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2433     length = (md->jscript_compat)? 0 : -1;
2434     else
2435     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2436 nigel 77
2437 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2438 nigel 77
2439 ph10 595 switch (*ecode)
2440     {
2441     case OP_CRSTAR:
2442     case OP_CRMINSTAR:
2443     case OP_CRPLUS:
2444     case OP_CRMINPLUS:
2445     case OP_CRQUERY:
2446     case OP_CRMINQUERY:
2447     c = *ecode++ - OP_CRSTAR;
2448     minimize = (c & 1) != 0;
2449     min = rep_min[c]; /* Pick up values from tables; */
2450     max = rep_max[c]; /* zero for max => infinity */
2451     if (max == 0) max = INT_MAX;
2452     break;
2453 nigel 77
2454 ph10 595 case OP_CRRANGE:
2455     case OP_CRMINRANGE:
2456     minimize = (*ecode == OP_CRMINRANGE);
2457     min = GET2(ecode, 1);
2458     max = GET2(ecode, 3);
2459     if (max == 0) max = INT_MAX;
2460     ecode += 5;
2461     break;
2462 nigel 77
2463 ph10 595 default: /* No repeat follows */
2464 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2465 ph10 595 {
2466     CHECK_PARTIAL();
2467     MRRETURN(MATCH_NOMATCH);
2468 nigel 77 }
2469 ph10 595 eptr += length;
2470     continue; /* With the main loop */
2471     }
2472 nigel 77
2473 ph10 595 /* Handle repeated back references. If the length of the reference is
2474     zero, just continue with the main loop. */
2475 ph10 443
2476 ph10 595 if (length == 0) continue;
2477 nigel 77
2478 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2479     the length of the reference string explicitly rather than passing the
2480     address of eptr, so that eptr can be a register variable. */
2481 nigel 77
2482 ph10 595 for (i = 1; i <= min; i++)
2483     {
2484     int slength;
2485 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2486 nigel 77 {
2487 ph10 595 CHECK_PARTIAL();
2488     MRRETURN(MATCH_NOMATCH);
2489 nigel 77 }
2490 ph10 595 eptr += slength;
2491     }
2492 nigel 77
2493 ph10 595 /* If min = max, continue at the same level without recursion.
2494     They are not both allowed to be zero. */
2495 nigel 77
2496 ph10 595 if (min == max) continue;
2497 nigel 77
2498 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2499 nigel 77
2500 ph10 595 if (minimize)
2501     {
2502     for (fi = min;; fi++)
2503 nigel 77 {
2504 ph10 595 int slength;
2505 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2506 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2507     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2508 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2509 nigel 77 {
2510 ph10 595 CHECK_PARTIAL();
2511     MRRETURN(MATCH_NOMATCH);
2512 nigel 77 }
2513 ph10 595 eptr += slength;
2514 nigel 77 }
2515 ph10 595 /* Control never gets here */
2516     }
2517 nigel 77
2518 ph10 595 /* If maximizing, find the longest string and work backwards */
2519 nigel 77
2520 ph10 595 else
2521     {
2522     pp = eptr;
2523     for (i = min; i < max; i++)
2524 nigel 77 {
2525 ph10 595 int slength;
2526 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2527 nigel 77 {
2528 ph10 595 CHECK_PARTIAL();
2529     break;
2530 nigel 77 }
2531 ph10 595 eptr += slength;
2532 nigel 77 }
2533 ph10 595 while (eptr >= pp)
2534     {
2535 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2536 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2537     eptr -= length;
2538     }
2539     MRRETURN(MATCH_NOMATCH);
2540 nigel 77 }
2541     /* Control never gets here */
2542    
2543     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2544     used when all the characters in the class have values in the range 0-255,
2545     and either the matching is caseful, or the characters are in the range
2546     0-127 when UTF-8 processing is enabled. The only difference between
2547     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2548     encountered.
2549    
2550     First, look past the end of the item to see if there is repeat information
2551     following. Then obey similar code to character type repeats - written out
2552     again for speed. */
2553    
2554     case OP_NCLASS:
2555     case OP_CLASS:
2556     {
2557     data = ecode + 1; /* Save for matching */
2558     ecode += 33; /* Advance past the item */
2559    
2560     switch (*ecode)
2561     {
2562     case OP_CRSTAR:
2563     case OP_CRMINSTAR:
2564     case OP_CRPLUS:
2565     case OP_CRMINPLUS:
2566     case OP_CRQUERY:
2567     case OP_CRMINQUERY:
2568     c = *ecode++ - OP_CRSTAR;
2569     minimize = (c & 1) != 0;
2570     min = rep_min[c]; /* Pick up values from tables; */
2571     max = rep_max[c]; /* zero for max => infinity */
2572     if (max == 0) max = INT_MAX;
2573     break;
2574    
2575     case OP_CRRANGE:
2576     case OP_CRMINRANGE:
2577     minimize = (*ecode == OP_CRMINRANGE);
2578     min = GET2(ecode, 1);
2579     max = GET2(ecode, 3);
2580     if (max == 0) max = INT_MAX;
2581     ecode += 5;
2582     break;
2583    
2584     default: /* No repeat follows */
2585     min = max = 1;
2586     break;
2587     }
2588    
2589     /* First, ensure the minimum number of matches are present. */
2590    
2591     #ifdef SUPPORT_UTF8
2592     /* UTF-8 mode */
2593     if (utf8)
2594     {
2595     for (i = 1; i <= min; i++)
2596     {
2597 ph10 427 if (eptr >= md->end_subject)
2598 ph10 426 {
2599 ph10 428 SCHECK_PARTIAL();
2600 ph10 510 MRRETURN(MATCH_NOMATCH);
2601 ph10 427 }
2602 nigel 77 GETCHARINC(c, eptr);
2603     if (c > 255)
2604     {
2605 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2606 nigel 77 }
2607     else
2608     {
2609 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2610 nigel 77 }
2611     }
2612     }
2613     else
2614     #endif
2615     /* Not UTF-8 mode */
2616     {
2617     for (i = 1; i <= min; i++)
2618     {
2619 ph10 427 if (eptr >= md->end_subject)
2620 ph10 426 {
2621 ph10 428 SCHECK_PARTIAL();
2622 ph10 510 MRRETURN(MATCH_NOMATCH);
2623 ph10 427 }
2624 nigel 77 c = *eptr++;
2625 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2626 nigel 77 }
2627     }
2628    
2629     /* If max == min we can continue with the main loop without the
2630     need to recurse. */
2631    
2632     if (min == max) continue;
2633    
2634     /* If minimizing, keep testing the rest of the expression and advancing
2635     the pointer while it matches the class. */
2636    
2637     if (minimize)
2638     {
2639     #ifdef SUPPORT_UTF8
2640     /* UTF-8 mode */
2641     if (utf8)
2642     {
2643     for (fi = min;; fi++)
2644     {
2645 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2646 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2647 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2648 ph10 427 if (eptr >= md->end_subject)
2649 ph10 426 {
2650 ph10 427 SCHECK_PARTIAL();
2651 ph10 510 MRRETURN(MATCH_NOMATCH);
2652 ph10 427 }
2653 nigel 77 GETCHARINC(c, eptr);
2654     if (c > 255)
2655     {
2656 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2657 nigel 77 }
2658     else
2659     {
2660 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2661 nigel 77 }
2662     }
2663     }
2664     else
2665     #endif
2666     /* Not UTF-8 mode */
2667     {
2668     for (fi = min;; fi++)
2669     {
2670 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2671 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2672 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2673 ph10 427 if (eptr >= md->end_subject)
2674 ph10 426 {
2675 ph10 427 SCHECK_PARTIAL();
2676 ph10 510 MRRETURN(MATCH_NOMATCH);
2677 ph10 427 }
2678 nigel 77 c = *eptr++;
2679 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2680 nigel 77 }
2681     }
2682     /* Control never gets here */
2683     }
2684    
2685     /* If maximizing, find the longest possible run, then work backwards. */
2686    
2687     else
2688     {
2689     pp = eptr;
2690    
2691     #ifdef SUPPORT_UTF8
2692     /* UTF-8 mode */
2693     if (utf8)
2694     {
2695     for (i = min; i < max; i++)
2696     {
2697     int len = 1;
2698 ph10 463 if (eptr >= md->end_subject)
2699 ph10 462 {
2700 ph10 463 SCHECK_PARTIAL();
2701 ph10 462 break;
2702 ph10 463 }
2703 nigel 77 GETCHARLEN(c, eptr, len);
2704     if (c > 255)
2705     {
2706     if (op == OP_CLASS) break;
2707     }
2708     else
2709     {
2710     if ((data[c/8] & (1 << (c&7))) == 0) break;
2711     }
2712     eptr += len;
2713     }
2714     for (;;)
2715     {
2716 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2717 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718     if (eptr-- == pp) break; /* Stop if tried at original pos */
2719     BACKCHAR(eptr);
2720     }
2721     }
2722     else
2723     #endif
2724     /* Not UTF-8 mode */
2725     {
2726     for (i = min; i < max; i++)
2727     {
2728 ph10 463 if (eptr >= md->end_subject)
2729 ph10 462 {
2730 ph10 463 SCHECK_PARTIAL();
2731 ph10 462 break;
2732 ph10 463 }
2733 nigel 77 c = *eptr;
2734     if ((data[c/8] & (1 << (c&7))) == 0) break;
2735     eptr++;
2736     }
2737     while (eptr >= pp)
2738     {
2739 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2740 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 nigel 77 eptr--;
2742     }
2743     }
2744    
2745 ph10 510 MRRETURN(MATCH_NOMATCH);
2746 nigel 77 }
2747     }
2748     /* Control never gets here */
2749    
2750    
2751     /* Match an extended character class. This opcode is encountered only
2752 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2753     mode, because Unicode properties are supported in non-UTF-8 mode. */
2754 nigel 77
2755     #ifdef SUPPORT_UTF8
2756     case OP_XCLASS:
2757     {
2758     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2759     ecode += GET(ecode, 1); /* Advance past the item */
2760    
2761     switch (*ecode)
2762     {
2763     case OP_CRSTAR:
2764     case OP_CRMINSTAR:
2765     case OP_CRPLUS:
2766     case OP_CRMINPLUS:
2767     case OP_CRQUERY:
2768     case OP_CRMINQUERY:
2769     c = *ecode++ - OP_CRSTAR;
2770     minimize = (c & 1) != 0;
2771     min = rep_min[c]; /* Pick up values from tables; */
2772     max = rep_max[c]; /* zero for max => infinity */
2773     if (max == 0) max = INT_MAX;
2774     break;
2775    
2776     case OP_CRRANGE:
2777     case OP_CRMINRANGE:
2778     minimize = (*ecode == OP_CRMINRANGE);
2779     min = GET2(ecode, 1);
2780     max = GET2(ecode, 3);
2781     if (max == 0) max = INT_MAX;
2782     ecode += 5;
2783     break;
2784    
2785     default: /* No repeat follows */
2786     min = max = 1;
2787     break;
2788     }
2789    
2790     /* First, ensure the minimum number of matches are present. */
2791    
2792     for (i = 1; i <= min; i++)
2793     {
2794 ph10 427 if (eptr >= md->end_subject)
2795 ph10 426 {
2796     SCHECK_PARTIAL();
2797 ph10 510 MRRETURN(MATCH_NOMATCH);
2798 ph10 427 }
2799 ph10 384 GETCHARINCTEST(c, eptr);
2800 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2801 nigel 77 }
2802    
2803     /* If max == min we can continue with the main loop without the
2804     need to recurse. */
2805    
2806     if (min == max) continue;
2807    
2808     /* If minimizing, keep testing the rest of the expression and advancing
2809     the pointer while it matches the class. */
2810    
2811     if (minimize)
2812     {
2813     for (fi = min;; fi++)
2814     {
2815 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2816 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2817 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2818 ph10 427 if (eptr >= md->end_subject)
2819 ph10 426 {
2820 ph10 427 SCHECK_PARTIAL();
2821 ph10 510 MRRETURN(MATCH_NOMATCH);
2822 ph10 427 }
2823 ph10 384 GETCHARINCTEST(c, eptr);
2824 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2825 nigel 77 }
2826     /* Control never gets here */
2827     }
2828    
2829     /* If maximizing, find the longest possible run, then work backwards. */
2830    
2831     else
2832     {
2833     pp = eptr;
2834     for (i = min; i < max; i++)
2835     {
2836     int len = 1;
2837 ph10 463 if (eptr >= md->end_subject)
2838 ph10 462 {
2839 ph10 463 SCHECK_PARTIAL();
2840 ph10 462 break;
2841 ph10 463 }
2842 ph10 384 GETCHARLENTEST(c, eptr, len);
2843 nigel 77 if (!_pcre_xclass(c, data)) break;
2844     eptr += len;
2845     }
2846     for(;;)
2847     {
2848 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2849 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2850     if (eptr-- == pp) break; /* Stop if tried at original pos */
2851 ph10 214 if (utf8) BACKCHAR(eptr);
2852 nigel 77 }
2853 ph10 510 MRRETURN(MATCH_NOMATCH);
2854 nigel 77 }
2855    
2856     /* Control never gets here */
2857     }
2858     #endif /* End of XCLASS */
2859    
2860     /* Match a single character, casefully */
2861    
2862     case OP_CHAR:
2863     #ifdef SUPPORT_UTF8
2864     if (utf8)
2865     {
2866     length = 1;
2867     ecode++;
2868     GETCHARLEN(fc, ecode, length);
2869 ph10 443 if (length > md->end_subject - eptr)
2870 ph10 428 {
2871     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2872 ph10 510 MRRETURN(MATCH_NOMATCH);
2873 ph10 443 }
2874 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2875 nigel 77 }
2876     else
2877     #endif
2878    
2879     /* Non-UTF-8 mode */
2880     {
2881 ph10 443 if (md->end_subject - eptr < 1)
2882 ph10 428 {
2883     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2884 ph10 510 MRRETURN(MATCH_NOMATCH);
2885 ph10 443 }
2886 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2887 nigel 77 ecode += 2;
2888     }
2889     break;
2890    
2891     /* Match a single character, caselessly */
2892    
2893 ph10 602 case OP_CHARI:
2894 nigel 77 #ifdef SUPPORT_UTF8
2895     if (utf8)
2896     {
2897     length = 1;
2898     ecode++;
2899     GETCHARLEN(fc, ecode, length);
2900    
2901 ph10 443 if (length > md->end_subject - eptr)
2902 ph10 428 {
2903     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2904 ph10 510 MRRETURN(MATCH_NOMATCH);
2905 ph10 443 }
2906 nigel 77
2907     /* If the pattern character's value is < 128, we have only one byte, and
2908     can use the fast lookup table. */
2909    
2910     if (fc < 128)
2911     {
2912 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2913 nigel 77 }
2914    
2915     /* Otherwise we must pick up the subject character */
2916    
2917     else
2918     {
2919 nigel 93 unsigned int dc;
2920 nigel 77 GETCHARINC(dc, eptr);
2921     ecode += length;
2922    
2923     /* If we have Unicode property support, we can use it to test the other
2924 nigel 87 case of the character, if there is one. */
2925 nigel 77
2926     if (fc != dc)
2927     {
2928     #ifdef SUPPORT_UCP
2929 ph10 349 if (dc != UCD_OTHERCASE(fc))
2930 nigel 77 #endif
2931 ph10 510 MRRETURN(MATCH_NOMATCH);
2932 nigel 77 }
2933     }
2934     }
2935     else
2936     #endif /* SUPPORT_UTF8 */
2937    
2938     /* Non-UTF-8 mode */
2939     {
2940 ph10 443 if (md->end_subject - eptr < 1)
2941 ph10 428 {
2942 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2943 ph10 510 MRRETURN(MATCH_NOMATCH);
2944 ph10 443 }
2945 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2946 nigel 77 ecode += 2;
2947     }
2948     break;
2949    
2950 nigel 93 /* Match a single character repeatedly. */
2951 nigel 77
2952     case OP_EXACT:
2953 ph10 602 case OP_EXACTI:
2954 nigel 77 min = max = GET2(ecode, 1);
2955     ecode += 3;
2956     goto REPEATCHAR;
2957    
2958 nigel 93 case OP_POSUPTO:
2959 ph10 602 case OP_POSUPTOI:
2960 nigel 93 possessive = TRUE;
2961     /* Fall through */
2962    
2963 nigel 77 case OP_UPTO:
2964 ph10 602 case OP_UPTOI:
2965 nigel 77 case OP_MINUPTO:
2966 ph10 602 case OP_MINUPTOI:
2967 nigel 77 min = 0;
2968     max = GET2(ecode, 1);
2969 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2970 nigel 77 ecode += 3;
2971     goto REPEATCHAR;
2972    
2973 nigel 93 case OP_POSSTAR:
2974 ph10 602 case OP_POSSTARI:
2975 nigel 93 possessive = TRUE;
2976     min = 0;
2977     max = INT_MAX;
2978     ecode++;
2979     goto REPEATCHAR;
2980    
2981     case OP_POSPLUS:
2982 ph10 602 case OP_POSPLUSI:
2983 nigel 93 possessive = TRUE;
2984     min = 1;
2985     max = INT_MAX;
2986     ecode++;
2987     goto REPEATCHAR;
2988    
2989     case OP_POSQUERY:
2990 ph10 602 case OP_POSQUERYI:
2991 nigel 93 possessive = TRUE;
2992     min = 0;
2993     max = 1;
2994     ecode++;
2995     goto REPEATCHAR;
2996    
2997 nigel 77 case OP_STAR:
2998 ph10 602 case OP_STARI:
2999 nigel 77 case OP_MINSTAR:
3000 ph10 602 case OP_MINSTARI:
3001 nigel 77 case OP_PLUS:
3002 ph10 602 case OP_PLUSI:
3003 nigel 77 case OP_MINPLUS:
3004 ph10 602 case OP_MINPLUSI:
3005 nigel 77 case OP_QUERY:
3006 ph10 602 case OP_QUERYI:
3007 nigel 77 case OP_MINQUERY:
3008 ph10 602 case OP_MINQUERYI:
3009     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3010 nigel 77 minimize = (c & 1) != 0;
3011     min = rep_min[c]; /* Pick up values from tables; */
3012     max = rep_max[c]; /* zero for max => infinity */
3013     if (max == 0) max = INT_MAX;
3014    
3015 ph10 426 /* Common code for all repeated single-character matches. */
3016 nigel 77
3017     REPEATCHAR:
3018     #ifdef SUPPORT_UTF8
3019     if (utf8)
3020     {
3021     length = 1;
3022     charptr = ecode;
3023     GETCHARLEN(fc, ecode, length);
3024     ecode += length;
3025    
3026     /* Handle multibyte character matching specially here. There is
3027     support for caseless matching if UCP support is present. */
3028    
3029     if (length > 1)
3030     {
3031     #ifdef SUPPORT_UCP
3032 nigel 93 unsigned int othercase;
3033 ph10 602 if (op >= OP_STARI && /* Caseless */
3034 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3035 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3036 ph10 115 else oclength = 0;
3037 nigel 77 #endif /* SUPPORT_UCP */
3038    
3039     for (i = 1; i <= min; i++)
3040     {
3041 ph10 426 if (eptr <= md->end_subject - length &&
3042     memcmp(eptr, charptr, length) == 0) eptr += length;
3043 ph10 123 #ifdef SUPPORT_UCP
3044 ph10 426 else if (oclength > 0 &&
3045     eptr <= md->end_subject - oclength &&
3046     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3047     #endif /* SUPPORT_UCP */
3048 nigel 77 else
3049     {
3050 ph10 426 CHECK_PARTIAL();
3051 ph10 510 MRRETURN(MATCH_NOMATCH);
3052 nigel 77 }
3053     }
3054    
3055     if (min == max) continue;
3056    
3057     if (minimize)
3058     {
3059     for (fi = min;; fi++)
3060     {
3061 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3062 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3063 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3064 ph10 426 if (eptr <= md->end_subject - length &&
3065     memcmp(eptr, charptr, length) == 0) eptr += length;
3066 ph10 123 #ifdef SUPPORT_UCP
3067 ph10 426 else if (oclength > 0 &&
3068     eptr <= md->end_subject - oclength &&
3069     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3070     #endif /* SUPPORT_UCP */
3071 nigel 77 else
3072     {
3073 ph10 426 CHECK_PARTIAL();
3074 ph10 510 MRRETURN(MATCH_NOMATCH);
3075 nigel 77 }
3076     }
3077     /* Control never gets here */
3078     }
3079 nigel 93
3080     else /* Maximize */
3081 nigel 77 {
3082     pp = eptr;
3083     for (i = min; i < max; i++)
3084     {
3085 ph10 426 if (eptr <= md->end_subject - length &&
3086     memcmp(eptr, charptr, length) == 0) eptr += length;
3087 ph10 123 #ifdef SUPPORT_UCP
3088 ph10 426 else if (oclength > 0 &&
3089     eptr <= md->end_subject - oclength &&
3090     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3091     #endif /* SUPPORT_UCP */
3092 ph10 463 else
3093 ph10 462 {
3094 ph10 463 CHECK_PARTIAL();
3095 ph10 462 break;
3096 ph10 463 }
3097 nigel 77 }
3098 nigel 93
3099     if (possessive) continue;
3100 ph10 427
3101 ph10 120 for(;;)
3102 ph10 426 {
3103 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3104 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3105 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3106 ph10 115 #ifdef SUPPORT_UCP
3107 ph10 426 eptr--;
3108     BACKCHAR(eptr);
3109 ph10 123 #else /* without SUPPORT_UCP */
3110 ph10 426 eptr -= length;
3111 ph10 123 #endif /* SUPPORT_UCP */
3112 ph10 426 }
3113 nigel 77 }
3114     /* Control never gets here */
3115     }
3116    
3117     /* If the length of a UTF-8 character is 1, we fall through here, and
3118     obey the code as for non-UTF-8 characters below, though in this case the
3119     value of fc will always be < 128. */
3120     }
3121     else
3122     #endif /* SUPPORT_UTF8 */
3123    
3124     /* When not in UTF-8 mode, load a single-byte character. */
3125    
3126 ph10 426 fc = *ecode++;
3127 ph10 443
3128 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3129     may not be in UTF-8 mode. The code is duplicated for the caseless and
3130     caseful cases, for speed, since matching characters is likely to be quite
3131     common. First, ensure the minimum number of matches are present. If min =
3132     max, continue at the same level without recursing. Otherwise, if
3133     minimizing, keep trying the rest of the expression and advancing one
3134     matching character if failing, up to the maximum. Alternatively, if
3135     maximizing, find the maximum number of characters and work backwards. */
3136    
3137     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3138     max, eptr));
3139    
3140 ph10 602 if (op >= OP_STARI) /* Caseless */
3141 nigel 77 {
3142     fc = md->lcc[fc];
3143     for (i = 1; i <= min; i++)
3144 ph10 426 {
3145     if (eptr >= md->end_subject)
3146     {
3147     SCHECK_PARTIAL();
3148 ph10 510 MRRETURN(MATCH_NOMATCH);
3149 ph10 426 }
3150 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3151 ph10 426 }
3152 nigel 77 if (min == max) continue;
3153     if (minimize)
3154     {
3155     for (fi = min;; fi++)
3156     {
3157 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3158 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3159 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3160 ph10 426 if (eptr >= md->end_subject)
3161     {
3162 ph10 427 SCHECK_PARTIAL();
3163 ph10 510 MRRETURN(MATCH_NOMATCH);
3164 ph10 426 }
3165 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3166 nigel 77 }
3167     /* Control never gets here */
3168     }
3169 nigel 93 else /* Maximize */
3170 nigel 77 {
3171     pp = eptr;
3172     for (i = min; i < max; i++)
3173     {
3174 ph10 463 if (eptr >= md->end_subject)
3175 ph10 462 {
3176     SCHECK_PARTIAL();
3177     break;
3178 ph10 463 }
3179 ph10 462 if (fc != md->lcc[*eptr]) break;
3180 nigel 77 eptr++;
3181     }
3182 ph10 427
3183 nigel 93 if (possessive) continue;
3184 ph10 427
3185 nigel 77 while (eptr >= pp)
3186     {
3187 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3188 nigel 77 eptr--;
3189     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3190     }
3191 ph10 510 MRRETURN(MATCH_NOMATCH);
3192 nigel 77 }
3193     /* Control never gets here */
3194     }
3195    
3196     /* Caseful comparisons (includes all multi-byte characters) */
3197    
3198     else
3199     {
3200 ph10 427 for (i = 1; i <= min; i++)
3201 ph10 426 {
3202     if (eptr >= md->end_subject)
3203     {
3204     SCHECK_PARTIAL();
3205 ph10 510 MRRETURN(MATCH_NOMATCH);
3206 ph10 426 }
3207 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3208 ph10 427 }
3209 ph10 443
3210 nigel 77 if (min == max) continue;
3211 ph10 443
3212 nigel 77 if (minimize)
3213     {
3214     for (fi = min;; fi++)
3215     {
3216 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3217 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3218 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3219 ph10 426 if (eptr >= md->end_subject)
3220 ph10 427 {
3221 ph10 426 SCHECK_PARTIAL();
3222 ph10 510 MRRETURN(MATCH_NOMATCH);
3223 ph10 427 }
3224 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3225 nigel 77 }
3226     /* Control never gets here */
3227     }
3228 nigel 93 else /* Maximize */
3229 nigel 77 {
3230     pp = eptr;
3231     for (i = min; i < max; i++)
3232     {
3233 ph10 463 if (eptr >= md->end_subject)
3234 ph10 462 {
3235 ph10 463 SCHECK_PARTIAL();
3236 ph10 462 break;
3237 ph10 463 }
3238 ph10 462 if (fc != *eptr) break;
3239 nigel 77 eptr++;
3240     }
3241 nigel 93 if (possessive) continue;
3242 ph10 443
3243 nigel 77 while (eptr >= pp)
3244     {
3245 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3246 nigel 77 eptr--;
3247     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3248     }
3249 ph10 510 MRRETURN(MATCH_NOMATCH);
3250 nigel 77 }
3251     }
3252     /* Control never gets here */
3253    
3254     /* Match a negated single one-byte character. The character we are
3255     checking can be multibyte. */
3256    
3257     case OP_NOT:
3258 ph10 602 case OP_NOTI:
3259 ph10 443 if (eptr >= md->end_subject)
3260 ph10 428 {
3261 ph10 443 SCHECK_PARTIAL();
3262 ph10 510 MRRETURN(MATCH_NOMATCH);
3263 ph10 443 }
3264 nigel 77 ecode++;
3265     GETCHARINCTEST(c, eptr);
3266 ph10 602 if (op == OP_NOTI) /* The caseless case */
3267 nigel 77 {
3268     #ifdef SUPPORT_UTF8
3269     if (c < 256)
3270     #endif
3271     c = md->lcc[c];
3272 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3273 nigel 77 }
3274 ph10 602 else /* Caseful */
3275 nigel 77 {
3276 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3277 nigel 77 }
3278     break;
3279    
3280     /* Match a negated single one-byte character repeatedly. This is almost a
3281     repeat of the code for a repeated single character, but I haven't found a
3282     nice way of commoning these up that doesn't require a test of the
3283     positive/negative option for each character match. Maybe that wouldn't add
3284     very much to the time taken, but character matching *is* what this is all
3285     about... */
3286    
3287     case OP_NOTEXACT:
3288 ph10 602 case OP_NOTEXACTI:
3289 nigel 77 min = max = GET2(ecode, 1);
3290     ecode += 3;
3291     goto REPEATNOTCHAR;
3292    
3293     case OP_NOTUPTO:
3294 ph10 602 case OP_NOTUPTOI:
3295 nigel 77 case OP_NOTMINUPTO:
3296 ph10 602 case OP_NOTMINUPTOI:
3297 nigel 77 min = 0;
3298     max = GET2(ecode, 1);
3299 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3300 nigel 77 ecode += 3;
3301     goto REPEATNOTCHAR;
3302    
3303 nigel 93 case OP_NOTPOSSTAR:
3304 ph10 602 case OP_NOTPOSSTARI:
3305 nigel 93 possessive = TRUE;
3306     min = 0;
3307     max = INT_MAX;
3308     ecode++;
3309     goto REPEATNOTCHAR;
3310    
3311     case OP_NOTPOSPLUS:
3312 ph10 602 case OP_NOTPOSPLUSI:
3313 nigel 93 possessive = TRUE;
3314     min = 1;
3315     max = INT_MAX;
3316     ecode++;
3317     goto REPEATNOTCHAR;
3318    
3319     case OP_NOTPOSQUERY:
3320 ph10 602 case OP_NOTPOSQUERYI:
3321 nigel 93 possessive = TRUE;
3322     min = 0;
3323     max = 1;
3324     ecode++;
3325     goto REPEATNOTCHAR;
3326    
3327     case OP_NOTPOSUPTO:
3328 ph10 602 case OP_NOTPOSUPTOI:
3329 nigel 93 possessive = TRUE;
3330     min = 0;
3331     max = GET2(ecode, 1);
3332     ecode += 3;
3333     goto REPEATNOTCHAR;
3334    
3335 nigel 77 case OP_NOTSTAR:
3336 ph10 602 case OP_NOTSTARI:
3337 nigel 77 case OP_NOTMINSTAR:
3338 ph10 602 case OP_NOTMINSTARI:
3339 nigel 77 case OP_NOTPLUS:
3340 ph10 602 case OP_NOTPLUSI:
3341 nigel 77 case OP_NOTMINPLUS:
3342 ph10 602 case OP_NOTMINPLUSI:
3343 nigel 77 case OP_NOTQUERY:
3344 ph10 602 case OP_NOTQUERYI:
3345 nigel 77 case OP_NOTMINQUERY:
3346 ph10 602 case OP_NOTMINQUERYI:
3347     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3348 nigel 77 minimize = (c & 1) != 0;
3349     min = rep_min[c]; /* Pick up values from tables; */
3350     max = rep_max[c]; /* zero for max => infinity */
3351     if (max == 0) max = INT_MAX;
3352    
3353 ph10 426 /* Common code for all repeated single-byte matches. */
3354 nigel 77
3355     REPEATNOTCHAR:
3356     fc = *ecode++;
3357    
3358     /* The code is duplicated for the caseless and caseful cases, for speed,
3359     since matching characters is likely to be quite common. First, ensure the
3360     minimum number of matches are present. If min = max, continue at the same
3361     level without recursing. Otherwise, if minimizing, keep trying the rest of
3362     the expression and advancing one matching character if failing, up to the
3363     maximum. Alternatively, if maximizing, find the maximum number of
3364     characters and work backwards. */
3365    
3366     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3367     max, eptr));
3368    
3369 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3370 nigel 77 {
3371     fc = md->lcc[fc];
3372    
3373     #ifdef SUPPORT_UTF8
3374     /* UTF-8 mode */
3375     if (utf8)
3376     {
3377 nigel 93 register unsigned int d;
3378 nigel 77 for (i = 1; i <= min; i++)
3379     {
3380 ph10 426 if (eptr >= md->end_subject)
3381     {
3382     SCHECK_PARTIAL();
3383 ph10 510 MRRETURN(MATCH_NOMATCH);
3384 ph10 427 }
3385 nigel 77 GETCHARINC(d, eptr);
3386     if (d < 256) d = md->lcc[d];
3387 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3388 nigel 77 }
3389     }
3390     else
3391     #endif
3392    
3393     /* Not UTF-8 mode */
3394     {
3395     for (i = 1; i <= min; i++)
3396 ph10 426 {
3397     if (eptr >= md->end_subject)
3398     {
3399     SCHECK_PARTIAL();
3400 ph10 510 MRRETURN(MATCH_NOMATCH);
3401 ph10 427 }
3402 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3403 ph10 427 }
3404 nigel 77 }
3405    
3406     if (min == max) continue;
3407    
3408     if (minimize)
3409     {
3410     #ifdef SUPPORT_UTF8
3411     /* UTF-8 mode */
3412     if (utf8)
3413     {
3414 nigel 93 register unsigned int d;
3415 nigel 77 for (fi = min;; fi++)
3416     {
3417 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3418 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3420 ph10 427 if (eptr >= md->end_subject)
3421 ph10 426 {
3422 ph10 427 SCHECK_PARTIAL();
3423 ph10 510 MRRETURN(MATCH_NOMATCH);
3424 ph10 427 }
3425 nigel 77 GETCHARINC(d, eptr);
3426     if (d < 256) d = md->lcc[d];
3427 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3428 nigel 77 }
3429     }
3430     else
3431     #endif
3432     /* Not UTF-8 mode */
3433     {
3434     for (fi = min;; fi++)
3435     {
3436 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3437 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3439 ph10 426 if (eptr >= md->end_subject)
3440     {
3441     SCHECK_PARTIAL();
3442 ph10 510 MRRETURN(MATCH_NOMATCH);
3443 ph10 426 }
3444 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3445 nigel 77 }
3446     }
3447     /* Control never gets here */
3448     }
3449    
3450     /* Maximize case */
3451    
3452     else
3453     {
3454     pp = eptr;
3455    
3456     #ifdef SUPPORT_UTF8
3457     /* UTF-8 mode */
3458     if (utf8)
3459     {
3460 nigel 93 register unsigned int d;
3461 nigel 77 for (i = min; i < max; i++)
3462     {
3463     int len = 1;
3464 ph10 463 if (eptr >= md->end_subject)
3465 ph10 462 {
3466 ph10 463 SCHECK_PARTIAL();
3467 ph10 462 break;
3468 ph10 463 }
3469 nigel 77 GETCHARLEN(d, eptr, len);
3470     if (d < 256) d = md->lcc[d];
3471     if (fc == d) break;
3472     eptr += len;
3473     }
3474 nigel 93 if (possessive) continue;
3475     for(;;)
3476 nigel 77 {
3477 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3478 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479     if (eptr-- == pp) break; /* Stop if tried at original pos */
3480     BACKCHAR(eptr);
3481     }
3482     }
3483     else
3484     #endif
3485     /* Not UTF-8 mode */
3486     {
3487     for (i = min; i < max; i++)
3488     {
3489 ph10 463 if (eptr >= md->end_subject)
3490 ph10 462 {
3491     SCHECK_PARTIAL();
3492     break;
3493 ph10 463 }
3494 ph10 462 if (fc == md->lcc[*eptr]) break;
3495 nigel 77 eptr++;
3496     }
3497 nigel 93 if (possessive) continue;
3498 nigel 77 while (eptr >= pp)
3499     {
3500 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3501 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3502     eptr--;
3503     }
3504     }
3505    
3506 ph10 510 MRRETURN(MATCH_NOMATCH);
3507 nigel 77 }
3508     /* Control never gets here */
3509     }
3510    
3511     /* Caseful comparisons */
3512    
3513     else
3514     {
3515     #ifdef SUPPORT_UTF8
3516     /* UTF-8 mode */
3517     if (utf8)
3518     {
3519 nigel 93 register unsigned int d;
3520 nigel 77 for (i = 1; i <= min; i++)
3521     {
3522 ph10 426 if (eptr >= md->end_subject)
3523     {
3524     SCHECK_PARTIAL();
3525 ph10 510 MRRETURN(MATCH_NOMATCH);
3526 ph10 427 }
3527 nigel 77 GETCHARINC(d, eptr);
3528 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3529 nigel 77 }
3530     }
3531     else
3532     #endif
3533     /* Not UTF-8 mode */
3534     {
3535     for (i = 1; i <= min; i++)
3536 ph10 426 {
3537     if (eptr >= md->end_subject)
3538     {
3539     SCHECK_PARTIAL();
3540 ph10 510 MRRETURN(MATCH_NOMATCH);
3541 ph10 427 }
3542 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3543 ph10 427 }
3544 nigel 77 }
3545    
3546     if (min == max) continue;
3547    
3548     if (minimize)
3549     {
3550     #ifdef SUPPORT_UTF8
3551     /* UTF-8 mode */
3552     if (utf8)
3553     {
3554 nigel 93 register unsigned int d;
3555 nigel 77 for (fi = min;; fi++)
3556     {
3557 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3558 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3559 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3560 ph10 427 if (eptr >= md->end_subject)
3561 ph10 426 {
3562 ph10 427 SCHECK_PARTIAL();
3563 ph10 510 MRRETURN(MATCH_NOMATCH);
3564 ph10 427 }
3565 nigel 77 GETCHARINC(d, eptr);
3566 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3567 nigel 77 }
3568     }
3569     else
3570     #endif
3571     /* Not UTF-8 mode */
3572     {
3573     for (fi = min;; fi++)
3574     {
3575 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3576 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3577 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3578 ph10 426 if (eptr >= md->end_subject)
3579     {
3580     SCHECK_PARTIAL();
3581 ph10 510 MRRETURN(MATCH_NOMATCH);
3582 ph10 427 }
3583 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3584 nigel 77 }
3585     }
3586     /* Control never gets here */
3587     }
3588    
3589     /* Maximize case */
3590    
3591     else
3592     {
3593     pp = eptr;
3594    
3595     #ifdef SUPPORT_UTF8
3596     /* UTF-8 mode */
3597     if (utf8)
3598     {
3599 nigel 93 register unsigned int d;
3600 nigel 77 for (i = min; i < max; i++)
3601     {
3602     int len = 1;
3603 ph10 463 if (eptr >= md->end_subject)
3604 ph10 462 {
3605 ph10 463 SCHECK_PARTIAL();
3606 ph10 462 break;
3607 ph10 463 }
3608 nigel 77 GETCHARLEN(d, eptr, len);
3609     if (fc == d) break;
3610     eptr += len;
3611     }
3612 nigel 93 if (possessive) continue;
3613 nigel 77 for(;;)
3614     {
3615 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3616 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3617     if (eptr-- == pp) break; /* Stop if tried at original pos */
3618     BACKCHAR(eptr);
3619     }
3620     }
3621     else
3622     #endif
3623     /* Not UTF-8 mode */
3624     {
3625     for (i = min; i < max; i++)
3626     {
3627 ph10 463 if (eptr >= md->end_subject)
3628 ph10 462 {
3629 ph10 463 SCHECK_PARTIAL();
3630 ph10 462 break;
3631 ph10 463 }
3632 ph10 462 if (fc == *eptr) break;
3633 nigel 77 eptr++;
3634     }
3635 nigel 93 if (possessive) continue;
3636 nigel 77 while (eptr >= pp)
3637     {
3638 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3639 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3640     eptr--;
3641     }
3642     }
3643    
3644 ph10 510 MRRETURN(MATCH_NOMATCH);
3645 nigel 77 }
3646     }
3647     /* Control never gets here */
3648    
3649     /* Match a single character type repeatedly; several different opcodes
3650     share code. This is very similar to the code for single characters, but we
3651     repeat it in the interests of efficiency. */
3652    
3653     case OP_TYPEEXACT:
3654     min = max = GET2(ecode, 1);
3655     minimize = TRUE;
3656     ecode += 3;
3657     goto REPEATTYPE;
3658    
3659     case OP_TYPEUPTO:
3660     case OP_TYPEMINUPTO:
3661     min = 0;
3662     max = GET2(ecode, 1);
3663     minimize = *ecode == OP_TYPEMINUPTO;
3664     ecode += 3;
3665     goto REPEATTYPE;
3666    
3667 nigel 93 case OP_TYPEPOSSTAR:
3668     possessive = TRUE;
3669     min = 0;
3670     max = INT_MAX;
3671     ecode++;
3672     goto REPEATTYPE;
3673    
3674     case OP_TYPEPOSPLUS:
3675     possessive = TRUE;
3676     min = 1;
3677     max = INT_MAX;
3678     ecode++;
3679     goto REPEATTYPE;
3680    
3681     case OP_TYPEPOSQUERY:
3682     possessive = TRUE;
3683     min = 0;
3684     max = 1;
3685     ecode++;
3686     goto REPEATTYPE;
3687    
3688     case OP_TYPEPOSUPTO:
3689     possessive = TRUE;
3690     min = 0;
3691     max = GET2(ecode, 1);
3692     ecode += 3;
3693     goto REPEATTYPE;
3694    
3695 nigel 77 case OP_TYPESTAR:
3696     case OP_TYPEMINSTAR:
3697     case OP_TYPEPLUS:
3698     case OP_TYPEMINPLUS:
3699     case OP_TYPEQUERY:
3700     case OP_TYPEMINQUERY:
3701     c = *ecode++ - OP_TYPESTAR;
3702     minimize = (c & 1) != 0;
3703     min = rep_min[c]; /* Pick up values from tables; */
3704     max = rep_max[c]; /* zero for max => infinity */
3705     if (max == 0) max = INT_MAX;
3706    
3707     /* Common code for all repeated single character type matches. Note that
3708     in UTF-8 mode, '.' matches a character of any length, but for the other
3709     character types, the valid characters are all one-byte long. */
3710    
3711     REPEATTYPE:
3712     ctype = *ecode++; /* Code for the character type */
3713    
3714     #ifdef SUPPORT_UCP
3715     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3716     {
3717     prop_fail_result = ctype == OP_NOTPROP;
3718     prop_type = *ecode++;
3719 nigel 87 prop_value = *ecode++;
3720 nigel 77 }
3721     else prop_type = -1;
3722     #endif
3723    
3724     /* First, ensure the minimum number of matches is present. Use inline
3725     code for maximizing the speed, and do the type test once at the start
3726 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3727 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3728     and single-byte modes. */
3729    
3730     if (min > 0)
3731     {
3732     #ifdef SUPPORT_UCP
3733 nigel 87 if (prop_type >= 0)
3734 nigel 77 {
3735 nigel 87 switch(prop_type)
3736 nigel 77 {
3737 nigel 87 case PT_ANY:
3738 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3739 nigel 87 for (i = 1; i <= min; i++)
3740     {
3741 ph10 427 if (eptr >= md->end_subject)
3742 ph10 426 {
3743 ph10 427 SCHECK_PARTIAL();
3744 ph10 510 MRRETURN(MATCH_NOMATCH);
3745 ph10 427 }
3746 ph10 184 GETCHARINCTEST(c, eptr);
3747 nigel 87 }
3748     break;
3749    
3750     case PT_LAMP:
3751     for (i = 1; i <= min; i++)
3752     {
3753 ph10 427 if (eptr >= md->end_subject)
3754 ph10 426 {
3755 ph10 427 SCHECK_PARTIAL();
3756 ph10 510 MRRETURN(MATCH_NOMATCH);
3757 ph10 427 }
3758 ph10 184 GETCHARINCTEST(c, eptr);
3759 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3760 nigel 87 if ((prop_chartype == ucp_Lu ||
3761     prop_chartype == ucp_Ll ||
3762     prop_chartype == ucp_Lt) == prop_fail_result)
3763 ph10 510 MRRETURN(MATCH_NOMATCH);
3764 nigel 87 }
3765     break;
3766    
3767     case PT_GC:
3768     for (i = 1; i <= min; i++)
3769     {
3770 ph10 427 if (eptr >= md->end_subject)
3771 ph10 426 {
3772 ph10 427 SCHECK_PARTIAL();
3773 ph10 510 MRRETURN(MATCH_NOMATCH);
3774 ph10 427 }
3775 ph10 184 GETCHARINCTEST(c, eptr);
3776 ph10 349 prop_category = UCD_CATEGORY(c);
3777 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3778 ph10 510 MRRETURN(MATCH_NOMATCH);
3779 nigel 87 }
3780     break;
3781    
3782     case PT_PC:
3783     for (i = 1; i <= min; i++)
3784     {
3785 ph10 427 if (eptr >= md->end_subject)
3786 ph10 426 {
3787 ph10 427 SCHECK_PARTIAL();
3788 ph10 510 MRRETURN(MATCH_NOMATCH);
3789 ph10 427 }
3790 ph10 184 GETCHARINCTEST(c, eptr);
3791 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3792 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3793 ph10 510 MRRETURN(MATCH_NOMATCH);
3794 nigel 87 }
3795     break;
3796    
3797     case PT_SC:
3798     for (i = 1; i <= min; i++)
3799     {
3800 ph10 427 if (eptr >= md->end_subject)
3801 ph10 426 {
3802 ph10 427 SCHECK_PARTIAL();
3803 ph10 510 MRRETURN(MATCH_NOMATCH);
3804 ph10 427 }
3805 ph10 184 GETCHARINCTEST(c, eptr);
3806 ph10 349 prop_script = UCD_SCRIPT(c);
3807 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3808 ph10 510 MRRETURN(MATCH_NOMATCH);
3809 nigel 87 }
3810     break;
3811 ph10 527
3812 ph10 517 case PT_ALNUM:
3813     for (i = 1; i <= min; i++)
3814     {
3815     if (eptr >= md->end_subject)
3816     {
3817     SCHECK_PARTIAL();
3818     MRRETURN(MATCH_NOMATCH);
3819     }
3820     GETCHARINCTEST(c, eptr);
3821 ph10 527 prop_category = UCD_CATEGORY(c);
3822     if ((prop_category == ucp_L || prop_category == ucp_N)
3823 ph10 517 == prop_fail_result)
3824     MRRETURN(MATCH_NOMATCH);
3825     }
3826     break;
3827 ph10 527
3828 ph10 517 case PT_SPACE: /* Perl space */
3829     for (i = 1; i <= min; i++)
3830     {
3831     if (eptr >= md->end_subject)
3832     {
3833     SCHECK_PARTIAL();
3834     MRRETURN(MATCH_NOMATCH);
3835     }
3836     GETCHARINCTEST(c, eptr);
3837 ph10 527 prop_category = UCD_CATEGORY(c);
3838     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3839     c == CHAR_FF || c == CHAR_CR)
3840 ph10 517 == prop_fail_result)
3841     MRRETURN(MATCH_NOMATCH);
3842     }
3843     break;
3844 ph10 527
3845 ph10 517 case PT_PXSPACE: /* POSIX space */
3846     for (i = 1; i <= min; i++)
3847     {
3848     if (eptr >= md->end_subject)
3849     {
3850     SCHECK_PARTIAL();
3851     MRRETURN(MATCH_NOMATCH);
3852     }
3853     GETCHARINCTEST(c, eptr);
3854 ph10 527 prop_category = UCD_CATEGORY(c);
3855     if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3856     c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3857 ph10 517 == prop_fail_result)
3858     MRRETURN(MATCH_NOMATCH);
3859     }
3860     break;
3861 ph10 527
3862     case PT_WORD:
3863 ph10 517 for (i = 1; i <= min; i++)
3864     {
3865     if (eptr >= md->end_subject)
3866     {
3867     SCHECK_PARTIAL();
3868     MRRETURN(MATCH_NOMATCH);
3869     }
3870     GETCHARINCTEST(c, eptr);
3871 ph10 527 prop_category = UCD_CATEGORY(c);
3872 ph10 517 if ((prop_category == ucp_L || prop_category == ucp_N ||
3873 ph10 527 c == CHAR_UNDERSCORE)
3874 ph10 517