/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 708 - (hide annotations) (download)
Fri Sep 23 11:03:03 2011 UTC (20 months, 3 weeks ago) by ph10
File MIME type: text/plain
File size: 197205 byte(s)
File tidies for 8.20-RC2 release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
/* Values stored in md->match_function_type to flag two special kinds of call
to match(). A field of the match data block is used for this, rather than
another stack variable, because stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. They are made
sufficiently negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
84 ph10 210
/* This is a convenience macro for code that occurs many times: it stores the
current mark pointer in the match data block and then returns via RRETURN.
It expands to a braced block, and it uses md, markptr and RRETURN from the
point of expansion. */

#define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }
92    
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
98    
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of rep_min pairs with entry i of
rep_max. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
/* Numbers for RMATCH calls. Each RMATCH call site is given its own name from
this list; the values run consecutively from 1. When this list is changed, the
code at HEAP_RETURN below must be updated in sync. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
#define REGISTER register

/* Stack-recursion forms of RMATCH and RRETURN. RMATCH calls match() directly
and leaves the result in rrc; mstart, markptr and rdepth are taken from the
point of expansion, and the "rw" argument is ignored. RRETURN is a plain
return. The PCRE_DEBUG forms do the same, with a trace printed around the
call and before the return. */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
306    
307 nigel 77 #else
308    
309    
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH obtains a new frame with pcre_stack_malloc, returning
PCRE_ERROR_NOMEMORY through RRETURN if that fails. It records the resume
number "rw" in the current frame, copies the call arguments (and the current
mstart and markptr) into the new frame with the depth increased by one, links
the new frame to the current one, makes it current, and jumps to
HEAP_RECURSE. The label L_<rw> that it defines marks the point at which
execution resumes after the jump. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN frees the current frame and makes its predecessor current. If
there is a predecessor, the value is put in rrc and control goes to
HEAP_RETURN; otherwise the frame just freed was the top-level one and the
value is returned from the function. Note that the current frame is
dereferenced unconditionally, so "frame" must not be NULL at the point of
use. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
352     typedef struct heapframe {
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink;
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb;
406    
407 ph10 164 /* Where to jump back to */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
/* CHECK_PARTIAL: when partial matching is enabled and eptr is at or beyond
the end of the subject and also beyond md->start_used_ptr, set md->hitend;
if md->partial is greater than 1, return PCRE_ERROR_PARTIAL at once. The
macro expands to a bare "if" block that uses md and eptr from the point of
expansion. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

/* SCHECK_PARTIAL: the same, but without the end-of-subject test, for use
where that is already known to be true. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 ph10 553 the alt that is at the start of the current branch. This makes it possible
780     to skip back past alternatives that precede the THEN within the current
781     branch. */
782 ph10 512
783 ph10 210 case OP_THEN:
784     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 ph10 604 eptrb, RM54);
786 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
788 ph10 510 MRRETURN(MATCH_THEN);
789    
790     case OP_THEN_ARG:
791 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 ph10 604 offset_top, md, eptrb, RM58);
793 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
795     md->mark = ecode + LINK_SIZE + 2;
796 ph10 212 RRETURN(MATCH_THEN);
797 ph10 211
798 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
799     unlimited repeat. If there is space in the offset vector, save the current
800     subject position in the working slot at the top of the vector. We mustn't
801     change the current values of the data slot, because they may be set from a
802     previous iteration of this group, and be referred to by a reference inside
803 ph10 625 the group. A failure to match might occur after the group has succeeded,
804 ph10 617 if something later on doesn't match. For this reason, we need to restore
805     the working value and also the values of the final offsets, in case they
806     were set by a previous iteration of the same bracket.
807 nigel 77
808 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
809     a non-capturing bracket. Don't worry about setting the flag for the error
810     case here; that is handled in the code for KET. */
811 nigel 77
812 nigel 93 case OP_CBRA:
813     case OP_SCBRA:
814     number = GET2(ecode, 1+LINK_SIZE);
815 nigel 77 offset = number << 1;
816 ph10 625
817 ph10 475 #ifdef PCRE_DEBUG
818 nigel 93 printf("start bracket %d\n", number);
819     printf("subject=");
820 nigel 77 pchars(eptr, 16, TRUE, md);
821     printf("\n");
822     #endif
823    
824     if (offset < md->offset_max)
825     {
826     save_offset1 = md->offset_vector[offset];
827     save_offset2 = md->offset_vector[offset+1];
828     save_offset3 = md->offset_vector[md->offset_end - number];
829     save_capture_last = md->capture_last;
830    
831     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 ph10 531 md->offset_vector[md->offset_end - number] =
833 ph10 530 (int)(eptr - md->start_subject);
834 nigel 77
835 ph10 604 for (;;)
836 nigel 77 {
837 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 ph10 604 eptrb, RM1);
840 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 ph10 550 if (rrc != MATCH_NOMATCH &&
842     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843     RRETURN(rrc);
844 nigel 77 md->capture_last = save_capture_last;
845     ecode += GET(ecode, 1);
846 ph10 625 if (*ecode != OP_ALT) break;
847 nigel 77 }
848    
849     DPRINTF(("bracket %d failed\n", number));
850     md->offset_vector[offset] = save_offset1;
851     md->offset_vector[offset+1] = save_offset2;
852     md->offset_vector[md->offset_end - number] = save_offset3;
853 ph10 625
854     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 ph10 618 MATCH_THEN. */
856 nigel 77
857 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 nigel 77 }
860    
861 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862     as a non-capturing bracket. */
863 nigel 77
864 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866    
867 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868 nigel 77
869 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
873 ph10 708 repeat. Loop for all the alternatives.
874    
875 ph10 702 When we get to the final alternative within the brackets, we used to return
876     the result of a recursive call to match() whatever happened so it was
877     possible to reduce stack usage by turning this into a tail recursion,
878     except in the case of a possibly empty group. However, now that there is
879     the possibility of (*THEN) occurring in the final alternative, this
880     optimization is no longer always possible.
881 ph10 625
882 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
883     this is the best that can be done.
884    
885 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
886     reached, but subsequent matching fails. It passes back up the tree (causing
887     captured values to be reset) until the original atomic group level is
888 ph10 618 reached. This is tested by comparing md->once_target with the start of the
889     group. At this point, the return is converted into MATCH_NOMATCH so that
890     previous backup points can be taken. */
891 nigel 77
892 ph10 618 case OP_ONCE:
893 nigel 93 case OP_BRA:
894     case OP_SBRA:
895     DPRINTF(("start non-capturing bracket\n"));
896 ph10 618
897 nigel 91 for (;;)
898 nigel 77 {
899 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
900 ph10 702
901     /* If this is not a possibly empty group, and there are no (*THEN)s in
902 ph10 708 the pattern, and this is the final alternative, optimize as described
903 ph10 702 above. */
904    
905     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
906     {
907     ecode += _pcre_OP_lengths[*ecode];
908     goto TAIL_RECURSE;
909 ph10 708 }
910 ph10 702
911     /* In all other cases, we have to make another call to match(). */
912    
913 ph10 708 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
914 ph10 604 RM2);
915 ph10 550 if (rrc != MATCH_NOMATCH &&
916     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
917 ph10 625 {
918 ph10 618 if (rrc == MATCH_ONCE)
919     {
920     const uschar *scode = ecode;
921     if (*scode != OP_ONCE) /* If not at start, find it */
922     {
923     while (*scode == OP_ALT) scode += GET(scode, 1);
924     scode -= GET(scode, 1);
925 ph10 625 }
926 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
927 ph10 625 }
928 ph10 550 RRETURN(rrc);
929 ph10 625 }
930 nigel 77 ecode += GET(ecode, 1);
931 ph10 625 if (*ecode != OP_ALT) break;
932 nigel 77 }
933 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
934     RRETURN(MATCH_NOMATCH);
935    
936 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
937 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
938     handled similarly to the normal case above. However, the matching is
939     different. The end of these brackets will always be OP_KETRPOS, which
940     returns MATCH_KETRPOS without going further in the pattern. By this means
941     we can handle the group by iteration rather than recursion, thereby
942     reducing the amount of stack needed. */
943 ph10 625
944 ph10 604 case OP_CBRAPOS:
945     case OP_SCBRAPOS:
946     allow_zero = FALSE;
947 ph10 625
948 ph10 604 POSSESSIVE_CAPTURE:
949     number = GET2(ecode, 1+LINK_SIZE);
950     offset = number << 1;
951    
952     #ifdef PCRE_DEBUG
953     printf("start possessive bracket %d\n", number);
954     printf("subject=");
955     pchars(eptr, 16, TRUE, md);
956     printf("\n");
957     #endif
958    
959     if (offset < md->offset_max)
960     {
961     matched_once = FALSE;
962 ph10 625 code_offset = ecode - md->start_code;
963 ph10 604
964     save_offset1 = md->offset_vector[offset];
965     save_offset2 = md->offset_vector[offset+1];
966     save_offset3 = md->offset_vector[md->offset_end - number];
967     save_capture_last = md->capture_last;
968    
969     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
970 ph10 625
971     /* Each time round the loop, save the current subject position for use
972     when the group matches. For MATCH_MATCH, the group has matched, so we
973     restart it with a new subject starting position, remembering that we had
974     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
975     usual. If we haven't matched any alternatives in any iteration, check to
976     see if a previous iteration matched. If so, the group has matched;
977     continue from afterwards. Otherwise it has failed; restore the previous
978 ph10 604 capture values before returning NOMATCH. */
979 ph10 625
980 ph10 604 for (;;)
981     {
982     md->offset_vector[md->offset_end - number] =
983     (int)(eptr - md->start_subject);
984 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
986     eptrb, RM63);
987     if (rrc == MATCH_KETRPOS)
988     {
989     offset_top = md->end_offset_top;
990     eptr = md->end_match_ptr;
991 ph10 625 ecode = md->start_code + code_offset;
992 ph10 604 save_capture_last = md->capture_last;
993 ph10 625 matched_once = TRUE;
994     continue;
995     }
996 ph10 604 if (rrc != MATCH_NOMATCH &&
997     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
998     RRETURN(rrc);
999     md->capture_last = save_capture_last;
1000     ecode += GET(ecode, 1);
1001 ph10 625 if (*ecode != OP_ALT) break;
1002 ph10 604 }
1003 ph10 610
1004 ph10 604 if (!matched_once)
1005 ph10 625 {
1006 ph10 604 md->offset_vector[offset] = save_offset1;
1007     md->offset_vector[offset+1] = save_offset2;
1008     md->offset_vector[md->offset_end - number] = save_offset3;
1009     }
1010 ph10 625
1011 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1012 ph10 604 if (allow_zero || matched_once)
1013 ph10 625 {
1014 ph10 604 ecode += 1 + LINK_SIZE;
1015     break;
1016 ph10 625 }
1017    
1018 ph10 604 RRETURN(MATCH_NOMATCH);
1019     }
1020 ph10 625
1021 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1022     as a non-capturing bracket. */
1023    
1024     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1025     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1026    
1027     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1028    
1029     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031    
1032 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1033 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1034     without the capturing complication. It is written out separately for speed
1035     and cleanliness. */
1036    
1037     case OP_BRAPOS:
1038     case OP_SBRAPOS:
1039 ph10 625 allow_zero = FALSE;
1040    
1041 ph10 604 POSSESSIVE_NON_CAPTURE:
1042     matched_once = FALSE;
1043 ph10 625 code_offset = ecode - md->start_code;
1044 ph10 604
1045     for (;;)
1046     {
1047 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1048 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1049 ph10 609 eptrb, RM48);
1050 ph10 604 if (rrc == MATCH_KETRPOS)
1051     {
1052 ph10 610 offset_top = md->end_offset_top;
1053 ph10 604 eptr = md->end_match_ptr;
1054 ph10 625 ecode = md->start_code + code_offset;
1055     matched_once = TRUE;
1056     continue;
1057     }
1058 ph10 604 if (rrc != MATCH_NOMATCH &&
1059     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1060     RRETURN(rrc);
1061     ecode += GET(ecode, 1);
1062 ph10 625 if (*ecode != OP_ALT) break;
1063 ph10 604 }
1064 ph10 625
1065     if (matched_once || allow_zero)
1066 ph10 604 {
1067     ecode += 1 + LINK_SIZE;
1068     break;
1069 ph10 625 }
1070 ph10 604 RRETURN(MATCH_NOMATCH);
1071    
1072     /* Control never reaches here. */
1073    
1074 nigel 77 /* Conditional group: compilation checked that there are no more than
1075     two branches. If the condition is false, skipping the first branch takes us
1076     past the end if there is only one branch, but that's OK because that is
1077 ph10 609 exactly what going to the ket would do. */
1078 nigel 77
1079     case OP_COND:
1080 nigel 93 case OP_SCOND:
1081 ph10 604 codelink = GET(ecode, 1);
1082 ph10 406
1083 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1084     inserted between OP_COND and an assertion condition. */
1085 ph10 392
1086 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1087     {
1088     if (pcre_callout != NULL)
1089     {
1090     pcre_callout_block cb;
1091 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1092 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1093     cb.offset_vector = md->offset_vector;
1094     cb.subject = (PCRE_SPTR)md->start_subject;
1095 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1096     cb.start_match = (int)(mstart - md->start_subject);
1097     cb.current_position = (int)(eptr - md->start_subject);
1098 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1099     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1100     cb.capture_top = offset_top/2;
1101     cb.capture_last = md->capture_last;
1102     cb.callout_data = md->callout_data;
1103 ph10 654 cb.mark = markptr;
1104 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1105 ph10 381 if (rrc < 0) RRETURN(rrc);
1106     }
1107     ecode += _pcre_OP_lengths[OP_CALLOUT];
1108     }
1109 ph10 392
1110 ph10 399 condcode = ecode[LINK_SIZE+1];
1111 ph10 406
1112 ph10 381 /* Now see what the actual condition is */
1113 ph10 392
1114 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1115 nigel 77 {
1116 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1117     {
1118 ph10 461 condition = FALSE;
1119     ecode += GET(ecode, 1);
1120     }
1121 ph10 459 else
1122 ph10 461 {
1123 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1124     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1125 ph10 461
1126 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1127     false, but the test was set up by name, scan the table to see if the
1128     name refers to any other numbers, and test them. The condition is true
1129     if any one is set. */
1130 ph10 461
1131 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1132     {
1133     uschar *slotA = md->name_table;
1134     for (i = 0; i < md->name_count; i++)
1135 ph10 461 {
1136     if (GET2(slotA, 0) == recno) break;
1137 ph10 459 slotA += md->name_entry_size;
1138     }
1139 ph10 461
1140 ph10 459 /* Found a name for the number - there can be only one; duplicate
1141     names for different numbers are allowed, but not vice versa. First
1142     scan down for duplicates. */
1143 ph10 461
1144 ph10 459 if (i < md->name_count)
1145 ph10 461 {
1146 ph10 459 uschar *slotB = slotA;
1147     while (slotB > md->name_table)
1148     {
1149     slotB -= md->name_entry_size;
1150     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1151     {
1152     condition = GET2(slotB, 0) == md->recursive->group_num;
1153 ph10 461 if (condition) break;
1154     }
1155 ph10 459 else break;
1156 ph10 461 }
1157    
1158 ph10 459 /* Scan up for duplicates */
1159 ph10 461
1160 ph10 459 if (!condition)
1161 ph10 461 {
1162 ph10 459 slotB = slotA;
1163     for (i++; i < md->name_count; i++)
1164     {
1165     slotB += md->name_entry_size;
1166     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1167     {
1168     condition = GET2(slotB, 0) == md->recursive->group_num;
1169     if (condition) break;
1170 ph10 461 }
1171 ph10 459 else break;
1172 ph10 461 }
1173     }
1174 ph10 459 }
1175 ph10 461 }
1176    
1177 ph10 459 /* Choose branch according to the condition */
1178 ph10 461
1179 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1180     }
1181 ph10 461 }
1182 nigel 93
1183 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1184 nigel 93 {
1185 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1186 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1187 ph10 461
1188 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1189 ph10 461 scan the table to see if the name refers to any other numbers, and test
1190     them. The condition is true if any one is set. This is tediously similar
1191     to the code above, but not close enough to try to amalgamate. */
1192    
1193 ph10 459 if (!condition && condcode == OP_NCREF)
1194     {
1195 ph10 461 int refno = offset >> 1;
1196 ph10 459 uschar *slotA = md->name_table;
1197 ph10 461
1198 ph10 459 for (i = 0; i < md->name_count; i++)
1199 ph10 461 {
1200     if (GET2(slotA, 0) == refno) break;
1201 ph10 459 slotA += md->name_entry_size;
1202     }
1203 ph10 461
1204     /* Found a name for the number - there can be only one; duplicate names
1205     for different numbers are allowed, but not vice versa. First scan down
1206 ph10 459 for duplicates. */
1207 ph10 461
1208 ph10 459 if (i < md->name_count)
1209 ph10 461 {
1210 ph10 459 uschar *slotB = slotA;
1211     while (slotB > md->name_table)
1212     {
1213     slotB -= md->name_entry_size;
1214     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215     {
1216     offset = GET2(slotB, 0) << 1;
1217 ph10 461 condition = offset < offset_top &&
1218 ph10 459 md->offset_vector[offset] >= 0;
1219 ph10 461 if (condition) break;
1220     }
1221 ph10 459 else break;
1222 ph10 461 }
1223    
1224 ph10 459 /* Scan up for duplicates */
1225 ph10 461
1226 ph10 459 if (!condition)
1227 ph10 461 {
1228 ph10 459 slotB = slotA;
1229     for (i++; i < md->name_count; i++)
1230     {
1231     slotB += md->name_entry_size;
1232     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1233     {
1234     offset = GET2(slotB, 0) << 1;
1235 ph10 461 condition = offset < offset_top &&
1236 ph10 459 md->offset_vector[offset] >= 0;
1237 ph10 461 if (condition) break;
1238     }
1239 ph10 459 else break;
1240 ph10 461 }
1241     }
1242 ph10 459 }
1243 ph10 461 }
1244    
1245 ph10 459 /* Choose branch according to the condition */
1246    
1247 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1248 nigel 77 }
1249    
1250 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1251 nigel 93 {
1252     condition = FALSE;
1253     ecode += GET(ecode, 1);
1254     }
1255    
1256 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1257 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1258     an assertion. */
1259 nigel 77
1260     else
1261     {
1262 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1263 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1264 nigel 77 if (rrc == MATCH_MATCH)
1265     {
1266 ph10 619 if (md->end_offset_top > offset_top)
1267     offset_top = md->end_offset_top; /* Captures may have happened */
1268 nigel 93 condition = TRUE;
1269     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1270 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1271     }
1272 ph10 550 else if (rrc != MATCH_NOMATCH &&
1273     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1274 nigel 77 {
1275     RRETURN(rrc); /* Need braces because of following else */
1276     }
1277 nigel 93 else
1278     {
1279     condition = FALSE;
1280 ph10 399 ecode += codelink;
1281 nigel 93 }
1282     }
1283 nigel 91
1284 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1285 ph10 702 we used always to use tail recursion to avoid using another stack frame,
1286     except when there was unlimited repeat of a possibly empty group. However,
1287     that strategy no longer works because of the possibility of (*THEN) being
1288     encountered in the branch. However, we can still use tail recursion if
1289     there are no (*THEN)s in the pattern. Otherwise, a recursive call to
1290     match() is always required, unless the second alternative doesn't exist, in
1291     which case we can just plough on. */
1292 nigel 91
1293 nigel 93 if (condition || *ecode == OP_ALT)
1294     {
1295 ph10 625 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1296 ph10 702 else if (!md->hasthen)
1297     {
1298     ecode += 1 + LINK_SIZE;
1299     goto TAIL_RECURSE;
1300 ph10 708 }
1301 ph10 702
1302     /* A call to match() is required. */
1303 ph10 708
1304 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1305 ph10 708
1306 ph10 699 /* If the result is THEN from within the "true" branch of the condition,
1307 ph10 708 md->start_match_ptr will point to the original OP_COND, not to the start
1308     of the branch, so we have to do work to see if it matches. If THEN comes
1309 ph10 699 from the "false" branch, md->start_match_ptr does point to OP_ALT. */
1310    
1311     if (rrc == MATCH_THEN)
1312     {
1313     if (*ecode != OP_ALT)
1314 ph10 708 {
1315 ph10 699 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1316     ecode -= GET(ecode, 1);
1317 ph10 708 }
1318     if (md->start_match_ptr == ecode) rrc = MATCH_NOMATCH;
1319     }
1320 ph10 609 RRETURN(rrc);
1321 nigel 77 }
1322 ph10 708
1323 ph10 702 /* Condition false & no alternative; continue after the group. */
1324 ph10 708
1325 ph10 702 else
1326 nigel 93 {
1327     ecode += 1 + LINK_SIZE;
1328     }
1329     break;
1330 nigel 77
1331 ph10 461
1332 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1333     to close any currently open capturing brackets. */
1334 ph10 461
1335 ph10 447 case OP_CLOSE:
1336 ph10 461 number = GET2(ecode, 1);
1337 ph10 447 offset = number << 1;
1338 ph10 461
1339 ph10 475 #ifdef PCRE_DEBUG
1340 ph10 447 printf("end bracket %d at *ACCEPT", number);
1341     printf("\n");
1342     #endif
1343 nigel 77
1344 ph10 447 md->capture_last = number;
1345     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1346     {
1347     md->offset_vector[offset] =
1348     md->offset_vector[md->offset_end - number];
1349 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1350 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1351     }
1352     ecode += 3;
1353 ph10 461 break;
1354 ph10 447
1355    
1356 ph10 619 /* End of the pattern, either real or forced. */
1357 nigel 77
1358 ph10 619 case OP_END:
1359 ph10 210 case OP_ACCEPT:
1360 ph10 625 case OP_ASSERT_ACCEPT:
1361    
1362 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1363     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1364 ph10 613 is set and we have matched at the start of the subject. In both cases,
1365     backtracking will then try other alternatives, if any. */
1366 ph10 443
1367 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1368 ph10 618 md->recursive == NULL &&
1369 ph10 619 (md->notempty ||
1370     (md->notempty_atstart &&
1371     mstart == md->start_subject + md->start_offset)))
1372 ph10 510 MRRETURN(MATCH_NOMATCH);
1373 ph10 443
1374 ph10 442 /* Otherwise, we have a match. */
1375 ph10 625
1376 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1377     md->end_offset_top = offset_top; /* and how many extracts were taken */
1378 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1379 nigel 77
1380 ph10 512 /* For some reason, the macros don't work properly if an expression is
1381     given as the argument to MRRETURN when the heap is in use. */
1382    
1383     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1384     MRRETURN(rrc);
1385    
1386 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1387     matching won't pass the KET for an assertion. If any one branch matches,
1388     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1389     start of each branch to move the current point backwards, so the code at
1390 ph10 625 this level is identical to the lookahead case. When the assertion is part
1391     of a condition, we want to return immediately afterwards. The caller of
1392     this incarnation of the match() function will have set MATCH_CONDASSERT in
1393     md->match_function_type, and one of these opcodes will be the first opcode
1394     that is processed. We use a local variable that is preserved over calls to
1395 ph10 604 match() to remember this case. */
1396 nigel 77
1397     case OP_ASSERT:
1398     case OP_ASSERTBACK:
1399 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1400     {
1401     condassert = TRUE;
1402     md->match_function_type = 0;
1403     }
1404 ph10 625 else condassert = FALSE;
1405    
1406 nigel 77 do
1407     {
1408 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1409 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1410 ph10 500 {
1411     mstart = md->start_match_ptr; /* In case \K reset it */
1412 ph10 630 markptr = md->mark;
1413 ph10 500 break;
1414 ph10 501 }
1415 ph10 550 if (rrc != MATCH_NOMATCH &&
1416     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1417     RRETURN(rrc);
1418 nigel 77 ecode += GET(ecode, 1);
1419     }
1420     while (*ecode == OP_ALT);
1421 ph10 625
1422 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1423 nigel 77
1424     /* If checking an assertion for a condition, return MATCH_MATCH. */
1425    
1426 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1427 nigel 77
1428     /* Continue from after the assertion, updating the offsets high water
1429     mark, since extracts may have been taken during the assertion. */
1430    
1431     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1432     ecode += 1 + LINK_SIZE;
1433     offset_top = md->end_offset_top;
1434     continue;
1435    
1436 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1437 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1438 ph10 473 branches. */
1439 nigel 77
1440     case OP_ASSERT_NOT:
1441     case OP_ASSERTBACK_NOT:
1442 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1443     {
1444     condassert = TRUE;
1445     md->match_function_type = 0;
1446     }
1447 ph10 625 else condassert = FALSE;
1448 ph10 604
1449 nigel 77 do
1450     {
1451 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1452 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1453 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1454     {
1455     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1456 ph10 482 break;
1457     }
1458 ph10 550 if (rrc != MATCH_NOMATCH &&
1459     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1460     RRETURN(rrc);
1461 nigel 77 ecode += GET(ecode,1);
1462     }
1463     while (*ecode == OP_ALT);
1464    
1465 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1466 ph10 625
1467 nigel 77 ecode += 1 + LINK_SIZE;
1468     continue;
1469    
1470     /* Move the subject pointer back. This occurs only at the start of
1471     each branch of a lookbehind assertion. If we are too close to the start to
1472     move back, this match function fails. When working with UTF-8 we move
1473     back a number of characters, not bytes. */
1474    
1475     case OP_REVERSE:
1476     #ifdef SUPPORT_UTF8
1477     if (utf8)
1478     {
1479 nigel 93 i = GET(ecode, 1);
1480     while (i-- > 0)
1481 nigel 77 {
1482     eptr--;
1483 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1484 ph10 207 BACKCHAR(eptr);
1485 nigel 77 }
1486     }
1487     else
1488     #endif
1489    
1490     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1491    
1492     {
1493 nigel 93 eptr -= GET(ecode, 1);
1494 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1495 nigel 77 }
1496    
1497 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1498 nigel 77
1499 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1500 nigel 77 ecode += 1 + LINK_SIZE;
1501     break;
1502    
1503     /* The callout item calls an external function, if one is provided, passing
1504     details of the match so far. This is mainly for debugging, though the
1505     function is able to force a failure. */
1506    
1507     case OP_CALLOUT:
1508     if (pcre_callout != NULL)
1509     {
1510     pcre_callout_block cb;
1511 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1512 nigel 77 cb.callout_number = ecode[1];
1513     cb.offset_vector = md->offset_vector;
1514 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1515 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1516     cb.start_match = (int)(mstart - md->start_subject);
1517     cb.current_position = (int)(eptr - md->start_subject);
1518 nigel 77 cb.pattern_position = GET(ecode, 2);
1519     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1520     cb.capture_top = offset_top/2;
1521     cb.capture_last = md->capture_last;
1522     cb.callout_data = md->callout_data;
1523 ph10 654 cb.mark = markptr;
1524 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1525 nigel 77 if (rrc < 0) RRETURN(rrc);
1526     }
1527     ecode += 2 + 2*LINK_SIZE;
1528     break;
1529    
1530     /* Recursion either matches the current regex, or some subexpression. The
1531     offset data is the offset to the starting bracket from the start of the
1532     whole pattern. (This is so that it works from duplicated subpatterns.)
1533 ph10 625
1534 ph10 618 The state of the capturing groups is preserved over recursion, and
1535 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1536 ph10 618 finished (offset_top records the completed total) so we just have to save
1537     all the potential data. There may be up to 65535 such values, which is too
1538     large to put on the stack, but using malloc for small numbers seems
1539     expensive. As a compromise, the stack is used when there are no more than
1540     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1541 nigel 77
1542     There are also other values that have to be saved. We use a chained
1543     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1544 ph10 625 for the original version of this logic. It has, however, been hacked around
1545 ph10 618 a lot, so he is not to blame for the current way it works. */
1546 nigel 77
1547     case OP_RECURSE:
1548     {
1549 ph10 642 recursion_info *ri;
1550     int recno;
1551 ph10 654
1552 nigel 77 callpat = md->start_code + GET(ecode, 1);
1553 ph10 642 recno = (callpat == md->start_code)? 0 :
1554 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1555    
1556     /* Check for repeating a recursion without advancing the subject pointer.
1557 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1558 ph10 654 caught at compile time.) */
1559    
1560 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1561 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1562 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1563 nigel 77
1564     /* Add to "recursing stack" */
1565    
1566 ph10 642 new_recursive.group_num = recno;
1567     new_recursive.subject_position = eptr;
1568 nigel 77 new_recursive.prevrec = md->recursive;
1569     md->recursive = &new_recursive;
1570    
1571 ph10 618 /* Where to continue from afterwards */
1572 nigel 77
1573     ecode += 1 + LINK_SIZE;
1574    
1575 ph10 618 /* Now save the offset data */
1576 nigel 77
1577     new_recursive.saved_max = md->offset_end;
1578     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1579     new_recursive.offset_save = stacksave;
1580     else
1581     {
1582     new_recursive.offset_save =
1583     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1584     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1585     }
1586     memcpy(new_recursive.offset_save, md->offset_vector,
1587     new_recursive.saved_max * sizeof(int));
1588 ph10 625
1589 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1590 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1591 ph10 618 might be changed, so reset it before looping. */
1592 nigel 77
1593     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1594 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1595 nigel 77 do
1596     {
1597 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1598 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1599 ph10 604 md, eptrb, RM6);
1600 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1601     new_recursive.saved_max * sizeof(int));
1602 ph10 681 md->recursive = new_recursive.prevrec;
1603 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1604 nigel 77 {
1605 nigel 87 DPRINTF(("Recursion matched\n"));
1606 nigel 77 if (new_recursive.offset_save != stacksave)
1607     (pcre_free)(new_recursive.offset_save);
1608 ph10 618
1609     /* Set where we got to in the subject, and reset the start in case
1610 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1611     for Perl compatibility. */
1612    
1613 ph10 618 eptr = md->end_match_ptr;
1614     mstart = md->start_match_ptr;
1615     goto RECURSION_MATCHED; /* Exit loop; end processing */
1616 nigel 77 }
1617 ph10 550 else if (rrc != MATCH_NOMATCH &&
1618     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1619 nigel 87 {
1620     DPRINTF(("Recursion gave error %d\n", rrc));
1621 ph10 400 if (new_recursive.offset_save != stacksave)
1622     (pcre_free)(new_recursive.offset_save);
1623 nigel 87 RRETURN(rrc);
1624     }
1625 nigel 77
1626     md->recursive = &new_recursive;
1627     callpat += GET(callpat, 1);
1628     }
1629     while (*callpat == OP_ALT);
1630    
1631     DPRINTF(("Recursion didn't match\n"));
1632     md->recursive = new_recursive.prevrec;
1633     if (new_recursive.offset_save != stacksave)
1634     (pcre_free)(new_recursive.offset_save);
1635 ph10 510 MRRETURN(MATCH_NOMATCH);
1636 nigel 77 }
1637 ph10 625
1638 ph10 618 RECURSION_MATCHED:
1639     break;
1640 nigel 77
1641     /* An alternation is the end of a branch; scan along to find the end of the
1642     bracketed group and go to there. */
1643    
1644     case OP_ALT:
1645     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646     break;
1647    
1648 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1649     indicating that it may occur zero times. It may repeat infinitely, or not
1650     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1651     with fixed upper repeat limits are compiled as a number of copies, with the
1652     optional ones preceded by BRAZERO or BRAMINZERO. */
1653 ph10 625
1654 nigel 77 case OP_BRAZERO:
1655 ph10 604 next = ecode + 1;
1656     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1657     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1658     do next += GET(next, 1); while (*next == OP_ALT);
1659     ecode = next + 1 + LINK_SIZE;
1660 nigel 77 break;
1661 ph10 625
1662 nigel 77 case OP_BRAMINZERO:
1663 ph10 604 next = ecode + 1;
1664     do next += GET(next, 1); while (*next == OP_ALT);
1665     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1666     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1667     ecode++;
1668 nigel 77 break;
1669    
1670 ph10 335 case OP_SKIPZERO:
1671 ph10 604 next = ecode+1;
1672     do next += GET(next,1); while (*next == OP_ALT);
1673     ecode = next + 1 + LINK_SIZE;
1674 ph10 335 break;
1675 ph10 625
1676 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1677     here; just jump to the group, with allow_zero set TRUE. */
1678 ph10 625
1679 ph10 604 case OP_BRAPOSZERO:
1680 ph10 625 op = *(++ecode);
1681 ph10 604 allow_zero = TRUE;
1682     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1683     goto POSSESSIVE_NON_CAPTURE;
1684 ph10 335
1685 nigel 93 /* End of a group, repeated or non-repeating. */
1686 nigel 77
1687     case OP_KET:
1688     case OP_KETRMIN:
1689     case OP_KETRMAX:
1690 ph10 625 case OP_KETRPOS:
1691 nigel 91 prev = ecode - GET(ecode, 1);
1692 ph10 625
1693 nigel 93 /* If this was a group that remembered the subject start, in order to break
1694     infinite repeats of empty string matches, retrieve the subject start from
1695     the chain. Otherwise, set it NULL. */
1696 nigel 77
1697 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1698 nigel 93 {
1699     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1700     eptrb = eptrb->epb_prev; /* Backup to previous group */
1701     }
1702     else saved_eptr = NULL;
1703 nigel 77
1704 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1705     MATCH_MATCH, but record the current high water mark for use by positive
1706     assertions. We also need to record the match start in case it was changed
1707     by \K. */
1708 nigel 93
1709 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1710 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1711 nigel 91 {
1712     md->end_match_ptr = eptr; /* For ONCE */
1713     md->end_offset_top = offset_top;
1714 ph10 500 md->start_match_ptr = mstart;
1715 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1716 nigel 91 }
1717 nigel 77
1718 nigel 93 /* For capturing groups we have to check the group number back at the start
1719     and if necessary complete handling an extraction by setting the offsets and
1720 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1721     into group 0, so it won't be picked up here. Instead, we catch it when the
1722     OP_END is reached. Other recursion is handled here. We just have to record
1723     the current subject position and start match pointer and give a MATCH
1724     return. */
1725 nigel 77
1726 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1727     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1728 nigel 91 {
1729 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1730 nigel 91 offset = number << 1;
1731 ph10 461
1732 ph10 475 #ifdef PCRE_DEBUG
1733 nigel 91 printf("end bracket %d", number);
1734     printf("\n");
1735 nigel 77 #endif
1736    
1737 ph10 618 /* Handle a recursively called group. */
1738    
1739     if (md->recursive != NULL && md->recursive->group_num == number)
1740     {
1741     md->end_match_ptr = eptr;
1742     md->start_match_ptr = mstart;
1743     RRETURN(MATCH_MATCH);
1744     }
1745    
1746     /* Deal with capturing */
1747    
1748 nigel 93 md->capture_last = number;
1749     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1750 nigel 91 {
1751 ph10 625 /* If offset is greater than offset_top, it means that we are
1752     "skipping" a capturing group, and that group's offsets must be marked
1753     unset. In earlier versions of PCRE, all the offsets were unset at the
1754     start of matching, but this doesn't work because atomic groups and
1755 ph10 615 assertions can cause a value to be set that should later be unset.
1756     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1757 ph10 625 part of the atomic group, but this is not on the final matching path,
1758     so must be unset when 2 is set. (If there is no group 2, there is no
1759 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1760 ph10 625
1761 ph10 615 if (offset > offset_top)
1762     {
1763     register int *iptr = md->offset_vector + offset_top;
1764     register int *iend = md->offset_vector + offset;
1765     while (iptr < iend) *iptr++ = -1;
1766 ph10 625 }
1767    
1768 ph10 615 /* Now make the extraction */
1769    
1770 nigel 93 md->offset_vector[offset] =
1771     md->offset_vector[md->offset_end - number];
1772 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1773 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1774     }
1775 nigel 91 }
1776 nigel 77
1777 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1778     also happens for a repeating ket if no characters were matched in the
1779     group. This is the forcible breaking of infinite loops as implemented in
1780 ph10 625 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1781     processing the rest of the pattern at a lower level. If this results in a
1782     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1783     bypassing intermediate backup points, but resetting any captures that
1784 ph10 618 happened along the way. */
1785 nigel 77
1786 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1787     {
1788 ph10 618 if (*prev == OP_ONCE)
1789     {
1790     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1791     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1793 ph10 625 RRETURN(MATCH_ONCE);
1794     }
1795 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1796 nigel 91 break;
1797     }
1798 ph10 625
1799     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1800 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1801     at a time from the outer level, thus saving stack. */
1802 ph10 625
1803 ph10 604 if (*ecode == OP_KETRPOS)
1804 ph10 625 {
1805 ph10 604 md->end_match_ptr = eptr;
1806 ph10 625 md->end_offset_top = offset_top;
1807 ph10 604 RRETURN(MATCH_KETRPOS);
1808 ph10 625 }
1809 nigel 77
1810 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1811     the preceding bracket, in the appropriate order. In the second case, we can
1812     use tail recursion to avoid using another stack frame, unless we have an
1813 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1814     string. */
1815 nigel 77
1816 nigel 91 if (*ecode == OP_KETRMIN)
1817     {
1818 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1819 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1820 ph10 618 if (*prev == OP_ONCE)
1821     {
1822 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1823 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1824     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1825 ph10 625 RRETURN(MATCH_ONCE);
1826     }
1827 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1828 ph10 197 {
1829 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1830 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1831 ph10 197 RRETURN(rrc);
1832     }
1833 nigel 91 ecode = prev;
1834     goto TAIL_RECURSE;
1835 nigel 77 }
1836 nigel 91 else /* OP_KETRMAX */
1837     {
1838 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1839 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1840 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1841 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1842 ph10 618 if (*prev == OP_ONCE)
1843     {
1844 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1845 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1846     md->once_target = prev;
1847 ph10 625 RRETURN(MATCH_ONCE);
1848     }
1849 nigel 91 ecode += 1 + LINK_SIZE;
1850     goto TAIL_RECURSE;
1851     }
1852     /* Control never gets here */
1853 nigel 77
1854 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1855 nigel 77
1856     case OP_CIRC:
1857 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1858 ph10 625
1859 nigel 77 /* Start of subject assertion */
1860    
1861     case OP_SOD:
1862 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1863 nigel 77 ecode++;
1864     break;
1865 ph10 625
1866 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1867 nigel 77
1868 ph10 602 case OP_CIRCM:
1869     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1870     if (eptr != md->start_subject &&
1871     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1872     MRRETURN(MATCH_NOMATCH);
1873     ecode++;
1874     break;
1875    
1876 nigel 77 /* Start of match assertion */
1877    
1878     case OP_SOM:
1879 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1880 nigel 77 ecode++;
1881     break;
1882 ph10 172
1883 ph10 168 /* Reset the start of match point */
1884 ph10 172
1885 ph10 168 case OP_SET_SOM:
1886     mstart = eptr;
1887 ph10 172 ecode++;
1888     break;
1889 nigel 77
1890 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1891     unless noteol is set. */
1892 nigel 77
1893 ph10 602 case OP_DOLLM:
1894     if (eptr < md->end_subject)
1895     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1896     else
1897 nigel 77 {
1898 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1899 ph10 602 SCHECK_PARTIAL();
1900 nigel 77 }
1901 ph10 602 ecode++;
1902     break;
1903 ph10 579
1904 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
1905 ph10 602 subject unless noteol is set. */
1906    
1907     case OP_DOLL:
1908     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1909     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1910    
1911 nigel 91 /* ... else fall through for endonly */
1912 nigel 77
1913     /* End of subject assertion (\z) */
1914    
1915     case OP_EOD:
1916 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1917 ph10 553 SCHECK_PARTIAL();
1918 nigel 77 ecode++;
1919     break;
1920    
1921     /* End of subject or ending \n assertion (\Z) */
1922    
1923     case OP_EODN:
1924 ph10 553 ASSERT_NL_OR_EOS:
1925     if (eptr < md->end_subject &&
1926 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1927 ph10 510 MRRETURN(MATCH_NOMATCH);
1928 ph10 579
1929 ph10 553 /* Either at end of string or \n before end. */
1930 ph10 579
1931 ph10 553 SCHECK_PARTIAL();
1932 nigel 77 ecode++;
1933     break;
1934    
1935     /* Word boundary assertions */
1936    
1937     case OP_NOT_WORD_BOUNDARY:
1938     case OP_WORD_BOUNDARY:
1939     {
1940    
1941     /* Find out if the previous and current characters are "word" characters.
1942     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1943 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1944 ph10 435 partial matching. */
1945 nigel 77
1946     #ifdef SUPPORT_UTF8
1947     if (utf8)
1948     {
1949 ph10 518 /* Get status of previous character */
1950 ph10 527
1951 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1952     {
1953 ph10 409 USPTR lastptr = eptr - 1;
1954 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1955 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1956 nigel 77 GETCHAR(c, lastptr);
1957 ph10 527 #ifdef SUPPORT_UCP
1958 ph10 518 if (md->use_ucp)
1959     {
1960     if (c == '_') prev_is_word = TRUE; else
1961 ph10 527 {
1962 ph10 518 int cat = UCD_CATEGORY(c);
1963     prev_is_word = (cat == ucp_L || cat == ucp_N);
1964 ph10 527 }
1965     }
1966     else
1967     #endif
1968 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1969     }
1970 ph10 527
1971 ph10 518 /* Get status of next character */
1972 ph10 527
1973 ph10 443 if (eptr >= md->end_subject)
1974 nigel 77 {
1975 ph10 443 SCHECK_PARTIAL();
1976     cur_is_word = FALSE;
1977 ph10 428 }
1978     else
1979     {
1980 nigel 77 GETCHAR(c, eptr);
1981 ph10 527 #ifdef SUPPORT_UCP
1982 ph10 518 if (md->use_ucp)
1983     {
1984     if (c == '_') cur_is_word = TRUE; else
1985 ph10 527 {
1986 ph10 518 int cat = UCD_CATEGORY(c);
1987     cur_is_word = (cat == ucp_L || cat == ucp_N);
1988 ph10 527 }
1989     }
1990     else
1991     #endif
1992 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1993     }
1994     }
1995     else
1996     #endif
1997    
1998 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1999 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2000 nigel 77
2001     {
2002 ph10 518 /* Get status of previous character */
2003 ph10 527
2004 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2005     {
2006 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2007 ph10 527 #ifdef SUPPORT_UCP
2008 ph10 518 if (md->use_ucp)
2009     {
2010 ph10 527 c = eptr[-1];
2011 ph10 518 if (c == '_') prev_is_word = TRUE; else
2012 ph10 527 {
2013 ph10 518 int cat = UCD_CATEGORY(c);
2014     prev_is_word = (cat == ucp_L || cat == ucp_N);
2015 ph10 527 }
2016     }
2017     else
2018     #endif
2019 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2020     }
2021 ph10 527
2022 ph10 518 /* Get status of next character */
2023 ph10 527
2024 ph10 443 if (eptr >= md->end_subject)
2025 ph10 428 {
2026 ph10 443 SCHECK_PARTIAL();
2027     cur_is_word = FALSE;
2028 ph10 428 }
2029 ph10 527 else
2030     #ifdef SUPPORT_UCP
2031 ph10 518 if (md->use_ucp)
2032     {
2033 ph10 527 c = *eptr;
2034 ph10 518 if (c == '_') cur_is_word = TRUE; else
2035 ph10 527 {
2036 ph10 518 int cat = UCD_CATEGORY(c);
2037     cur_is_word = (cat == ucp_L || cat == ucp_N);
2038 ph10 527 }
2039     }
2040     else
2041     #endif
2042 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2043 nigel 77 }
2044    
2045     /* Now see if the situation is what we want */
2046    
2047     if ((*ecode++ == OP_WORD_BOUNDARY)?
2048     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2049 ph10 510 MRRETURN(MATCH_NOMATCH);
2050 nigel 77 }
2051     break;
2052    
2053     /* Match a single character type; inline for speed */
2054    
2055     case OP_ANY:
2056 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2057 ph10 345 /* Fall through */
2058    
2059 ph10 341 case OP_ALLANY:
2060 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2061     { /* not be updated before SCHECK_PARTIAL. */
2062 ph10 443 SCHECK_PARTIAL();
2063 ph10 510 MRRETURN(MATCH_NOMATCH);
2064 ph10 443 }
2065 ph10 648 eptr++;
2066 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2067 nigel 77 ecode++;
2068     break;
2069    
2070     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2071     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2072    
2073     case OP_ANYBYTE:
2074 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2075     { /* not be updated before SCHECK_PARTIAL. */
2076 ph10 443 SCHECK_PARTIAL();
2077 ph10 510 MRRETURN(MATCH_NOMATCH);
2078 ph10 443 }
2079 ph10 654 eptr++;
2080 nigel 77 ecode++;
2081     break;
2082    
2083     case OP_NOT_DIGIT:
2084 ph10 443 if (eptr >= md->end_subject)
2085 ph10 428 {
2086 ph10 443 SCHECK_PARTIAL();
2087 ph10 510 MRRETURN(MATCH_NOMATCH);
2088 ph10 443 }
2089 nigel 77 GETCHARINCTEST(c, eptr);
2090     if (
2091     #ifdef SUPPORT_UTF8
2092     c < 256 &&
2093     #endif
2094     (md->ctypes[c] & ctype_digit) != 0
2095     )
2096 ph10 510 MRRETURN(MATCH_NOMATCH);
2097 nigel 77 ecode++;
2098     break;
2099    
2100     case OP_DIGIT:
2101 ph10 443 if (eptr >= md->end_subject)
2102 ph10 428 {
2103 ph10 443 SCHECK_PARTIAL();
2104 ph10 510 MRRETURN(MATCH_NOMATCH);
2105 ph10 443 }
2106 nigel 77 GETCHARINCTEST(c, eptr);
2107     if (
2108     #ifdef SUPPORT_UTF8
2109     c >= 256 ||
2110     #endif
2111     (md->ctypes[c] & ctype_digit) == 0
2112     )
2113 ph10 510 MRRETURN(MATCH_NOMATCH);
2114 nigel 77 ecode++;
2115     break;
2116    
2117     case OP_NOT_WHITESPACE:
2118 ph10 443 if (eptr >= md->end_subject)
2119 ph10 428 {
2120 ph10 443 SCHECK_PARTIAL();
2121 ph10 510 MRRETURN(MATCH_NOMATCH);
2122 ph10 443 }
2123 nigel 77 GETCHARINCTEST(c, eptr);
2124     if (
2125     #ifdef SUPPORT_UTF8
2126     c < 256 &&
2127     #endif
2128     (md->ctypes[c] & ctype_space) != 0
2129     )
2130 ph10 510 MRRETURN(MATCH_NOMATCH);
2131 nigel 77 ecode++;
2132     break;
2133    
2134     case OP_WHITESPACE:
2135 ph10 443 if (eptr >= md->end_subject)
2136 ph10 428 {
2137 ph10 443 SCHECK_PARTIAL();
2138 ph10 510 MRRETURN(MATCH_NOMATCH);
2139 ph10 443 }
2140 nigel 77 GETCHARINCTEST(c, eptr);
2141     if (
2142     #ifdef SUPPORT_UTF8
2143     c >= 256 ||
2144     #endif
2145     (md->ctypes[c] & ctype_space) == 0
2146     )
2147 ph10 510 MRRETURN(MATCH_NOMATCH);
2148 nigel 77 ecode++;
2149     break;
2150    
2151     case OP_NOT_WORDCHAR:
2152 ph10 443 if (eptr >= md->end_subject)
2153 ph10 428 {
2154 ph10 443 SCHECK_PARTIAL();
2155 ph10 510 MRRETURN(MATCH_NOMATCH);
2156 ph10 443 }
2157 nigel 77 GETCHARINCTEST(c, eptr);
2158     if (
2159     #ifdef SUPPORT_UTF8
2160     c < 256 &&
2161     #endif
2162     (md->ctypes[c] & ctype_word) != 0
2163     )
2164 ph10 510 MRRETURN(MATCH_NOMATCH);
2165 nigel 77 ecode++;
2166     break;
2167    
2168     case OP_WORDCHAR:
2169 ph10 443 if (eptr >= md->end_subject)
2170 ph10 428 {
2171 ph10 443 SCHECK_PARTIAL();
2172 ph10 510 MRRETURN(MATCH_NOMATCH);
2173 ph10 443 }
2174 nigel 77 GETCHARINCTEST(c, eptr);
2175     if (
2176     #ifdef SUPPORT_UTF8
2177     c >= 256 ||
2178     #endif
2179     (md->ctypes[c] & ctype_word) == 0
2180     )
2181 ph10 510 MRRETURN(MATCH_NOMATCH);
2182 nigel 77 ecode++;
2183     break;
2184    
2185 nigel 93 case OP_ANYNL:
2186 ph10 443 if (eptr >= md->end_subject)
2187 ph10 428 {
2188 ph10 443 SCHECK_PARTIAL();
2189 ph10 510 MRRETURN(MATCH_NOMATCH);
2190 ph10 443 }
2191 nigel 93 GETCHARINCTEST(c, eptr);
2192     switch(c)
2193     {
2194 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2195 ph10 625
2196 nigel 93 case 0x000d:
2197     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2198     break;
2199 ph10 231
2200 nigel 93 case 0x000a:
2201 ph10 231 break;
2202    
2203 nigel 93 case 0x000b:
2204     case 0x000c:
2205     case 0x0085:
2206     case 0x2028:
2207     case 0x2029:
2208 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2209 nigel 93 break;
2210     }
2211     ecode++;
2212     break;
2213    
2214 ph10 178 case OP_NOT_HSPACE:
2215 ph10 443 if (eptr >= md->end_subject)
2216 ph10 428 {
2217 ph10 443 SCHECK_PARTIAL();
2218 ph10 510 MRRETURN(MATCH_NOMATCH);
2219 ph10 443 }
2220 ph10 178 GETCHARINCTEST(c, eptr);
2221     switch(c)
2222     {
2223     default: break;
2224     case 0x09: /* HT */
2225     case 0x20: /* SPACE */
2226     case 0xa0: /* NBSP */
2227     case 0x1680: /* OGHAM SPACE MARK */
2228     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2229     case 0x2000: /* EN QUAD */
2230     case 0x2001: /* EM QUAD */
2231     case 0x2002: /* EN SPACE */
2232     case 0x2003: /* EM SPACE */
2233     case 0x2004: /* THREE-PER-EM SPACE */
2234     case 0x2005: /* FOUR-PER-EM SPACE */
2235     case 0x2006: /* SIX-PER-EM SPACE */
2236     case 0x2007: /* FIGURE SPACE */
2237     case 0x2008: /* PUNCTUATION SPACE */
2238     case 0x2009: /* THIN SPACE */
2239     case 0x200A: /* HAIR SPACE */
2240     case 0x202f: /* NARROW NO-BREAK SPACE */
2241     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2242     case 0x3000: /* IDEOGRAPHIC SPACE */
2243 ph10 510 MRRETURN(MATCH_NOMATCH);
2244 ph10 178 }
2245     ecode++;
2246     break;
2247    
2248     case OP_HSPACE:
2249 ph10 443 if (eptr >= md->end_subject)
2250 ph10 428 {
2251 ph10 443 SCHECK_PARTIAL();
2252 ph10 510 MRRETURN(MATCH_NOMATCH);
2253 ph10 443 }
2254 ph10 178 GETCHARINCTEST(c, eptr);
2255     switch(c)
2256     {
2257 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2258 ph10 178 case 0x09: /* HT */
2259     case 0x20: /* SPACE */
2260     case 0xa0: /* NBSP */
2261     case 0x1680: /* OGHAM SPACE MARK */
2262     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2263     case 0x2000: /* EN QUAD */
2264     case 0x2001: /* EM QUAD */
2265     case 0x2002: /* EN SPACE */
2266     case 0x2003: /* EM SPACE */
2267     case 0x2004: /* THREE-PER-EM SPACE */
2268     case 0x2005: /* FOUR-PER-EM SPACE */
2269     case 0x2006: /* SIX-PER-EM SPACE */
2270     case 0x2007: /* FIGURE SPACE */
2271     case 0x2008: /* PUNCTUATION SPACE */
2272     case 0x2009: /* THIN SPACE */
2273     case 0x200A: /* HAIR SPACE */
2274     case 0x202f: /* NARROW NO-BREAK SPACE */
2275     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2276     case 0x3000: /* IDEOGRAPHIC SPACE */
2277     break;
2278     }
2279     ecode++;
2280     break;
2281    
2282     case OP_NOT_VSPACE:
2283 ph10 443 if (eptr >= md->end_subject)
2284 ph10 428 {
2285 ph10 443 SCHECK_PARTIAL();
2286 ph10 510 MRRETURN(MATCH_NOMATCH);
2287 ph10 443 }
2288 ph10 178 GETCHARINCTEST(c, eptr);
2289     switch(c)
2290     {
2291     default: break;
2292     case 0x0a: /* LF */
2293     case 0x0b: /* VT */
2294     case 0x0c: /* FF */
2295     case 0x0d: /* CR */
2296     case 0x85: /* NEL */
2297     case 0x2028: /* LINE SEPARATOR */
2298     case 0x2029: /* PARAGRAPH SEPARATOR */
2299 ph10 510 MRRETURN(MATCH_NOMATCH);
2300 ph10 178 }
2301     ecode++;
2302     break;
2303    
2304     case OP_VSPACE:
2305 ph10 443 if (eptr >= md->end_subject)
2306 ph10 428 {
2307 ph10 443 SCHECK_PARTIAL();
2308 ph10 510 MRRETURN(MATCH_NOMATCH);
2309 ph10 443 }
2310 ph10 178 GETCHARINCTEST(c, eptr);
2311     switch(c)
2312     {
2313 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2314 ph10 178 case 0x0a: /* LF */
2315     case 0x0b: /* VT */
2316     case 0x0c: /* FF */
2317     case 0x0d: /* CR */
2318     case 0x85: /* NEL */
2319     case 0x2028: /* LINE SEPARATOR */
2320     case 0x2029: /* PARAGRAPH SEPARATOR */
2321     break;
2322     }
2323     ecode++;
2324     break;
2325    
2326 nigel 77 #ifdef SUPPORT_UCP
2327     /* Check the next character by Unicode property. We will get here only
2328     if the support is in the binary; otherwise a compile-time error occurs. */
2329    
2330     case OP_PROP:
2331     case OP_NOTPROP:
2332 ph10 443 if (eptr >= md->end_subject)
2333 ph10 428 {
2334 ph10 443 SCHECK_PARTIAL();
2335 ph10 510 MRRETURN(MATCH_NOMATCH);
2336 ph10 443 }
2337 nigel 77 GETCHARINCTEST(c, eptr);
2338     {
2339 ph10 384 const ucd_record *prop = GET_UCD(c);
2340 nigel 77
2341 nigel 87 switch(ecode[1])
2342     {
2343     case PT_ANY:
2344 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2345 nigel 87 break;
2346 nigel 77
2347 nigel 87 case PT_LAMP:
2348 ph10 349 if ((prop->chartype == ucp_Lu ||
2349     prop->chartype == ucp_Ll ||
2350     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2351 ph10 510 MRRETURN(MATCH_NOMATCH);
2352 ph10 517 break;
2353 nigel 87
2354     case PT_GC:
2355 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2356 ph10 510 MRRETURN(MATCH_NOMATCH);
2357 nigel 87 break;
2358    
2359     case PT_PC:
2360 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2361 ph10 510 MRRETURN(MATCH_NOMATCH);
2362 nigel 87 break;
2363    
2364     case PT_SC:
2365 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2366 ph10 510 MRRETURN(MATCH_NOMATCH);
2367 nigel 87 break;
2368 ph10 527
2369 ph10 517 /* These are specials */
2370 ph10 527
2371 ph10 517 case PT_ALNUM:
2372     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2373     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2374     MRRETURN(MATCH_NOMATCH);
2375 ph10 527 break;
2376    
2377 ph10 517 case PT_SPACE: /* Perl space */
2378     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2379     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2380     == (op == OP_NOTPROP))
2381     MRRETURN(MATCH_NOMATCH);
2382 ph10 527 break;
2383    
2384 ph10 517 case PT_PXSPACE: /* POSIX space */
2385     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2386 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2387 ph10 517 c == CHAR_FF || c == CHAR_CR)
2388     == (op == OP_NOTPROP))
2389     MRRETURN(MATCH_NOMATCH);
2390 ph10 527 break;
2391 nigel 87
2392 ph10 527 case PT_WORD:
2393 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2394 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2395 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2396     MRRETURN(MATCH_NOMATCH);
2397 ph10 527 break;
2398    
2399 ph10 517 /* This should never occur */
2400    
2401 nigel 87 default:
2402     RRETURN(PCRE_ERROR_INTERNAL);
2403 nigel 77 }
2404 nigel 87
2405     ecode += 3;
2406 nigel 77 }
2407     break;
2408    
2409     /* Match an extended Unicode sequence. We will get here only if the support
2410     is in the binary; otherwise a compile-time error occurs. */
2411    
2412     case OP_EXTUNI:
2413 ph10 443 if (eptr >= md->end_subject)
2414 ph10 428 {
2415 ph10 443 SCHECK_PARTIAL();
2416 ph10 510 MRRETURN(MATCH_NOMATCH);
2417 ph10 443 }
2418 nigel 77 GETCHARINCTEST(c, eptr);
2419 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2420     while (eptr < md->end_subject)
2421 nigel 77 {
2422 ph10 623 int len = 1;
2423     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2424     if (UCD_CATEGORY(c) != ucp_M) break;
2425     eptr += len;
2426 nigel 77 }
2427     ecode++;
2428     break;
2429     #endif
2430    
2431    
2432     /* Match a back reference, possibly repeatedly. Look past the end of the
2433     item to see if there is repeat information following. The code is similar
2434     to that for character classes, but repeated for efficiency. Then obey
2435     similar code to character type repeats - written out again for speed.
2436     However, if the referenced string is the empty string, always treat
2437     it as matched, any number of times (otherwise there could be infinite
2438     loops). */
2439    
2440     case OP_REF:
2441 ph10 625 case OP_REFI:
2442     caseless = op == OP_REFI;
2443 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2444     ecode += 3;
2445 ph10 345
2446 ph10 595 /* If the reference is unset, there are two possibilities:
2447 ph10 345
2448 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2449     this ensures that every attempt at a match fails. We can't just fail
2450     here, because of the possibility of quantifiers with zero minima.
2451 ph10 345
2452 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2453     so that the back reference matches an empty string.
2454 ph10 345
2455 ph10 595 Otherwise, set the length to the length of what was matched by the
2456     referenced subpattern. */
2457 ph10 345
2458 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2459     length = (md->jscript_compat)? 0 : -1;
2460     else
2461     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2462 nigel 77
2463 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2464 nigel 77
2465 ph10 595 switch (*ecode)
2466     {
2467     case OP_CRSTAR:
2468     case OP_CRMINSTAR:
2469     case OP_CRPLUS:
2470     case OP_CRMINPLUS:
2471     case OP_CRQUERY:
2472     case OP_CRMINQUERY:
2473     c = *ecode++ - OP_CRSTAR;
2474     minimize = (c & 1) != 0;
2475     min = rep_min[c]; /* Pick up values from tables; */
2476     max = rep_max[c]; /* zero for max => infinity */
2477     if (max == 0) max = INT_MAX;
2478     break;
2479 nigel 77
2480 ph10 595 case OP_CRRANGE:
2481     case OP_CRMINRANGE:
2482     minimize = (*ecode == OP_CRMINRANGE);
2483     min = GET2(ecode, 1);
2484     max = GET2(ecode, 3);
2485     if (max == 0) max = INT_MAX;
2486     ecode += 5;
2487     break;
2488 nigel 77
2489 ph10 595 default: /* No repeat follows */
2490 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2491 ph10 595 {
2492     CHECK_PARTIAL();
2493     MRRETURN(MATCH_NOMATCH);
2494 nigel 77 }
2495 ph10 595 eptr += length;
2496     continue; /* With the main loop */
2497     }
2498 nigel 77
2499 ph10 595 /* Handle repeated back references. If the length of the reference is
2500     zero, just continue with the main loop. */
2501 ph10 443
2502 ph10 595 if (length == 0) continue;
2503 nigel 77
2504 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2505     the length of the reference string explicitly rather than passing the
2506     address of eptr, so that eptr can be a register variable. */
2507 nigel 77
2508 ph10 595 for (i = 1; i <= min; i++)
2509     {
2510 ph10 625 int slength;
2511 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2512 nigel 77 {
2513 ph10 595 CHECK_PARTIAL();
2514     MRRETURN(MATCH_NOMATCH);
2515 nigel 77 }
2516 ph10 595 eptr += slength;
2517     }
2518 nigel 77
2519 ph10 595 /* If min = max, continue at the same level without recursion.
2520     They are not both allowed to be zero. */
2521 nigel 77
2522 ph10 595 if (min == max) continue;
2523 nigel 77
2524 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2525 nigel 77
2526 ph10 595 if (minimize)
2527     {
2528     for (fi = min;; fi++)
2529 nigel 77 {
2530 ph10 625 int slength;
2531 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2532 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2533     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2534 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2535 nigel 77 {
2536 ph10 595 CHECK_PARTIAL();
2537     MRRETURN(MATCH_NOMATCH);
2538 nigel 77 }
2539 ph10 595 eptr += slength;
2540 nigel 77 }
2541 ph10 595 /* Control never gets here */
2542     }
2543 nigel 77
2544 ph10 595 /* If maximizing, find the longest string and work backwards */
2545 nigel 77
2546 ph10 595 else
2547     {
2548     pp = eptr;
2549     for (i = min; i < max; i++)
2550 nigel 77 {
2551 ph10 625 int slength;
2552 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2553 nigel 77 {
2554 ph10 595 CHECK_PARTIAL();
2555     break;
2556 nigel 77 }
2557 ph10 595 eptr += slength;
2558 nigel 77 }
2559 ph10 595 while (eptr >= pp)
2560     {
2561 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2562 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2563     eptr -= length;
2564     }
2565     MRRETURN(MATCH_NOMATCH);
2566 nigel 77 }
2567     /* Control never gets here */
2568    
2569     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2570     used when all the characters in the class have values in the range 0-255,
2571     and either the matching is caseful, or the characters are in the range
2572     0-127 when UTF-8 processing is enabled. The only difference between
2573     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2574     encountered.
2575    
2576     First, look past the end of the item to see if there is repeat information
2577     following. Then obey similar code to character type repeats - written out
2578     again for speed. */
2579    
2580     case OP_NCLASS:
2581     case OP_CLASS:
2582     {
2583     data = ecode + 1; /* Save for matching */
2584     ecode += 33; /* Advance past the item */
2585    
2586     switch (*ecode)
2587     {
2588     case OP_CRSTAR:
2589     case OP_CRMINSTAR:
2590     case OP_CRPLUS:
2591     case OP_CRMINPLUS:
2592     case OP_CRQUERY:
2593     case OP_CRMINQUERY:
2594     c = *ecode++ - OP_CRSTAR;
2595     minimize = (c & 1) != 0;
2596     min = rep_min[c]; /* Pick up values from tables; */
2597     max = rep_max[c]; /* zero for max => infinity */
2598     if (max == 0) max = INT_MAX;
2599     break;
2600    
2601     case OP_CRRANGE:
2602     case OP_CRMINRANGE:
2603     minimize = (*ecode == OP_CRMINRANGE);
2604     min = GET2(ecode, 1);
2605     max = GET2(ecode, 3);
2606     if (max == 0) max = INT_MAX;
2607     ecode += 5;
2608     break;
2609    
2610     default: /* No repeat follows */
2611     min = max = 1;
2612     break;
2613     }
2614    
2615     /* First, ensure the minimum number of matches are present. */
2616    
2617     #ifdef SUPPORT_UTF8
2618     /* UTF-8 mode */
2619     if (utf8)
2620     {
2621     for (i = 1; i <= min; i++)
2622     {
2623 ph10 427 if (eptr >= md->end_subject)
2624 ph10 426 {
2625 ph10 428 SCHECK_PARTIAL();
2626 ph10 510 MRRETURN(MATCH_NOMATCH);
2627 ph10 427 }
2628 nigel 77 GETCHARINC(c, eptr);
2629     if (c > 255)
2630     {
2631 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2632 nigel 77 }
2633     else
2634     {
2635 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2636 nigel 77 }
2637     }
2638     }
2639     else
2640     #endif
2641     /* Not UTF-8 mode */
2642     {
2643     for (i = 1; i <= min; i++)
2644     {
2645 ph10 427 if (eptr >= md->end_subject)
2646 ph10 426 {
2647 ph10 428 SCHECK_PARTIAL();
2648 ph10 510 MRRETURN(MATCH_NOMATCH);
2649 ph10 427 }
2650 nigel 77 c = *eptr++;
2651 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2652 nigel 77 }
2653     }
2654    
2655     /* If max == min we can continue with the main loop without the
2656     need to recurse. */
2657    
2658     if (min == max) continue;
2659    
2660     /* If minimizing, keep testing the rest of the expression and advancing
2661     the pointer while it matches the class. */
2662    
2663     if (minimize)
2664     {
2665     #ifdef SUPPORT_UTF8
2666     /* UTF-8 mode */
2667     if (utf8)
2668     {
2669     for (fi = min;; fi++)
2670     {
2671 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2672 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2673 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2674 ph10 427 if (eptr >= md->end_subject)
2675 ph10 426 {
2676 ph10 427 SCHECK_PARTIAL();
2677 ph10 510 MRRETURN(MATCH_NOMATCH);
2678 ph10 427 }
2679 nigel 77 GETCHARINC(c, eptr);
2680     if (c > 255)
2681     {
2682 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2683 nigel 77 }
2684     else
2685     {
2686 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2687 nigel 77 }
2688     }
2689     }
2690     else
2691     #endif
2692     /* Not UTF-8 mode */
2693     {
2694     for (fi = min;; fi++)
2695     {
2696 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2697 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2699 ph10 427 if (eptr >= md->end_subject)
2700 ph10 426 {
2701 ph10 427 SCHECK_PARTIAL();
2702 ph10 510 MRRETURN(MATCH_NOMATCH);
2703 ph10 427 }
2704 nigel 77 c = *eptr++;
2705 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2706 nigel 77 }
2707     }
2708     /* Control never gets here */
2709     }
2710    
2711     /* If maximizing, find the longest possible run, then work backwards. */
2712    
2713     else
2714     {
2715     pp = eptr;
2716    
2717     #ifdef SUPPORT_UTF8
2718     /* UTF-8 mode */
2719     if (utf8)
2720     {
2721     for (i = min; i < max; i++)
2722     {
2723     int len = 1;
2724 ph10 463 if (eptr >= md->end_subject)
2725 ph10 462 {
2726 ph10 463 SCHECK_PARTIAL();
2727 ph10 462 break;
2728 ph10 463 }
2729 nigel 77 GETCHARLEN(c, eptr, len);
2730     if (c > 255)
2731     {
2732     if (op == OP_CLASS) break;
2733     }
2734     else
2735     {
2736     if ((data[c/8] & (1 << (c&7))) == 0) break;
2737     }
2738     eptr += len;
2739     }
2740     for (;;)
2741     {
2742 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2743 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2744     if (eptr-- == pp) break; /* Stop if tried at original pos */
2745     BACKCHAR(eptr);
2746     }
2747     }
2748     else
2749     #endif
2750     /* Not UTF-8 mode */
2751     {
2752     for (i = min; i < max; i++)
2753     {
2754 ph10 463 if (eptr >= md->end_subject)
2755 ph10 462 {
2756 ph10 463 SCHECK_PARTIAL();
2757 ph10 462 break;
2758 ph10 463 }
2759 nigel 77 c = *eptr;
2760     if ((data[c/8] & (1 << (c&7))) == 0) break;
2761     eptr++;
2762     }
2763     while (eptr >= pp)
2764     {
2765 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2766 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2767 nigel 77 eptr--;
2768     }
2769     }
2770    
2771 ph10 510 MRRETURN(MATCH_NOMATCH);
2772 nigel 77 }
2773     }
2774     /* Control never gets here */
2775    
2776    
2777     /* Match an extended character class. This opcode is encountered only
2778 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2779     mode, because Unicode properties are supported in non-UTF-8 mode. */
2780 nigel 77
2781     #ifdef SUPPORT_UTF8
2782     case OP_XCLASS:
2783     {
2784     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2785     ecode += GET(ecode, 1); /* Advance past the item */
2786    
2787     switch (*ecode)
2788     {
2789     case OP_CRSTAR:
2790     case OP_CRMINSTAR:
2791     case OP_CRPLUS:
2792     case OP_CRMINPLUS:
2793     case OP_CRQUERY:
2794     case OP_CRMINQUERY:
2795     c = *ecode++ - OP_CRSTAR;
2796     minimize = (c & 1) != 0;
2797     min = rep_min[c]; /* Pick up values from tables; */
2798     max = rep_max[c]; /* zero for max => infinity */
2799     if (max == 0) max = INT_MAX;
2800     break;
2801    
2802     case OP_CRRANGE:
2803     case OP_CRMINRANGE:
2804     minimize = (*ecode == OP_CRMINRANGE);
2805     min = GET2(ecode, 1);
2806     max = GET2(ecode, 3);
2807     if (max == 0) max = INT_MAX;
2808     ecode += 5;
2809     break;
2810    
2811     default: /* No repeat follows */
2812     min = max = 1;
2813     break;
2814     }
2815    
2816     /* First, ensure the minimum number of matches are present. */
2817    
2818     for (i = 1; i <= min; i++)
2819     {
2820 ph10 427 if (eptr >= md->end_subject)
2821 ph10 426 {
2822     SCHECK_PARTIAL();
2823 ph10 510 MRRETURN(MATCH_NOMATCH);
2824 ph10 427 }
2825 ph10 384 GETCHARINCTEST(c, eptr);
2826 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2827 nigel 77 }
2828    
2829     /* If max == min we can continue with the main loop without the
2830     need to recurse. */
2831    
2832     if (min == max) continue;
2833    
2834     /* If minimizing, keep testing the rest of the expression and advancing
2835     the pointer while it matches the class. */
2836    
2837     if (minimize)
2838     {
2839     for (fi = min;; fi++)
2840     {
2841 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2842 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2843 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2844 ph10 427 if (eptr >= md->end_subject)
2845 ph10 426 {
2846 ph10 427 SCHECK_PARTIAL();
2847 ph10 510 MRRETURN(MATCH_NOMATCH);
2848 ph10 427 }
2849 ph10 384 GETCHARINCTEST(c, eptr);
2850 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2851 nigel 77 }
2852     /* Control never gets here */
2853     }
2854    
2855     /* If maximizing, find the longest possible run, then work backwards. */
2856    
2857     else
2858     {
2859     pp = eptr;
2860     for (i = min; i < max; i++)
2861     {
2862     int len = 1;
2863 ph10 463 if (eptr >= md->end_subject)
2864 ph10 462 {
2865 ph10 463 SCHECK_PARTIAL();
2866 ph10 462 break;
2867 ph10 463 }
2868 ph10 384 GETCHARLENTEST(c, eptr, len);
2869 nigel 77 if (!_pcre_xclass(c, data)) break;
2870     eptr += len;
2871     }
2872     for(;;)
2873     {
2874 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2875 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2876     if (eptr-- == pp) break; /* Stop if tried at original pos */
2877 ph10 214 if (utf8) BACKCHAR(eptr);
2878 nigel 77 }
2879 ph10 510 MRRETURN(MATCH_NOMATCH);
2880 nigel 77 }
2881    
2882     /* Control never gets here */
2883     }
2884     #endif /* End of XCLASS */
2885    
2886     /* Match a single character, casefully */
2887    
2888     case OP_CHAR:
2889     #ifdef SUPPORT_UTF8
2890     if (utf8)
2891     {
2892     length = 1;
2893     ecode++;
2894     GETCHARLEN(fc, ecode, length);
2895 ph10 443 if (length > md->end_subject - eptr)
2896 ph10 428 {
2897     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2898 ph10 510 MRRETURN(MATCH_NOMATCH);
2899 ph10 443 }
2900 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2901 nigel 77 }
2902     else
2903     #endif
2904    
2905     /* Non-UTF-8 mode */
2906     {
2907 ph10 443 if (md->end_subject - eptr < 1)
2908 ph10 428 {
2909     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2910 ph10 510 MRRETURN(MATCH_NOMATCH);
2911 ph10 443 }
2912 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2913 nigel 77 ecode += 2;
2914     }
2915     break;
2916    
2917     /* Match a single character, caselessly */
2918    
2919 ph10 602 case OP_CHARI:
2920 nigel 77 #ifdef SUPPORT_UTF8
2921     if (utf8)
2922     {
2923     length = 1;
2924     ecode++;
2925     GETCHARLEN(fc, ecode, length);
2926    
2927 ph10 443 if (length > md->end_subject - eptr)
2928 ph10 428 {
2929     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2930 ph10 510 MRRETURN(MATCH_NOMATCH);
2931 ph10 443 }
2932 nigel 77
2933     /* If the pattern character's value is < 128, we have only one byte, and
2934     can use the fast lookup table. */
2935    
2936     if (fc < 128)
2937     {
2938 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2939 nigel 77 }
2940    
2941     /* Otherwise we must pick up the subject character */
2942    
2943     else
2944     {
2945 nigel 93 unsigned int dc;
2946 nigel 77 GETCHARINC(dc, eptr);
2947     ecode += length;
2948    
2949     /* If we have Unicode property support, we can use it to test the other
2950 nigel 87 case of the character, if there is one. */
2951 nigel 77
2952     if (fc != dc)
2953     {
2954     #ifdef SUPPORT_UCP
2955 ph10 349 if (dc != UCD_OTHERCASE(fc))
2956 nigel 77 #endif
2957 ph10 510 MRRETURN(MATCH_NOMATCH);
2958 nigel 77 }
2959     }
2960     }
2961     else
2962     #endif /* SUPPORT_UTF8 */
2963    
2964     /* Non-UTF-8 mode */
2965     {
2966 ph10 443 if (md->end_subject - eptr < 1)
2967 ph10 428 {
2968 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2969 ph10 510 MRRETURN(MATCH_NOMATCH);
2970 ph10 443 }
2971 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2972 nigel 77 ecode += 2;
2973     }
2974     break;
2975    
2976 nigel 93 /* Match a single character repeatedly. */
2977 nigel 77
2978     case OP_EXACT:
2979 ph10 602 case OP_EXACTI:
2980 nigel 77 min = max = GET2(ecode, 1);
2981     ecode += 3;
2982     goto REPEATCHAR;
2983    
2984 nigel 93 case OP_POSUPTO:
2985 ph10 602 case OP_POSUPTOI:
2986 nigel 93 possessive = TRUE;
2987     /* Fall through */
2988    
2989 nigel 77 case OP_UPTO:
2990 ph10 602 case OP_UPTOI:
2991 nigel 77 case OP_MINUPTO:
2992 ph10 602 case OP_MINUPTOI:
2993 nigel 77 min = 0;
2994     max = GET2(ecode, 1);
2995 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2996 nigel 77 ecode += 3;
2997     goto REPEATCHAR;
2998    
2999 nigel 93 case OP_POSSTAR:
3000 ph10 602 case OP_POSSTARI:
3001 nigel 93 possessive = TRUE;
3002     min = 0;
3003     max = INT_MAX;
3004     ecode++;
3005     goto REPEATCHAR;
3006    
3007     case OP_POSPLUS:
3008 ph10 602 case OP_POSPLUSI:
3009 nigel 93 possessive = TRUE;
3010     min = 1;
3011     max = INT_MAX;
3012     ecode++;
3013     goto REPEATCHAR;
3014    
3015     case OP_POSQUERY:
3016 ph10 602 case OP_POSQUERYI:
3017 nigel 93 possessive = TRUE;
3018     min = 0;
3019     max = 1;
3020     ecode++;
3021     goto REPEATCHAR;
3022    
3023 nigel 77 case OP_STAR:
3024 ph10 602 case OP_STARI:
3025 nigel 77 case OP_MINSTAR:
3026 ph10 602 case OP_MINSTARI:
3027 nigel 77 case OP_PLUS:
3028 ph10 602 case OP_PLUSI:
3029 nigel 77 case OP_MINPLUS:
3030 ph10 602 case OP_MINPLUSI:
3031 nigel 77 case OP_QUERY:
3032 ph10 602 case OP_QUERYI:
3033 nigel 77 case OP_MINQUERY:
3034 ph10 602 case OP_MINQUERYI:
3035     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3036 nigel 77 minimize = (c & 1) != 0;
3037     min = rep_min[c]; /* Pick up values from tables; */
3038     max = rep_max[c]; /* zero for max => infinity */
3039     if (max == 0) max = INT_MAX;
3040    
3041 ph10 426 /* Common code for all repeated single-character matches. */
3042 nigel 77
3043     REPEATCHAR:
3044     #ifdef SUPPORT_UTF8
3045     if (utf8)
3046     {
3047     length = 1;
3048     charptr = ecode;
3049     GETCHARLEN(fc, ecode, length);
3050     ecode += length;
3051    
3052     /* Handle multibyte character matching specially here. There is
3053     support for caseless matching if UCP support is present. */
3054    
3055     if (length > 1)
3056     {
3057     #ifdef SUPPORT_UCP
3058 nigel 93 unsigned int othercase;
3059 ph10 602 if (op >= OP_STARI && /* Caseless */
3060 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3061 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3062 ph10 115 else oclength = 0;
3063 nigel 77 #endif /* SUPPORT_UCP */
3064    
3065     for (i = 1; i <= min; i++)
3066     {
3067 ph10 426 if (eptr <= md->end_subject - length &&
3068     memcmp(eptr, charptr, length) == 0) eptr += length;
3069 ph10 123 #ifdef SUPPORT_UCP
3070 ph10 426 else if (oclength > 0 &&
3071     eptr <= md->end_subject - oclength &&
3072     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3073     #endif /* SUPPORT_UCP */
3074 nigel 77 else
3075     {
3076 ph10 426 CHECK_PARTIAL();
3077 ph10 510 MRRETURN(MATCH_NOMATCH);
3078 nigel 77 }
3079     }
3080    
3081     if (min == max) continue;
3082    
3083     if (minimize)
3084     {
3085     for (fi = min;; fi++)
3086     {
3087 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3088 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3089 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3090 ph10 426 if (eptr <= md->end_subject - length &&
3091     memcmp(eptr, charptr, length) == 0) eptr += length;
3092 ph10 123 #ifdef SUPPORT_UCP
3093 ph10 426 else if (oclength > 0 &&
3094     eptr <= md->end_subject - oclength &&
3095     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3096     #endif /* SUPPORT_UCP */
3097 nigel 77 else
3098     {
3099 ph10 426 CHECK_PARTIAL();
3100 ph10 510 MRRETURN(MATCH_NOMATCH);
3101 nigel 77 }
3102     }
3103     /* Control never gets here */
3104     }
3105 nigel 93
3106     else /* Maximize */
3107 nigel 77 {
3108     pp = eptr;
3109     for (i = min; i < max; i++)
3110     {
3111 ph10 426 if (eptr <= md->end_subject - length &&
3112     memcmp(eptr, charptr, length) == 0) eptr += length;
3113 ph10 123 #ifdef SUPPORT_UCP
3114 ph10 426 else if (oclength > 0 &&
3115     eptr <= md->end_subject - oclength &&
3116     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3117     #endif /* SUPPORT_UCP */
3118 ph10 463 else
3119 ph10 462 {
3120 ph10 463 CHECK_PARTIAL();
3121 ph10 462 break;
3122 ph10 463 }
3123 nigel 77 }
3124 nigel 93
3125     if (possessive) continue;
3126 ph10 427
3127 ph10 120 for(;;)
3128 ph10 426 {
3129 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3130 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3131 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3132 ph10 115 #ifdef SUPPORT_UCP
3133 ph10 426 eptr--;
3134     BACKCHAR(eptr);
3135 ph10 123 #else /* without SUPPORT_UCP */
3136 ph10 426 eptr -= length;
3137 ph10 123 #endif /* SUPPORT_UCP */
3138 ph10 426 }
3139 nigel 77 }
3140     /* Control never gets here */
3141     }
3142    
3143     /* If the length of a UTF-8 character is 1, we fall through here, and
3144     obey the code as for non-UTF-8 characters below, though in this case the
3145     value of fc will always be < 128. */
3146     }
3147     else
3148     #endif /* SUPPORT_UTF8 */
3149    
3150     /* When not in UTF-8 mode, load a single-byte character. */
3151    
3152 ph10 426 fc = *ecode++;
3153 ph10 443
3154 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3155     may not be in UTF-8 mode. The code is duplicated for the caseless and
3156     caseful cases, for speed, since matching characters is likely to be quite
3157     common. First, ensure the minimum number of matches are present. If min =
3158     max, continue at the same level without recursing. Otherwise, if
3159     minimizing, keep trying the rest of the expression and advancing one
3160     matching character if failing, up to the maximum. Alternatively, if
3161     maximizing, find the maximum number of characters and work backwards. */
3162    
3163     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3164     max, eptr));
3165    
3166 ph10 602 if (op >= OP_STARI) /* Caseless */
3167 nigel 77 {
3168     fc = md->lcc[fc];
3169     for (i = 1; i <= min; i++)
3170 ph10 426 {
3171     if (eptr >= md->end_subject)
3172     {
3173     SCHECK_PARTIAL();
3174 ph10 510 MRRETURN(MATCH_NOMATCH);
3175 ph10 426 }
3176 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3177 ph10 426 }
3178 nigel 77 if (min == max) continue;
3179     if (minimize)
3180     {
3181     for (fi = min;; fi++)
3182     {
3183 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3184 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3185 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3186 ph10 426 if (eptr >= md->end_subject)
3187     {
3188 ph10 427 SCHECK_PARTIAL();
3189 ph10 510 MRRETURN(MATCH_NOMATCH);
3190 ph10 426 }
3191 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3192 nigel 77 }
3193     /* Control never gets here */
3194     }
3195 nigel 93 else /* Maximize */
3196 nigel 77 {
3197     pp = eptr;
3198     for (i = min; i < max; i++)
3199     {
3200 ph10 463 if (eptr >= md->end_subject)
3201 ph10 462 {
3202     SCHECK_PARTIAL();
3203     break;
3204 ph10 463 }
3205 ph10 462 if (fc != md->lcc[*eptr]) break;
3206 nigel 77 eptr++;
3207     }
3208 ph10 427
3209 nigel 93 if (possessive) continue;
3210 ph10 427
3211 nigel 77 while (eptr >= pp)
3212     {
3213 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3214 nigel 77 eptr--;
3215     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3216     }
3217 ph10 510 MRRETURN(MATCH_NOMATCH);
3218 nigel 77 }
3219     /* Control never gets here */
3220     }
3221    
3222     /* Caseful comparisons (includes all multi-byte characters) */
3223    
3224     else
3225     {
3226 ph10 427 for (i = 1; i <= min; i++)
3227 ph10 426 {
3228     if (eptr >= md->end_subject)
3229     {
3230     SCHECK_PARTIAL();
3231 ph10 510 MRRETURN(MATCH_NOMATCH);
3232 ph10 426 }
3233 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3234 ph10 427 }
3235 ph10 443
3236 nigel 77 if (min == max) continue;
3237 ph10 443
3238 nigel 77 if (minimize)
3239     {
3240     for (fi = min;; fi++)
3241     {
3242 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3243 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3244 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3245 ph10 426 if (eptr >= md->end_subject)
3246 ph10 427 {
3247 ph10 426 SCHECK_PARTIAL();
3248 ph10 510 MRRETURN(MATCH_NOMATCH);
3249 ph10 427 }
3250 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3251 nigel 77 }
3252     /* Control never gets here */
3253     }
3254 nigel 93 else /* Maximize */
3255 nigel 77 {
3256     pp = eptr;
3257     for (i = min; i < max; i++)
3258     {
3259 ph10 463 if (eptr >= md->end_subject)
3260 ph10 462 {
3261 ph10 463 SCHECK_PARTIAL();
3262 ph10 462 break;
3263 ph10 463 }
3264 ph10 462 if (fc != *eptr) break;
3265 nigel 77 eptr++;
3266     }
3267 nigel 93 if (possessive) continue;
3268 ph10 443
3269 nigel 77 while (eptr >= pp)
3270     {
3271 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3272 nigel 77 eptr--;
3273     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3274     }
3275 ph10 510 MRRETURN(MATCH_NOMATCH);
3276 nigel 77 }
3277     }
3278     /* Control never gets here */
3279    
3280     /* Match a negated single one-byte character. The character we are
3281     checking can be multibyte. */
3282    
3283     case OP_NOT:
3284 ph10 625 case OP_NOTI:
3285 ph10 443 if (eptr >= md->end_subject)
3286 ph10 428 {
3287 ph10 443 SCHECK_PARTIAL();
3288 ph10 510 MRRETURN(MATCH_NOMATCH);
3289 ph10 443 }
3290 nigel 77 ecode++;
3291     GETCHARINCTEST(c, eptr);
3292 ph10 602 if (op == OP_NOTI) /* The caseless case */
3293 nigel 77 {
3294     #ifdef SUPPORT_UTF8
3295     if (c < 256)
3296     #endif
3297     c = md->lcc[c];
3298 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3299 nigel 77 }
3300 ph10 602 else /* Caseful */
3301 nigel 77 {
3302 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3303 nigel 77 }
3304     break;
3305    
3306     /* Match a negated single one-byte character repeatedly. This is almost a
3307     repeat of the code for a repeated single character, but I haven't found a
3308     nice way of commoning these up that doesn't require a test of the
3309     positive/negative option for each character match. Maybe that wouldn't add
3310     very much to the time taken, but character matching *is* what this is all
3311     about... */
3312    
3313     case OP_NOTEXACT:
3314 ph10 602 case OP_NOTEXACTI:
3315 nigel 77 min = max = GET2(ecode, 1);
3316     ecode += 3;
3317     goto REPEATNOTCHAR;
3318    
3319     case OP_NOTUPTO:
3320 ph10 602 case OP_NOTUPTOI:
3321 nigel 77 case OP_NOTMINUPTO:
3322 ph10 602 case OP_NOTMINUPTOI:
3323 nigel 77 min = 0;
3324     max = GET2(ecode, 1);
3325 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3326 nigel 77 ecode += 3;
3327     goto REPEATNOTCHAR;
3328    
3329 nigel 93 case OP_NOTPOSSTAR:
3330 ph10 602 case OP_NOTPOSSTARI:
3331 nigel 93 possessive = TRUE;
3332     min = 0;
3333     max = INT_MAX;
3334     ecode++;
3335     goto REPEATNOTCHAR;
3336    
3337     case OP_NOTPOSPLUS:
3338 ph10 602 case OP_NOTPOSPLUSI:
3339 nigel 93 possessive = TRUE;
3340     min = 1;
3341     max = INT_MAX;
3342     ecode++;
3343     goto REPEATNOTCHAR;
3344    
3345     case OP_NOTPOSQUERY:
3346 ph10 602 case OP_NOTPOSQUERYI:
3347 nigel 93 possessive = TRUE;
3348     min = 0;
3349     max = 1;
3350     ecode++;
3351     goto REPEATNOTCHAR;
3352    
3353     case OP_NOTPOSUPTO:
3354 ph10 602 case OP_NOTPOSUPTOI:
3355 nigel 93 possessive = TRUE;
3356     min = 0;
3357     max = GET2(ecode, 1);
3358     ecode += 3;
3359     goto REPEATNOTCHAR;
3360    
3361 nigel 77 case OP_NOTSTAR:
3362 ph10 602 case OP_NOTSTARI:
3363 nigel 77 case OP_NOTMINSTAR:
3364 ph10 602 case OP_NOTMINSTARI:
3365 nigel 77 case OP_NOTPLUS:
3366 ph10 602 case OP_NOTPLUSI:
3367 nigel 77 case OP_NOTMINPLUS:
3368 ph10 602 case OP_NOTMINPLUSI:
3369 nigel 77 case OP_NOTQUERY:
3370 ph10 602 case OP_NOTQUERYI:
3371 nigel 77 case OP_NOTMINQUERY:
3372 ph10 602 case OP_NOTMINQUERYI:
3373     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3374 nigel 77 minimize = (c & 1) != 0;
3375     min = rep_min[c]; /* Pick up values from tables; */
3376     max = rep_max[c]; /* zero for max => infinity */
3377     if (max == 0) max = INT_MAX;
3378    
3379 ph10 426 /* Common code for all repeated single-byte matches. */
3380 nigel 77
3381     REPEATNOTCHAR:
3382     fc = *ecode++;
3383    
3384     /* The code is duplicated for the caseless and caseful cases, for speed,
3385     since matching characters is likely to be quite common. First, ensure the
3386     minimum number of matches are present. If min = max, continue at the same
3387     level without recursing. Otherwise, if minimizing, keep trying the rest of
3388     the expression and advancing one matching character if failing, up to the
3389     maximum. Alternatively, if maximizing, find the maximum number of
3390     characters and work backwards. */
3391    
3392     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3393     max, eptr));
3394    
3395 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3396 nigel 77 {
3397     fc = md->lcc[fc];
3398    
3399     #ifdef SUPPORT_UTF8
3400     /* UTF-8 mode */
3401     if (utf8)
3402     {
3403 nigel 93 register unsigned int d;
3404 nigel 77 for (i = 1; i <= min; i++)
3405     {
3406 ph10 426 if (eptr >= md->end_subject)
3407     {
3408     SCHECK_PARTIAL();
3409 ph10 510 MRRETURN(MATCH_NOMATCH);
3410 ph10 427 }
3411 nigel 77 GETCHARINC(d, eptr);
3412     if (d < 256) d = md->lcc[d];
3413 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3414 nigel 77 }
3415     }
3416     else
3417     #endif
3418    
3419     /* Not UTF-8 mode */
3420     {
3421     for (i = 1; i <= min; i++)
3422 ph10 426 {
3423     if (eptr >= md->end_subject)
3424     {
3425     SCHECK_PARTIAL();
3426 ph10 510 MRRETURN(MATCH_NOMATCH);
3427 ph10 427 }
3428 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3429 ph10 427 }
3430 nigel 77 }
3431    
3432     if (min == max) continue;
3433    
3434     if (minimize)
3435     {
3436     #ifdef SUPPORT_UTF8
3437     /* UTF-8 mode */
3438     if (utf8)
3439     {
3440 nigel 93 register unsigned int d;
3441 nigel 77 for (fi = min;; fi++)
3442     {
3443 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3444 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3446 ph10 427 if (eptr >= md->end_subject)
3447 ph10 426 {
3448 ph10 427 SCHECK_PARTIAL();
3449 ph10 510 MRRETURN(MATCH_NOMATCH);
3450 ph10 427 }
3451 nigel 77 GETCHARINC(d, eptr);
3452     if (d < 256) d = md->lcc[d];
3453 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3454 nigel 77 }
3455     }
3456     else
3457     #endif
3458     /* Not UTF-8 mode */
3459     {
3460     for (fi = min;; fi++)
3461     {
3462 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3463 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3464 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3465 ph10 426 if (eptr >= md->end_subject)
3466     {
3467     SCHECK_PARTIAL();
3468 ph10 510 MRRETURN(MATCH_NOMATCH);
3469 ph10 426 }
3470 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3471 nigel 77 }
3472     }
3473     /* Control never gets here */
3474     }
3475    
3476     /* Maximize case */
3477    
3478     else
3479     {
3480     pp = eptr;
3481    
3482     #ifdef SUPPORT_UTF8
3483     /* UTF-8 mode */
3484     if (utf8)
3485     {
3486 nigel 93 register unsigned int d;
3487 nigel 77 for (i = min; i < max; i++)
3488     {
3489     int len = 1;
3490 ph10 463 if (eptr >= md->end_subject)
3491 ph10 462 {
3492 ph10 463 SCHECK_PARTIAL();
3493 ph10 462 break;
3494 ph10 463 }
3495 nigel 77 GETCHARLEN(d, eptr, len);
3496     if (d < 256) d = md->lcc[d];
3497     if (fc == d) break;
3498     eptr += len;
3499     }
3500 nigel 93 if (possessive) continue;
3501     for(;;)
3502 nigel 77 {
3503 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3504 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3505     if (eptr-- == pp) break; /* Stop if tried at original pos */
3506     BACKCHAR(eptr);
3507     }
3508     }
3509     else
3510     #endif
3511     /* Not UTF-8 mode */
3512     {
3513     for (i = min; i < max; i++)
3514     {
3515 ph10 463 if (eptr >= md->end_subject)
3516 ph10 462 {
3517     SCHECK_PARTIAL();
3518     break;
3519 ph10 463 }
3520 ph10 462 if (fc == md->lcc[*eptr]) break;
3521 nigel 77 eptr++;
3522     }
3523 nigel 93 if (possessive) continue;
3524 nigel 77 while (eptr >= pp)
3525     {
3526 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3527 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3528     eptr--;
3529     }
3530     }
3531    
3532 ph10 510 MRRETURN(MATCH_NOMATCH);
3533 nigel 77 }
3534     /* Control never gets here */
3535     }
3536    
3537     /* Caseful comparisons */
3538    
3539     else
3540     {
3541     #ifdef SUPPORT_UTF8
3542     /* UTF-8 mode */
3543     if (utf8)
3544     {
3545 nigel 93 register unsigned int d;
3546 nigel 77 for (i = 1; i <= min; i++)
3547     {
3548 ph10 426 if (eptr >= md->end_subject)
3549     {
3550     SCHECK_PARTIAL();
3551 ph10 510 MRRETURN(MATCH_NOMATCH);
3552 ph10 427 }
3553 nigel 77 GETCHARINC(d, eptr);
3554 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3555 nigel 77 }
3556     }
3557     else
3558     #endif
3559     /* Not UTF-8 mode */
3560     {
3561     for (i = 1; i <= min; i++)
3562 ph10 426 {
3563     if (eptr >= md->end_subject)
3564     {
3565     SCHECK_PARTIAL();
3566 ph10 510 MRRETURN(MATCH_NOMATCH);
3567 ph10 427 }
3568 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3569 ph10 427 }
3570 nigel 77 }
3571    
3572     if (min == max) continue;
3573    
3574     if (minimize)
3575     {
3576     #ifdef SUPPORT_UTF8
3577     /* UTF-8 mode */
3578     if (utf8)
3579     {
3580 nigel 93 register unsigned int d;
3581 nigel 77 for (fi = min;; fi++)
3582     {
3583 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3584 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3585 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3586 ph10 427 if (eptr >= md->end_subject)
3587 ph10 426 {
3588 ph10 427 SCHECK_PARTIAL();
3589 ph10 510 MRRETURN(MATCH_NOMATCH);
3590 ph10 427 }
3591 nigel 77 GETCHARINC(d, eptr);
3592 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3593 nigel 77 }
3594     }
3595     else
3596     #endif
3597     /* Not UTF-8 mode */
3598     {
3599     for (fi = min;; fi++)
3600     {
3601 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3602 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3603 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3604 ph10 426 if (eptr >= md->end_subject)
3605     {
3606     SCHECK_PARTIAL();
3607 ph10 510 MRRETURN(MATCH_NOMATCH);
3608 ph10 427 }
3609 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3610 nigel 77 }
3611     }
3612     /* Control never gets here */
3613     }
3614    
3615     /* Maximize case */
3616    
3617     else
3618     {
3619     pp = eptr;
3620    
3621     #ifdef SUPPORT_UTF8
3622     /* UTF-8 mode */
3623     if (utf8)
3624     {
3625 nigel 93 register unsigned int d;
3626 nigel 77 for (i = min; i < max; i++)
3627     {
3628     int len = 1;
3629 ph10 463 if (eptr >= md->end_subject)
3630 ph10 462 {
3631 ph10 463 SCHECK_PARTIAL();
3632 ph10 462 break;
3633 ph10 463 }
3634 nigel 77 GETCHARLEN(d, eptr, len);
3635     if (fc == d) break;
3636     eptr += len;
3637     }
3638 nigel 93 if (possessive) continue;
3639 nigel 77 for(;;)
3640     {
3641 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3642 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3643     if (eptr-- == pp) break; /* Stop if tried at original pos */
3644     BACKCHAR(eptr);
3645     }
3646     }
3647     else
3648     #endif
3649     /* Not UTF-8 mode */
3650     {
3651     for (i = min; i < max; i++)
3652     {
3653 ph10 463 if (eptr >= md->end_subject)
3654 ph10 462 {
3655 ph10 463 SCHECK_PARTIAL();
3656 ph10 462 break;
3657 ph10 463 }
3658 ph10 462 if (fc == *eptr) break;
3659 nigel 77 eptr++;
3660     }
3661 nigel 93 if (possessive) continue;
3662 nigel 77 while (eptr >= pp)
3663     {
3664 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3665 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3666     eptr--;
3667     }
3668     }
3669    
3670 ph10 510 MRRETURN(MATCH_NOMATCH);
3671 nigel 77 }
3672     }
3673     /* Control never gets here */
3674    
3675     /* Match a single character type repeatedly; several different opcodes
3676     share code. This is very similar to the code for single characters, but we
3677     repeat it in the interests of efficiency. */
3678    
3679     case OP_TYPEEXACT:
3680     min = max = GET2(ecode, 1);
3681     minimize = TRUE;
3682     ecode += 3;
3683     goto REPEATTYPE;
3684    
3685     case OP_TYPEUPTO:
3686     case OP_TYPEMINUPTO:
3687     min = 0;
3688     max = GET2(ecode, 1);
3689     minimize = *ecode == OP_TYPEMINUPTO;
3690     ecode += 3;
3691     goto REPEATTYPE;
3692    
3693 nigel 93 case OP_TYPEPOSSTAR:
3694     possessive = TRUE;
3695     min = 0;
3696     max = INT_MAX;
3697     ecode++;
3698     goto REPEATTYPE;
3699    
3700     case OP_TYPEPOSPLUS:
3701     possessive = TRUE;
3702     min = 1;
3703     max = INT_MAX;
3704     ecode++;
3705     goto REPEATTYPE;
3706    
3707     case OP_TYPEPOSQUERY:
3708     possessive = TRUE;
3709     min = 0;
3710     max = 1;
3711     ecode++;
3712     goto REPEATTYPE;
3713    
3714     case OP_TYPEPOSUPTO:
3715     possessive = TRUE;
3716     min = 0;
3717     max = GET2(ecode, 1);
3718     ecode += 3;
3719     goto REPEATTYPE;
3720    
3721 nigel 77 case OP_TYPESTAR:
3722     case OP_TYPEMINSTAR:
3723     case OP_TYPEPLUS:
3724     case OP_TYPEMINPLUS:
3725     case OP_TYPEQUERY:
3726     case OP_TYPEMINQUERY:
3727     c = *ecode++ - OP_TYPESTAR;
3728     minimize = (c & 1) != 0;
3729     min = rep_min[c]; /* Pick up values from tables; */
3730     max = rep_max[c]; /* zero for max => infinity */
3731     if (max == 0) max = INT_MAX;
3732    
3733     /* Common code for all repeated single character type matches. Note that
3734     in UTF-8 mode, '.' matches a character of any length, but for the other
3735     character types, the valid characters are all one-byte long. */
3736    
3737     REPEATTYPE:
3738     ctype = *ecode++; /* Code for the character type */
3739    
3740     #ifdef SUPPORT_UCP
3741     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3742     {
3743     prop_fail_result = ctype == OP_NOTPROP;
3744     prop_type = *ecode++;
3745 nigel 87 prop_value = *ecode++;
3746 nigel 77 }
3747     else prop_type = -1;
3748     #endif
3749    
3750     /* First, ensure the minimum number of matches are present. Use inline
3751     code for maximizing the speed, and do the type test once at the start
3752 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3753 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3754     and single-bytes. */
3755    
3756     if (min > 0)
3757     {
3758     #ifdef SUPPORT_UCP
3759 nigel 87 if (prop_type >= 0)
3760 nigel 77 {
3761 nigel 87 switch(prop_type)
3762 nigel 77 {
3763 nigel 87 case PT_ANY:
3764 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3765 nigel 87 for (i = 1; i <= min; i++)
3766     {
3767 ph10 427 if (eptr >= md->end_subject)
3768 ph10 426 {
3769 ph10 427 SCHECK_PARTIAL();
3770 ph10 510 MRRETURN(MATCH_NOMATCH);
3771 ph10 427 }
3772 ph10 184 GETCHARINCTEST(c, eptr);
3773 nigel 87 }
3774     break;
3775    
3776     case PT_LAMP:
3777     for (i = 1; i <= min; i++)
3778     {
3779 ph10 625 int chartype;
3780 ph10 427 if (eptr >= md->end_subject)
3781 ph10 426 {
3782 ph10 427 SCHECK_PARTIAL();
3783 ph10 510 MRRETURN(MATCH_NOMATCH);
3784 ph10 427 }
3785 ph10 184 GETCHARINCTEST(c, eptr);
3786 ph10 623 chartype = UCD_CHARTYPE(c);
3787     if ((chartype == ucp_Lu ||
3788     chartype == ucp_Ll ||
3789     chartype == ucp_Lt) == prop_fail_result)
3790 ph10 510 MRRETURN(MATCH_NOMATCH);
3791 nigel 87 }
3792     break;
3793    
3794     case PT_GC:
3795     for (i = 1; i <= min; i++)
3796     {
3797 ph10 427 if (eptr >= md->end_subject)
3798 ph10 426 {
3799 ph10 427 SCHECK_PARTIAL();
3800 ph10 510 MRRETURN(MATCH_NOMATCH);
3801 ph10 427 }
3802 ph10 184 GETCHARINCTEST(c, eptr);
3803 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3804 ph10 510 MRRETURN(MATCH_NOMATCH);
3805 nigel 87 }
3806     break;
3807    
3808     case PT_PC:
3809     for (i = 1; i <= min; i++)
3810     {
3811 ph10 427 if (eptr >= md->end_subject)
3812 ph10 426 {
3813 ph10 427 SCHECK_PARTIAL();
3814 ph10 510 MRRETURN(MATCH_NOMATCH);
3815 ph10 427 }
3816 ph10 184 GETCHARINCTEST(c, eptr);
3817 ph10 623 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3818 ph10 510 MRRETURN(MATCH_NOMATCH);
3819 nigel 87 }
3820     break;
3821    
3822     case PT_SC:
3823     for (i = 1; i <= min; i++)
3824     {
3825 ph10 427 if (eptr >= md->end_subject)
3826 ph10 426 {
3827 ph10 427 SCHECK_PARTIAL();
3828 ph10 510 MRRETURN(MATCH_NOMATCH);
3829 ph10 427 }
3830 ph10 184 GETCHARINCTEST(c, eptr);
3831 ph10 623 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3832 ph10 510 MRRETURN(MATCH_NOMATCH);
3833 nigel 87 }
3834     break;
3835 ph10 527
3836 ph10 517 case PT_ALNUM:
3837     for (i = 1; i <= min; i++)
3838     {
3839 ph10 625 int category;
3840 ph10 517 if (eptr >= md->end_subject)
3841     {
3842     SCHECK_PARTIAL();
3843     MRRETURN(MATCH_NOMATCH);
3844     }
3845     GETCHARINCTEST(c, eptr);
3846 ph10 623 category = UCD_CATEGORY(c);
3847     if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3848 ph10 517 MRRETURN(MATCH_NOMATCH);
3849     }
3850     break;
3851 ph10 527
3852 ph10 517 case PT_SPACE: /* Perl space */
3853     for (i = 1; i <= min; i++)
3854     {
3855     if (eptr >= md->end_subject)
3856     {
3857     SCHECK_PARTIAL();