/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 716 - (hide annotations) (download)
Tue Oct 4 16:38:05 2011 UTC (19 months, 2 weeks ago) by ph10
File MIME type: text/plain
File size: 198414 byte(s)
Make (*THEN) work as in Perl in subpatterns that do not contain | alternatives.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times: it copies the
current mark pointer (markptr) into md->mark and then leaves match() by way of
RRETURN(ra). It can be used only where both md and markptr are in scope. Note
that it expands to a braced block, not a single statement, so when it is
written with a trailing semicolon it cannot be followed directly by "else". */
86    
87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if the requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 623 RM61, RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
/* Debugging forms. RMATCH makes a real recursive call to match(), one level
deeper (rdepth+1), and leaves the result in rrc, printing the source line
number before and after the call. Its ra, rb, rc, rd and re arguments are
passed in the positions of match()'s eptr, ecode, offset_top, md and eptrb
parameters; mstart and markptr are taken from the scope in which the macro is
used. RRETURN prints the value and the source line number and then returns. */
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
/* Production forms: the same call and the same return, without the tracing. */
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
/* Heap form of RMATCH. A new heapframe is obtained with pcre_stack_malloc; if
that fails, RRETURN(PCRE_ERROR_NOMEMORY) is used. Otherwise the call number rw
is stored in the current frame's Xwhere, the new frame is loaded with the
"arguments" (ra, rb, rc and re, together with the current mstart and markptr
and a recursion depth one greater than the current frame's), the new frame is
linked back to the current one through Xprevframe and made current, and control
jumps to HEAP_RECURSE. The label L_<rw> that the macro defines is the point at
which execution continues afterwards, which is why each use must be given a
distinct rw value. */
316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
/* Heap form of RRETURN. The current frame is freed with pcre_stack_free and
its predecessor (Xprevframe) becomes the current frame. If there is a
predecessor, ra is placed in rrc and control jumps to HEAP_RETURN; if there is
none, this was the top-level frame and ra is returned from the function.

NOTE(review): ra is evaluated only after the frame has been switched and the
old frame freed, so an argument that names a frame-held variable would be read
from the wrong frame - confirm that no use does this. The macro also
dereferences frame unconditionally, so frame must not be NULL where it is
used. */
336     #define RRETURN(ra)\
337     {\
338 ph10 527 heapframe *oldframe = frame;\
339     frame = oldframe->Xprevframe;\
340     (pcre_stack_free)(oldframe);\
341 nigel 77 if (frame != NULL)\
342     {\
343 ph10 164 rrc = ra;\
344     goto HEAP_RETURN;\
345 nigel 77 }\
346     return ra;\
347     }
348    
349    
350     /* Structure for remembering the local variables in a private frame. One of
these is obtained for each simulated recursive call of match() when NO_RECURSE
is defined. Inside match(), each field Xname is reached through a macro called
name that expands to frame->Xname. */
351    
352     typedef struct heapframe {
353     struct heapframe *Xprevframe; /* Frame to go back to; NULL marks the top level */
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
/* In match() these three are also used under second names: cur_is_word as
allow_zero, condition as cbegroup and condassert, and prev_is_word as
matched_once. */
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink; /* Also used in match() under the name code_offset */
392 nigel 77 int Xctype;
/* fc and fi live in the frame here; when NO_RECURSE is not defined they are
just other names for the ordinary variables c and i. */
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb; /* Block chained onto eptrb when match() is entered for MATCH_CBEGROUP */
406    
407 ph10 164 /* Where to jump back to: the RMATCH call number (rw), stored by RMATCH in
the calling frame just before the new frame is created */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr is
at or beyond md->end_subject, and eptr is beyond md->start_used_ptr, set
md->hitend; if md->partial is greater than 1, also leave match() with
PCRE_ERROR_PARTIAL via MRRETURN. The macro expands to a bare "if" statement
with no "else", and needs md, eptr and markptr to be in scope. */
436     #define CHECK_PARTIAL()\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 ph10 427 }
443 ph10 426
/* SCHECK_PARTIAL: the same, but without the test against md->end_subject, for
use where the caller already knows that the end of the subject has been
passed. */
444     #define SCHECK_PARTIAL()\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779     the branch in which it occurs can be determined. Overload the start of
780     match pointer to do this. */
781 ph10 512
782 ph10 210 case OP_THEN:
783     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 ph10 604 eptrb, RM54);
785 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 ph10 716 md->start_match_ptr = ecode;
787 ph10 510 MRRETURN(MATCH_THEN);
788    
789     case OP_THEN_ARG:
790 ph10 716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791     md, eptrb, RM58);
792 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 ph10 716 md->start_match_ptr = ecode;
794     md->mark = ecode + 2;
795 ph10 212 RRETURN(MATCH_THEN);
796 ph10 211
797 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
798     unlimited repeat. If there is space in the offset vector, save the current
799     subject position in the working slot at the top of the vector. We mustn't
800     change the current values of the data slot, because they may be set from a
801     previous iteration of this group, and be referred to by a reference inside
802 ph10 625 the group. A failure to match might occur after the group has succeeded,
803 ph10 617 if something later on doesn't match. For this reason, we need to restore
804     the working value and also the values of the final offsets, in case they
805     were set by a previous iteration of the same bracket.
806 nigel 77
807 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
808     a non-capturing bracket. Don't worry about setting the flag for the error
809     case here; that is handled in the code for KET. */
810 nigel 77
811 nigel 93 case OP_CBRA:
812     case OP_SCBRA:
813     number = GET2(ecode, 1+LINK_SIZE);
814 nigel 77 offset = number << 1;
815 ph10 625
816 ph10 475 #ifdef PCRE_DEBUG
817 nigel 93 printf("start bracket %d\n", number);
818     printf("subject=");
819 nigel 77 pchars(eptr, 16, TRUE, md);
820     printf("\n");
821     #endif
822    
823     if (offset < md->offset_max)
824     {
825     save_offset1 = md->offset_vector[offset];
826     save_offset2 = md->offset_vector[offset+1];
827     save_offset3 = md->offset_vector[md->offset_end - number];
828     save_capture_last = md->capture_last;
829    
830     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
831 ph10 531 md->offset_vector[md->offset_end - number] =
832 ph10 530 (int)(eptr - md->start_subject);
833 nigel 77
834 ph10 604 for (;;)
835 nigel 77 {
836 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
837     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
838 ph10 604 eptrb, RM1);
839 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
840 ph10 716
841     /* If we backed up to a THEN, check whether it is within the current
842     branch by comparing the address of the THEN that is passed back with
843     the end of the branch. If it is within the current branch, and the
844     branch is one of two or more alternatives (it either starts or ends
845     with OP_ALT), we have reached the limit of THEN's action, so convert
846     the return code to NOMATCH, which will cause normal backtracking to
847     happen from now on. Otherwise, THEN is passed back to an outer
848     alternative. This implements Perl's treatment of parenthesized groups,
849     where a group not containing | does not affect the current alternative,
850     that is, (X) is NOT the same as (X|(*F)). */
851    
852     if (rrc == MATCH_THEN)
853     {
854     next = ecode + GET(ecode,1);
855     if (md->start_match_ptr < next &&
856     (*ecode == OP_ALT || *next == OP_ALT))
857     rrc = MATCH_NOMATCH;
858     }
859    
860     /* Anything other than NOMATCH is passed back. */
861    
862     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863 nigel 77 md->capture_last = save_capture_last;
864     ecode += GET(ecode, 1);
865 ph10 625 if (*ecode != OP_ALT) break;
866 nigel 77 }
867    
868     DPRINTF(("bracket %d failed\n", number));
869     md->offset_vector[offset] = save_offset1;
870     md->offset_vector[offset+1] = save_offset2;
871     md->offset_vector[md->offset_end - number] = save_offset3;
872 ph10 625
873 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
874 nigel 77
875 ph10 716 if (md->mark == NULL) md->mark = markptr;
876     RRETURN(rrc);
877 nigel 77 }
878    
879 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
880     as a non-capturing bracket. */
881 nigel 77
882 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
883     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
884    
885 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
886 nigel 77
887 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
888     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
889    
890 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
891 ph10 708 repeat. Loop for all the alternatives.
892    
893 ph10 702 When we get to the final alternative within the brackets, we used to return
894     the result of a recursive call to match() whatever happened so it was
895     possible to reduce stack usage by turning this into a tail recursion,
896     except in the case of a possibly empty group. However, now that there is
897     the possibility of (*THEN) occurring in the final alternative, this
898     optimization is no longer always possible.
899 ph10 625
900 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
901     this is the best that can be done.
902    
903 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
904     reached, but subsequent matching fails. It passes back up the tree (causing
905     captured values to be reset) until the original atomic group level is
906 ph10 618 reached. This is tested by comparing md->once_target with the start of the
907     group. At this point, the return is converted into MATCH_NOMATCH so that
908     previous backup points can be taken. */
909 nigel 77
910 ph10 618 case OP_ONCE:
911 nigel 93 case OP_BRA:
912     case OP_SBRA:
913     DPRINTF(("start non-capturing bracket\n"));
914 ph10 618
915 nigel 91 for (;;)
916 nigel 77 {
917 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
918 ph10 702
919     /* If this is not a possibly empty group, and there are no (*THEN)s in
920 ph10 708 the pattern, and this is the final alternative, optimize as described
921 ph10 702 above. */
922    
923     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
924     {
925     ecode += _pcre_OP_lengths[*ecode];
926     goto TAIL_RECURSE;
927 ph10 708 }
928 ph10 702
929     /* In all other cases, we have to make another call to match(). */
930    
931 ph10 708 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
932 ph10 604 RM2);
933 ph10 716
934     /* See comment in the code for capturing groups above about handling
935     THEN. */
936    
937     if (rrc == MATCH_THEN)
938 ph10 625 {
939 ph10 716 next = ecode + GET(ecode,1);
940     if (md->start_match_ptr < next &&
941     (*ecode == OP_ALT || *next == OP_ALT))
942     rrc = MATCH_NOMATCH;
943     }
944    
945     if (rrc != MATCH_NOMATCH)
946     {
947 ph10 618 if (rrc == MATCH_ONCE)
948     {
949     const uschar *scode = ecode;
950     if (*scode != OP_ONCE) /* If not at start, find it */
951     {
952     while (*scode == OP_ALT) scode += GET(scode, 1);
953     scode -= GET(scode, 1);
954 ph10 625 }
955 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
956 ph10 625 }
957 ph10 550 RRETURN(rrc);
958 ph10 625 }
959 nigel 77 ecode += GET(ecode, 1);
960 ph10 625 if (*ecode != OP_ALT) break;
961 nigel 77 }
962 ph10 716
963     if (md->mark == NULL) md->mark = markptr;
964 ph10 609 RRETURN(MATCH_NOMATCH);
965    
966 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
967 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
968     handled similarly to the normal case above. However, the matching is
969     different. The end of these brackets will always be OP_KETRPOS, which
970     returns MATCH_KETRPOS without going further in the pattern. By this means
971     we can handle the group by iteration rather than recursion, thereby
972     reducing the amount of stack needed. */
973 ph10 625
974 ph10 604 case OP_CBRAPOS:
975     case OP_SCBRAPOS:
976     allow_zero = FALSE;
977 ph10 625
978 ph10 604 POSSESSIVE_CAPTURE:
979     number = GET2(ecode, 1+LINK_SIZE);
980     offset = number << 1;
981    
982     #ifdef PCRE_DEBUG
983     printf("start possessive bracket %d\n", number);
984     printf("subject=");
985     pchars(eptr, 16, TRUE, md);
986     printf("\n");
987     #endif
988    
989     if (offset < md->offset_max)
990     {
991     matched_once = FALSE;
992 ph10 625 code_offset = ecode - md->start_code;
993 ph10 604
994     save_offset1 = md->offset_vector[offset];
995     save_offset2 = md->offset_vector[offset+1];
996     save_offset3 = md->offset_vector[md->offset_end - number];
997     save_capture_last = md->capture_last;
998    
999     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1000 ph10 625
1001     /* Each time round the loop, save the current subject position for use
1002     when the group matches. For MATCH_MATCH, the group has matched, so we
1003     restart it with a new subject starting position, remembering that we had
1004     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1005     usual. If we haven't matched any alternatives in any iteration, check to
1006     see if a previous iteration matched. If so, the group has matched;
1007     continue from afterwards. Otherwise it has failed; restore the previous
1008 ph10 604 capture values before returning NOMATCH. */
1009 ph10 625
1010 ph10 604 for (;;)
1011     {
1012     md->offset_vector[md->offset_end - number] =
1013     (int)(eptr - md->start_subject);
1014 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1015 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1016     eptrb, RM63);
1017     if (rrc == MATCH_KETRPOS)
1018     {
1019     offset_top = md->end_offset_top;
1020     eptr = md->end_match_ptr;
1021 ph10 625 ecode = md->start_code + code_offset;
1022 ph10 604 save_capture_last = md->capture_last;
1023 ph10 625 matched_once = TRUE;
1024     continue;
1025     }
1026 ph10 716
1027     /* See comment in the code for capturing groups above about handling
1028     THEN. */
1029    
1030     if (rrc == MATCH_THEN)
1031     {
1032     next = ecode + GET(ecode,1);
1033     if (md->start_match_ptr < next &&
1034     (*ecode == OP_ALT || *next == OP_ALT))
1035     rrc = MATCH_NOMATCH;
1036     }
1037    
1038     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1039 ph10 604 md->capture_last = save_capture_last;
1040     ecode += GET(ecode, 1);
1041 ph10 625 if (*ecode != OP_ALT) break;
1042 ph10 604 }
1043 ph10 610
1044 ph10 604 if (!matched_once)
1045 ph10 625 {
1046 ph10 604 md->offset_vector[offset] = save_offset1;
1047     md->offset_vector[offset+1] = save_offset2;
1048     md->offset_vector[md->offset_end - number] = save_offset3;
1049     }
1050 ph10 625
1051 ph10 716 if (md->mark == NULL) md->mark = markptr;
1052 ph10 604 if (allow_zero || matched_once)
1053 ph10 625 {
1054 ph10 604 ecode += 1 + LINK_SIZE;
1055     break;
1056 ph10 625 }
1057    
1058 ph10 604 RRETURN(MATCH_NOMATCH);
1059     }
1060 ph10 625
1061 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1062     as a non-capturing bracket. */
1063    
1064     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1065     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1066    
1067     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1068    
1069     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1070     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1071    
1072 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1073 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1074     without the capturing complication. It is written out separately for speed
1075     and cleanliness. */
1076    
1077     case OP_BRAPOS:
1078     case OP_SBRAPOS:
1079 ph10 625 allow_zero = FALSE;
1080    
1081 ph10 604 POSSESSIVE_NON_CAPTURE:
1082     matched_once = FALSE;
1083 ph10 625 code_offset = ecode - md->start_code;
1084 ph10 604
1085     for (;;)
1086     {
1087 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1088 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1089 ph10 609 eptrb, RM48);
1090 ph10 604 if (rrc == MATCH_KETRPOS)
1091     {
1092 ph10 610 offset_top = md->end_offset_top;
1093 ph10 604 eptr = md->end_match_ptr;
1094 ph10 625 ecode = md->start_code + code_offset;
1095     matched_once = TRUE;
1096     continue;
1097     }
1098 ph10 716
1099     /* See comment in the code for capturing groups above about handling
1100     THEN. */
1101    
1102     if (rrc == MATCH_THEN)
1103     {
1104     next = ecode + GET(ecode,1);
1105     if (md->start_match_ptr < next &&
1106     (*ecode == OP_ALT || *next == OP_ALT))
1107     rrc = MATCH_NOMATCH;
1108     }
1109    
1110     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111 ph10 604 ecode += GET(ecode, 1);
1112 ph10 625 if (*ecode != OP_ALT) break;
1113 ph10 604 }
1114 ph10 625
1115     if (matched_once || allow_zero)
1116 ph10 604 {
1117     ecode += 1 + LINK_SIZE;
1118     break;
1119 ph10 625 }
1120 ph10 604 RRETURN(MATCH_NOMATCH);
1121    
1122     /* Control never reaches here. */
1123    
1124 nigel 77 /* Conditional group: compilation checked that there are no more than
1125     two branches. If the condition is false, skipping the first branch takes us
1126     past the end if there is only one branch, but that's OK because that is
1127 ph10 609 exactly what going to the ket would do. */
1128 nigel 77
1129     case OP_COND:
1130 nigel 93 case OP_SCOND:
1131 ph10 604 codelink = GET(ecode, 1);
1132 ph10 406
1133 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1134     inserted between OP_COND and an assertion condition. */
1135 ph10 392
1136 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1137     {
1138     if (pcre_callout != NULL)
1139     {
1140     pcre_callout_block cb;
1141 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1142 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1143     cb.offset_vector = md->offset_vector;
1144     cb.subject = (PCRE_SPTR)md->start_subject;
1145 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1146     cb.start_match = (int)(mstart - md->start_subject);
1147     cb.current_position = (int)(eptr - md->start_subject);
1148 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1149     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1150     cb.capture_top = offset_top/2;
1151     cb.capture_last = md->capture_last;
1152     cb.callout_data = md->callout_data;
1153 ph10 654 cb.mark = markptr;
1154 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1155 ph10 381 if (rrc < 0) RRETURN(rrc);
1156     }
1157     ecode += _pcre_OP_lengths[OP_CALLOUT];
1158     }
1159 ph10 392
1160 ph10 399 condcode = ecode[LINK_SIZE+1];
1161 ph10 406
1162 ph10 381 /* Now see what the actual condition is */
1163 ph10 392
1164 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1165 nigel 77 {
1166 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1167     {
1168 ph10 461 condition = FALSE;
1169     ecode += GET(ecode, 1);
1170     }
1171 ph10 459 else
1172 ph10 461 {
1173 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1174     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1175 ph10 461
1176 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1177     false, but the test was set up by name, scan the table to see if the
1178     name refers to any other numbers, and test them. The condition is true
1179     if any one is set. */
1180 ph10 461
1181 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1182     {
1183     uschar *slotA = md->name_table;
1184     for (i = 0; i < md->name_count; i++)
1185 ph10 461 {
1186     if (GET2(slotA, 0) == recno) break;
1187 ph10 459 slotA += md->name_entry_size;
1188     }
1189 ph10 461
1190 ph10 459 /* Found a name for the number - there can be only one; duplicate
1191     names for different numbers are allowed, but not vice versa. First
1192     scan down for duplicates. */
1193 ph10 461
1194 ph10 459 if (i < md->name_count)
1195 ph10 461 {
1196 ph10 459 uschar *slotB = slotA;
1197     while (slotB > md->name_table)
1198     {
1199     slotB -= md->name_entry_size;
1200     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1201     {
1202     condition = GET2(slotB, 0) == md->recursive->group_num;
1203 ph10 461 if (condition) break;
1204     }
1205 ph10 459 else break;
1206 ph10 461 }
1207    
1208 ph10 459 /* Scan up for duplicates */
1209 ph10 461
1210 ph10 459 if (!condition)
1211 ph10 461 {
1212 ph10 459 slotB = slotA;
1213     for (i++; i < md->name_count; i++)
1214     {
1215     slotB += md->name_entry_size;
1216     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1217     {
1218     condition = GET2(slotB, 0) == md->recursive->group_num;
1219     if (condition) break;
1220 ph10 461 }
1221 ph10 459 else break;
1222 ph10 461 }
1223     }
1224 ph10 459 }
1225 ph10 461 }
1226    
1227 ph10 459 /* Choose branch according to the condition */
1228 ph10 461
1229 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1230     }
1231 ph10 461 }
1232 nigel 93
1233 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1234 nigel 93 {
1235 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1236 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1237 ph10 461
1238 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1239 ph10 461 scan the table to see if the name refers to any other numbers, and test
1240     them. The condition is true if any one is set. This is tediously similar
1241     to the code above, but not close enough to try to amalgamate. */
1242    
1243 ph10 459 if (!condition && condcode == OP_NCREF)
1244     {
1245 ph10 461 int refno = offset >> 1;
1246 ph10 459 uschar *slotA = md->name_table;
1247 ph10 461
1248 ph10 459 for (i = 0; i < md->name_count; i++)
1249 ph10 461 {
1250     if (GET2(slotA, 0) == refno) break;
1251 ph10 459 slotA += md->name_entry_size;
1252     }
1253 ph10 461
1254     /* Found a name for the number - there can be only one; duplicate names
1255     for different numbers are allowed, but not vice versa. First scan down
1256 ph10 459 for duplicates. */
1257 ph10 461
1258 ph10 459 if (i < md->name_count)
1259 ph10 461 {
1260 ph10 459 uschar *slotB = slotA;
1261     while (slotB > md->name_table)
1262     {
1263     slotB -= md->name_entry_size;
1264     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1265     {
1266     offset = GET2(slotB, 0) << 1;
1267 ph10 461 condition = offset < offset_top &&
1268 ph10 459 md->offset_vector[offset] >= 0;
1269 ph10 461 if (condition) break;
1270     }
1271 ph10 459 else break;
1272 ph10 461 }
1273    
1274 ph10 459 /* Scan up for duplicates */
1275 ph10 461
1276 ph10 459 if (!condition)
1277 ph10 461 {
1278 ph10 459 slotB = slotA;
1279     for (i++; i < md->name_count; i++)
1280     {
1281     slotB += md->name_entry_size;
1282     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1283     {
1284     offset = GET2(slotB, 0) << 1;
1285 ph10 461 condition = offset < offset_top &&
1286 ph10 459 md->offset_vector[offset] >= 0;
1287 ph10 461 if (condition) break;
1288     }
1289 ph10 459 else break;
1290 ph10 461 }
1291     }
1292 ph10 459 }
1293 ph10 461 }
1294    
1295 ph10 459 /* Choose branch according to the condition */
1296    
1297 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1298 nigel 77 }
1299    
1300 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1301 nigel 93 {
1302     condition = FALSE;
1303     ecode += GET(ecode, 1);
1304     }
1305    
1306 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1307 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1308     an assertion. */
1309 nigel 77
1310     else
1311     {
1312 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1313 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1314 nigel 77 if (rrc == MATCH_MATCH)
1315     {
1316 ph10 619 if (md->end_offset_top > offset_top)
1317     offset_top = md->end_offset_top; /* Captures may have happened */
1318 nigel 93 condition = TRUE;
1319     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1320 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1321     }
1322 ph10 716
1323     /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1324     assertion; it is therefore treated as NOMATCH. */
1325    
1326     else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1327 nigel 77 {
1328     RRETURN(rrc); /* Need braces because of following else */
1329     }
1330 nigel 93 else
1331     {
1332     condition = FALSE;
1333 ph10 399 ecode += codelink;
1334 nigel 93 }
1335     }
1336 nigel 91
1337 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1338     use tail recursion to avoid using another stack frame, except when there is
1339     unlimited repeat of a possibly empty group. In the latter case, a recursive
1340     call to match() is always required, unless the second alternative doesn't
1341     exist, in which case we can just plough on. Note that, for compatibility
1342     with Perl, the | in a conditional group is NOT treated as creating two
1343     alternatives. If a THEN is encountered in the branch, it propagates out to
1344     the enclosing alternative (unless nested in a deeper set of alternatives,
1345     of course). */
1346 nigel 91
1347 nigel 93 if (condition || *ecode == OP_ALT)
1348     {
1349 ph10 716 if (op != OP_SCOND)
1350 ph10 702 {
1351     ecode += 1 + LINK_SIZE;
1352     goto TAIL_RECURSE;
1353 ph10 708 }
1354 ph10 716
1355     md->match_function_type = MATCH_CBEGROUP;
1356 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1357     RRETURN(rrc);
1358 nigel 77 }
1359 ph10 708
1360 ph10 702 /* Condition false & no alternative; continue after the group. */
1361 ph10 708
1362 ph10 702 else
1363 nigel 93 {
1364     ecode += 1 + LINK_SIZE;
1365     }
1366     break;
1367 nigel 77
1368 ph10 461
1369 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1370     to close any currently open capturing brackets. */
1371 ph10 461
1372 ph10 447 case OP_CLOSE:
1373 ph10 461 number = GET2(ecode, 1);
1374 ph10 447 offset = number << 1;
1375 ph10 461
1376 ph10 475 #ifdef PCRE_DEBUG
1377 ph10 447 printf("end bracket %d at *ACCEPT", number);
1378     printf("\n");
1379     #endif
1380 nigel 77
1381 ph10 447 md->capture_last = number;
1382     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1383     {
1384     md->offset_vector[offset] =
1385     md->offset_vector[md->offset_end - number];
1386 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1387 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1388     }
1389     ecode += 3;
1390 ph10 461 break;
1391 ph10 447
1392    
1393 ph10 619 /* End of the pattern, either real or forced. */
1394 nigel 77
1395 ph10 619 case OP_END:
1396 ph10 210 case OP_ACCEPT:
1397 ph10 625 case OP_ASSERT_ACCEPT:
1398    
1399 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1400     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1401 ph10 613 is set and we have matched at the start of the subject. In both cases,
1402     backtracking will then try other alternatives, if any. */
1403 ph10 443
1404 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1405 ph10 618 md->recursive == NULL &&
1406 ph10 619 (md->notempty ||
1407     (md->notempty_atstart &&
1408     mstart == md->start_subject + md->start_offset)))
1409 ph10 510 MRRETURN(MATCH_NOMATCH);
1410 ph10 443
1411 ph10 442 /* Otherwise, we have a match. */
1412 ph10 625
1413 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1414     md->end_offset_top = offset_top; /* and how many extracts were taken */
1415 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1416 nigel 77
1417 ph10 512 /* For some reason, the macros don't work properly if an expression is
1418     given as the argument to MRRETURN when the heap is in use. */
1419    
1420     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1421     MRRETURN(rrc);
1422    
1423 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1424     matching won't pass the KET for an assertion. If any one branch matches,
1425     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1426     start of each branch to move the current point backwards, so the code at
1427 ph10 625 this level is identical to the lookahead case. When the assertion is part
1428     of a condition, we want to return immediately afterwards. The caller of
1429     this incarnation of the match() function will have set MATCH_CONDASSERT in
1430     md->match_function_type, and one of these opcodes will be the first opcode
1431     that is processed. We use a local variable that is preserved over calls to
1432 ph10 604 match() to remember this case. */
1433 nigel 77
1434     case OP_ASSERT:
1435     case OP_ASSERTBACK:
1436 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1437     {
1438     condassert = TRUE;
1439     md->match_function_type = 0;
1440     }
1441 ph10 625 else condassert = FALSE;
1442    
1443 nigel 77 do
1444     {
1445 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1446 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1447 ph10 500 {
1448     mstart = md->start_match_ptr; /* In case \K reset it */
1449 ph10 630 markptr = md->mark;
1450 ph10 500 break;
1451 ph10 501 }
1452 ph10 716
1453     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1454     as NOMATCH. */
1455    
1456     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1457 nigel 77 ecode += GET(ecode, 1);
1458     }
1459     while (*ecode == OP_ALT);
1460 ph10 625
1461 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1462 nigel 77
1463     /* If checking an assertion for a condition, return MATCH_MATCH. */
1464    
1465 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1466 nigel 77
1467     /* Continue from after the assertion, updating the offsets high water
1468     mark, since extracts may have been taken during the assertion. */
1469    
1470     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471     ecode += 1 + LINK_SIZE;
1472     offset_top = md->end_offset_top;
1473     continue;
1474    
1475 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1476 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1477 ph10 473 branches. */
1478 nigel 77
1479     case OP_ASSERT_NOT:
1480     case OP_ASSERTBACK_NOT:
1481 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1482     {
1483     condassert = TRUE;
1484     md->match_function_type = 0;
1485     }
1486 ph10 625 else condassert = FALSE;
1487 ph10 604
1488 nigel 77 do
1489     {
1490 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1491 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1492 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1493     {
1494     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1495 ph10 482 break;
1496     }
1497 ph10 716
1498     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1499     as NOMATCH. */
1500    
1501     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1502 nigel 77 ecode += GET(ecode,1);
1503     }
1504     while (*ecode == OP_ALT);
1505    
1506 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1507 ph10 625
1508 nigel 77 ecode += 1 + LINK_SIZE;
1509     continue;
1510    
1511     /* Move the subject pointer back. This occurs only at the start of
1512     each branch of a lookbehind assertion. If we are too close to the start to
1513     move back, this match function fails. When working with UTF-8 we move
1514     back a number of characters, not bytes. */
1515    
1516     case OP_REVERSE:
1517     #ifdef SUPPORT_UTF8
1518     if (utf8)
1519     {
1520 nigel 93 i = GET(ecode, 1);
1521     while (i-- > 0)
1522 nigel 77 {
1523     eptr--;
1524 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1525 ph10 207 BACKCHAR(eptr);
1526 nigel 77 }
1527     }
1528     else
1529     #endif
1530    
1531     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1532    
1533     {
1534 nigel 93 eptr -= GET(ecode, 1);
1535 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1536 nigel 77 }
1537    
1538 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1539 nigel 77
1540 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1541 nigel 77 ecode += 1 + LINK_SIZE;
1542     break;
1543    
1544     /* The callout item calls an external function, if one is provided, passing
1545     details of the match so far. This is mainly for debugging, though the
1546     function is able to force a failure. */
1547    
1548     case OP_CALLOUT:
1549     if (pcre_callout != NULL)
1550     {
1551     pcre_callout_block cb;
1552 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1553 nigel 77 cb.callout_number = ecode[1];
1554     cb.offset_vector = md->offset_vector;
1555 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1556 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1557     cb.start_match = (int)(mstart - md->start_subject);
1558     cb.current_position = (int)(eptr - md->start_subject);
1559 nigel 77 cb.pattern_position = GET(ecode, 2);
1560     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1561     cb.capture_top = offset_top/2;
1562     cb.capture_last = md->capture_last;
1563     cb.callout_data = md->callout_data;
1564 ph10 654 cb.mark = markptr;
1565 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1566 nigel 77 if (rrc < 0) RRETURN(rrc);
1567     }
1568     ecode += 2 + 2*LINK_SIZE;
1569     break;
1570    
1571     /* Recursion either matches the current regex, or some subexpression. The
1572     offset data is the offset to the starting bracket from the start of the
1573     whole pattern. (This is so that it works from duplicated subpatterns.)
1574 ph10 625
1575 ph10 618 The state of the capturing groups is preserved over recursion, and
1576 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1577 ph10 618 finished (offset_top records the completed total) so we just have to save
1578     all the potential data. There may be up to 65535 such values, which is too
1579     large to put on the stack, but using malloc for small numbers seems
1580     expensive. As a compromise, the stack is used when there are no more than
1581     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1582 nigel 77
1583     There are also other values that have to be saved. We use a chained
1584     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1585 ph10 625 for the original version of this logic. It has, however, been hacked around
1586 ph10 618 a lot, so he is not to blame for the current way it works. */
1587 nigel 77
1588     case OP_RECURSE:
1589     {
1590 ph10 642 recursion_info *ri;
1591     int recno;
1592 ph10 654
1593 nigel 77 callpat = md->start_code + GET(ecode, 1);
1594 ph10 642 recno = (callpat == md->start_code)? 0 :
1595 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1596    
1597     /* Check for repeating a recursion without advancing the subject pointer.
1598 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1599 ph10 654 caught at compile time.) */
1600    
1601 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1602 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1603 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1604 nigel 77
1605     /* Add to "recursing stack" */
1606    
1607 ph10 642 new_recursive.group_num = recno;
1608     new_recursive.subject_position = eptr;
1609 nigel 77 new_recursive.prevrec = md->recursive;
1610     md->recursive = &new_recursive;
1611    
1612 ph10 618 /* Where to continue from afterwards */
1613 nigel 77
1614     ecode += 1 + LINK_SIZE;
1615    
1616 ph10 618 /* Now save the offset data */
1617 nigel 77
1618     new_recursive.saved_max = md->offset_end;
1619     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1620     new_recursive.offset_save = stacksave;
1621     else
1622     {
1623     new_recursive.offset_save =
1624     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1625     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1626     }
1627     memcpy(new_recursive.offset_save, md->offset_vector,
1628     new_recursive.saved_max * sizeof(int));
1629 ph10 625
1630 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1631 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1632 ph10 618 might be changed, so reset it before looping. */
1633 nigel 77
1634     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1635 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1636 nigel 77 do
1637     {
1638 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1639 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1640 ph10 604 md, eptrb, RM6);
1641 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1642     new_recursive.saved_max * sizeof(int));
1643 ph10 681 md->recursive = new_recursive.prevrec;
1644 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1645 nigel 77 {
1646 nigel 87 DPRINTF(("Recursion matched\n"));
1647 nigel 77 if (new_recursive.offset_save != stacksave)
1648     (pcre_free)(new_recursive.offset_save);
1649 ph10 618
1650     /* Set where we got to in the subject, and reset the start in case
1651 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1652     for Perl compatibility. */
1653    
1654 ph10 618 eptr = md->end_match_ptr;
1655     mstart = md->start_match_ptr;
1656     goto RECURSION_MATCHED; /* Exit loop; end processing */
1657 nigel 77 }
1658 ph10 716
1659     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1660     as NOMATCH. */
1661    
1662     else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1663 nigel 87 {
1664     DPRINTF(("Recursion gave error %d\n", rrc));
1665 ph10 400 if (new_recursive.offset_save != stacksave)
1666     (pcre_free)(new_recursive.offset_save);
1667 nigel 87 RRETURN(rrc);
1668     }
1669 nigel 77
1670     md->recursive = &new_recursive;
1671     callpat += GET(callpat, 1);
1672     }
1673     while (*callpat == OP_ALT);
1674    
1675     DPRINTF(("Recursion didn't match\n"));
1676     md->recursive = new_recursive.prevrec;
1677     if (new_recursive.offset_save != stacksave)
1678     (pcre_free)(new_recursive.offset_save);
1679 ph10 510 MRRETURN(MATCH_NOMATCH);
1680 nigel 77 }
1681 ph10 625
1682 ph10 618 RECURSION_MATCHED:
1683     break;
1684 nigel 77
1685     /* An alternation is the end of a branch; scan along to find the end of the
1686     bracketed group and go to there. */
1687    
1688     case OP_ALT:
1689     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1690     break;
1691    
1692 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1693     indicating that it may occur zero times. It may repeat infinitely, or not
1694     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1695     with fixed upper repeat limits are compiled as a number of copies, with the
1696     optional ones preceded by BRAZERO or BRAMINZERO. */
1697 ph10 625
1698 nigel 77 case OP_BRAZERO:
1699 ph10 604 next = ecode + 1;
1700     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1701     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1702     do next += GET(next, 1); while (*next == OP_ALT);
1703     ecode = next + 1 + LINK_SIZE;
1704 nigel 77 break;
1705 ph10 625
1706 nigel 77 case OP_BRAMINZERO:
1707 ph10 604 next = ecode + 1;
1708     do next += GET(next, 1); while (*next == OP_ALT);
1709     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1710     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1711     ecode++;
1712 nigel 77 break;
1713    
1714 ph10 335 case OP_SKIPZERO:
1715 ph10 604 next = ecode+1;
1716     do next += GET(next,1); while (*next == OP_ALT);
1717     ecode = next + 1 + LINK_SIZE;
1718 ph10 335 break;
1719 ph10 625
1720 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1721     here; just jump to the group, with allow_zero set TRUE. */
1722 ph10 625
1723 ph10 604 case OP_BRAPOSZERO:
1724 ph10 625 op = *(++ecode);
1725 ph10 604 allow_zero = TRUE;
1726     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1727     goto POSSESSIVE_NON_CAPTURE;
1728 ph10 335
1729 nigel 93 /* End of a group, repeated or non-repeating. */
1730 nigel 77
1731     case OP_KET:
1732     case OP_KETRMIN:
1733     case OP_KETRMAX:
1734 ph10 625 case OP_KETRPOS:
1735 nigel 91 prev = ecode - GET(ecode, 1);
1736 ph10 625
1737 nigel 93 /* If this was a group that remembered the subject start, in order to break
1738     infinite repeats of empty string matches, retrieve the subject start from
1739     the chain. Otherwise, set it NULL. */
1740 nigel 77
1741 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1742 nigel 93 {
1743     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1744     eptrb = eptrb->epb_prev; /* Backup to previous group */
1745     }
1746     else saved_eptr = NULL;
1747 nigel 77
1748 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1749     MATCH_MATCH, but record the current high water mark for use by positive
1750     assertions. We also need to record the match start in case it was changed
1751     by \K. */
1752 nigel 93
1753 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1754 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1755 nigel 91 {
1756     md->end_match_ptr = eptr; /* For ONCE */
1757     md->end_offset_top = offset_top;
1758 ph10 500 md->start_match_ptr = mstart;
1759 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1760 nigel 91 }
1761 nigel 77
1762 nigel 93 /* For capturing groups we have to check the group number back at the start
1763     and if necessary complete handling an extraction by setting the offsets and
1764 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1765     into group 0, so it won't be picked up here. Instead, we catch it when the
1766     OP_END is reached. Other recursion is handled here. We just have to record
1767     the current subject position and start match pointer and give a MATCH
1768     return. */
1769 nigel 77
1770 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1771     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1772 nigel 91 {
1773 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1774 nigel 91 offset = number << 1;
1775 ph10 461
1776 ph10 475 #ifdef PCRE_DEBUG
1777 nigel 91 printf("end bracket %d", number);
1778     printf("\n");
1779 nigel 77 #endif
1780    
1781 ph10 618 /* Handle a recursively called group. */
1782    
1783     if (md->recursive != NULL && md->recursive->group_num == number)
1784     {
1785     md->end_match_ptr = eptr;
1786     md->start_match_ptr = mstart;
1787     RRETURN(MATCH_MATCH);
1788     }
1789    
1790     /* Deal with capturing */
1791    
1792 nigel 93 md->capture_last = number;
1793     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1794 nigel 91 {
1795 ph10 625 /* If offset is greater than offset_top, it means that we are
1796     "skipping" a capturing group, and that group's offsets must be marked
1797     unset. In earlier versions of PCRE, all the offsets were unset at the
1798     start of matching, but this doesn't work because atomic groups and
1799 ph10 615 assertions can cause a value to be set that should later be unset.
1800     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1801 ph10 625 part of the atomic group, but this is not on the final matching path,
1802     so must be unset when 2 is set. (If there is no group 2, there is no
1803 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1804 ph10 625
1805 ph10 615 if (offset > offset_top)
1806     {
1807     register int *iptr = md->offset_vector + offset_top;
1808     register int *iend = md->offset_vector + offset;
1809     while (iptr < iend) *iptr++ = -1;
1810 ph10 625 }
1811    
1812 ph10 615 /* Now make the extraction */
1813    
1814 nigel 93 md->offset_vector[offset] =
1815     md->offset_vector[md->offset_end - number];
1816 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1817 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1818     }
1819 nigel 91 }
1820 nigel 77
1821 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1822     also happens for a repeating ket if no characters were matched in the
1823     group. This is the forcible breaking of infinite loops as implemented in
1824 ph10 625 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1825     processing the rest of the pattern at a lower level. If this results in a
1826     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1827     bypassing intermediate backup points, but resetting any captures that
1828 ph10 618 happened along the way. */
1829 nigel 77
1830 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1831     {
1832 ph10 618 if (*prev == OP_ONCE)
1833     {
1834     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1835     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1836     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1837 ph10 625 RRETURN(MATCH_ONCE);
1838     }
1839 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1840 nigel 91 break;
1841     }
1842 ph10 625
1843     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1844 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1845     at a time from the outer level, thus saving stack. */
1846 ph10 625
1847 ph10 604 if (*ecode == OP_KETRPOS)
1848 ph10 625 {
1849 ph10 604 md->end_match_ptr = eptr;
1850 ph10 625 md->end_offset_top = offset_top;
1851 ph10 604 RRETURN(MATCH_KETRPOS);
1852 ph10 625 }
1853 nigel 77
1854 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1855     the preceding bracket, in the appropriate order. In the second case, we can
1856     use tail recursion to avoid using another stack frame, unless we have an
1857 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1858     string. */
1859 nigel 77
1860 nigel 91 if (*ecode == OP_KETRMIN)
1861     {
1862 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1863 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 ph10 618 if (*prev == OP_ONCE)
1865     {
1866 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1867 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1868     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1869 ph10 625 RRETURN(MATCH_ONCE);
1870     }
1871 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1872 ph10 197 {
1873 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1874 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1875 ph10 197 RRETURN(rrc);
1876     }
1877 nigel 91 ecode = prev;
1878     goto TAIL_RECURSE;
1879 nigel 77 }
1880 nigel 91 else /* OP_KETRMAX */
1881     {
1882 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1883 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1884 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1885 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 ph10 618 if (*prev == OP_ONCE)
1887     {
1888 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1889 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1890     md->once_target = prev;
1891 ph10 625 RRETURN(MATCH_ONCE);
1892     }
1893 nigel 91 ecode += 1 + LINK_SIZE;
1894     goto TAIL_RECURSE;
1895     }
1896     /* Control never gets here */
1897 nigel 77
1898 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1899 nigel 77
1900     case OP_CIRC:
1901 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1902 ph10 625
1903 nigel 77 /* Start of subject assertion */
1904    
1905     case OP_SOD:
1906 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1907 nigel 77 ecode++;
1908     break;
1909 ph10 625
1910 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1911 nigel 77
1912 ph10 602 case OP_CIRCM:
1913     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1914     if (eptr != md->start_subject &&
1915     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1916     MRRETURN(MATCH_NOMATCH);
1917     ecode++;
1918     break;
1919    
1920 nigel 77 /* Start of match assertion */
1921    
1922     case OP_SOM:
1923 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1924 nigel 77 ecode++;
1925     break;
1926 ph10 172
1927 ph10 168 /* Reset the start of match point */
1928 ph10 172
1929 ph10 168 case OP_SET_SOM:
1930     mstart = eptr;
1931 ph10 172 ecode++;
1932     break;
1933 nigel 77
1934 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1935     unless noteol is set. */
1936 nigel 77
1937 ph10 602 case OP_DOLLM:
1938     if (eptr < md->end_subject)
1939     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1940     else
1941 nigel 77 {
1942 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1943 ph10 602 SCHECK_PARTIAL();
1944 nigel 77 }
1945 ph10 602 ecode++;
1946     break;
1947 ph10 579
1948 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
1949 ph10 602 subject unless noteol is set. */
1950    
1951     case OP_DOLL:
1952     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1953     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1954    
1955 nigel 91 /* ... else fall through for endonly */
1956 nigel 77
1957     /* End of subject assertion (\z) */
1958    
1959     case OP_EOD:
1960 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1961 ph10 553 SCHECK_PARTIAL();
1962 nigel 77 ecode++;
1963     break;
1964    
1965     /* End of subject or ending \n assertion (\Z) */
1966    
1967     case OP_EODN:
1968 ph10 553 ASSERT_NL_OR_EOS:
1969     if (eptr < md->end_subject &&
1970 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1971 ph10 510 MRRETURN(MATCH_NOMATCH);
1972 ph10 579
1973 ph10 553 /* Either at end of string or \n before end. */
1974 ph10 579
1975 ph10 553 SCHECK_PARTIAL();
1976 nigel 77 ecode++;
1977     break;
1978    
1979     /* Word boundary assertions */
1980    
1981     case OP_NOT_WORD_BOUNDARY:
1982     case OP_WORD_BOUNDARY:
1983     {
1984    
1985     /* Find out if the previous and current characters are "word" characters.
1986     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1987 ph10 443 be "non-word" characters unless Unicode properties (md->use_ucp) are in use. Remember the earliest consulted character for
1988 ph10 435 partial matching. */
1989 nigel 77
1990     #ifdef SUPPORT_UTF8
1991     if (utf8)
1992     {
1993 ph10 518 /* Get status of previous character */
1994 ph10 527
1995 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1996     {
1997 ph10 409 USPTR lastptr = eptr - 1;
1998 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1999 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2000 nigel 77 GETCHAR(c, lastptr);
2001 ph10 527 #ifdef SUPPORT_UCP
2002 ph10 518 if (md->use_ucp)
2003     {
2004     if (c == '_') prev_is_word = TRUE; else
2005 ph10 527 {
2006 ph10 518 int cat = UCD_CATEGORY(c);
2007     prev_is_word = (cat == ucp_L || cat == ucp_N);
2008 ph10 527 }
2009     }
2010     else
2011     #endif
2012 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2013     }
2014 ph10 527
2015 ph10 518 /* Get status of next character */
2016 ph10 527
2017 ph10 443 if (eptr >= md->end_subject)
2018 nigel 77 {
2019 ph10 443 SCHECK_PARTIAL();
2020     cur_is_word = FALSE;
2021 ph10 428 }
2022     else
2023     {
2024 nigel 77 GETCHAR(c, eptr);
2025 ph10 527 #ifdef SUPPORT_UCP
2026 ph10 518 if (md->use_ucp)
2027     {
2028     if (c == '_') cur_is_word = TRUE; else
2029 ph10 527 {
2030 ph10 518 int cat = UCD_CATEGORY(c);
2031     cur_is_word = (cat == ucp_L || cat == ucp_N);
2032 ph10 527 }
2033     }
2034     else
2035     #endif
2036 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2037     }
2038     }
2039     else
2040     #endif
2041    
2042 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2043 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2044 nigel 77
2045     {
2046 ph10 518 /* Get status of previous character */
2047 ph10 527
2048 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2049     {
2050 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2051 ph10 527 #ifdef SUPPORT_UCP
2052 ph10 518 if (md->use_ucp)
2053     {
2054 ph10 527 c = eptr[-1];
2055 ph10 518 if (c == '_') prev_is_word = TRUE; else
2056 ph10 527 {
2057 ph10 518 int cat = UCD_CATEGORY(c);
2058     prev_is_word = (cat == ucp_L || cat == ucp_N);
2059 ph10 527 }
2060     }
2061     else
2062     #endif
2063 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2064     }
2065 ph10 527
2066 ph10 518 /* Get status of next character */
2067 ph10 527
2068 ph10 443 if (eptr >= md->end_subject)
2069 ph10 428 {
2070 ph10 443 SCHECK_PARTIAL();
2071     cur_is_word = FALSE;
2072 ph10 428 }
2073 ph10 527 else
2074     #ifdef SUPPORT_UCP
2075 ph10 518 if (md->use_ucp)
2076     {
2077 ph10 527 c = *eptr;
2078 ph10 518 if (c == '_') cur_is_word = TRUE; else
2079 ph10 527 {
2080 ph10 518 int cat = UCD_CATEGORY(c);
2081     cur_is_word = (cat == ucp_L || cat == ucp_N);
2082 ph10 527 }
2083     }
2084     else
2085     #endif
2086 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2087 nigel 77 }
2088    
2089     /* Now see if the situation is what we want */
2090    
2091     if ((*ecode++ == OP_WORD_BOUNDARY)?
2092     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2093 ph10 510 MRRETURN(MATCH_NOMATCH);
2094 nigel 77 }
2095     break;
2096    
2097     /* Match a single character type; inline for speed */
2098    
2099     case OP_ANY:
2100 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2101 ph10 345 /* Fall through */
2102    
2103 ph10 341 case OP_ALLANY:
2104 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2105     { /* not be updated before SCHECK_PARTIAL. */
2106 ph10 443 SCHECK_PARTIAL();
2107 ph10 510 MRRETURN(MATCH_NOMATCH);
2108 ph10 443 }
2109 ph10 648 eptr++;
2110 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2111 nigel 77 ecode++;
2112     break;
2113    
2114     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2115     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2116    
2117     case OP_ANYBYTE:
2118 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2119     { /* not be updated before SCHECK_PARTIAL. */
2120 ph10 443 SCHECK_PARTIAL();
2121 ph10 510 MRRETURN(MATCH_NOMATCH);
2122 ph10 443 }
2123 ph10 654 eptr++;
2124 nigel 77 ecode++;
2125     break;
2126    
2127     case OP_NOT_DIGIT:
2128 ph10 443 if (eptr >= md->end_subject)
2129 ph10 428 {
2130 ph10 443 SCHECK_PARTIAL();
2131 ph10 510 MRRETURN(MATCH_NOMATCH);
2132 ph10 443 }
2133 nigel 77 GETCHARINCTEST(c, eptr);
2134     if (
2135     #ifdef SUPPORT_UTF8
2136     c < 256 &&
2137     #endif
2138     (md->ctypes[c] & ctype_digit) != 0
2139     )
2140 ph10 510 MRRETURN(MATCH_NOMATCH);
2141 nigel 77 ecode++;
2142     break;
2143    
2144     case OP_DIGIT:
2145 ph10 443 if (eptr >= md->end_subject)
2146 ph10 428 {
2147 ph10 443 SCHECK_PARTIAL();
2148 ph10 510 MRRETURN(MATCH_NOMATCH);
2149 ph10 443 }
2150 nigel 77 GETCHARINCTEST(c, eptr);
2151     if (
2152     #ifdef SUPPORT_UTF8
2153     c >= 256 ||
2154     #endif
2155     (md->ctypes[c] & ctype_digit) == 0
2156     )
2157 ph10 510 MRRETURN(MATCH_NOMATCH);
2158 nigel 77 ecode++;
2159     break;
2160    
2161     case OP_NOT_WHITESPACE:
2162 ph10 443 if (eptr >= md->end_subject)
2163 ph10 428 {
2164 ph10 443 SCHECK_PARTIAL();
2165 ph10 510 MRRETURN(MATCH_NOMATCH);
2166 ph10 443 }
2167 nigel 77 GETCHARINCTEST(c, eptr);
2168     if (
2169     #ifdef SUPPORT_UTF8
2170     c < 256 &&
2171     #endif
2172     (md->ctypes[c] & ctype_space) != 0
2173     )
2174 ph10 510 MRRETURN(MATCH_NOMATCH);
2175 nigel 77 ecode++;
2176     break;
2177    
2178     case OP_WHITESPACE:
2179 ph10 443 if (eptr >= md->end_subject)
2180 ph10 428 {
2181 ph10 443 SCHECK_PARTIAL();
2182 ph10 510 MRRETURN(MATCH_NOMATCH);
2183 ph10 443 }
2184 nigel 77 GETCHARINCTEST(c, eptr);
2185     if (
2186     #ifdef SUPPORT_UTF8
2187     c >= 256 ||
2188     #endif
2189     (md->ctypes[c] & ctype_space) == 0
2190     )
2191 ph10 510 MRRETURN(MATCH_NOMATCH);
2192 nigel 77 ecode++;
2193     break;
2194    
2195     case OP_NOT_WORDCHAR:
2196 ph10 443 if (eptr >= md->end_subject)
2197 ph10 428 {
2198 ph10 443 SCHECK_PARTIAL();
2199 ph10 510 MRRETURN(MATCH_NOMATCH);
2200 ph10 443 }
2201 nigel 77 GETCHARINCTEST(c, eptr);
2202     if (
2203     #ifdef SUPPORT_UTF8
2204     c < 256 &&
2205     #endif
2206     (md->ctypes[c] & ctype_word) != 0
2207     )
2208 ph10 510 MRRETURN(MATCH_NOMATCH);
2209 nigel 77 ecode++;
2210     break;
2211    
2212     case OP_WORDCHAR:
2213 ph10 443 if (eptr >= md->end_subject)
2214 ph10 428 {
2215 ph10 443 SCHECK_PARTIAL();
2216 ph10 510 MRRETURN(MATCH_NOMATCH);
2217 ph10 443 }
2218 nigel 77 GETCHARINCTEST(c, eptr);
2219     if (
2220     #ifdef SUPPORT_UTF8
2221     c >= 256 ||
2222     #endif
2223     (md->ctypes[c] & ctype_word) == 0
2224     )
2225 ph10 510 MRRETURN(MATCH_NOMATCH);
2226 nigel 77 ecode++;
2227     break;
2228    
2229 nigel 93 case OP_ANYNL:
2230 ph10 443 if (eptr >= md->end_subject)
2231 ph10 428 {
2232 ph10 443 SCHECK_PARTIAL();
2233 ph10 510 MRRETURN(MATCH_NOMATCH);
2234 ph10 443 }
2235 nigel 93 GETCHARINCTEST(c, eptr);
2236     switch(c)
2237     {
2238 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2239 ph10 625
2240 nigel 93 case 0x000d:
2241     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2242     break;
2243 ph10 231
2244 nigel 93 case 0x000a:
2245 ph10 231 break;
2246    
2247 nigel 93 case 0x000b:
2248     case 0x000c:
2249     case 0x0085:
2250     case 0x2028:
2251     case 0x2029:
2252 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2253 nigel 93 break;
2254     }
2255     ecode++;
2256     break;
2257    
2258 ph10 178 case OP_NOT_HSPACE:
2259 ph10 443 if (eptr >= md->end_subject)
2260 ph10 428 {
2261 ph10 443 SCHECK_PARTIAL();
2262 ph10 510 MRRETURN(MATCH_NOMATCH);
2263 ph10 443 }
2264 ph10 178 GETCHARINCTEST(c, eptr);
2265     switch(c)
2266     {
2267     default: break;
2268     case 0x09: /* HT */
2269     case 0x20: /* SPACE */
2270     case 0xa0: /* NBSP */
2271     case 0x1680: /* OGHAM SPACE MARK */
2272     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2273     case 0x2000: /* EN QUAD */
2274     case 0x2001: /* EM QUAD */
2275     case 0x2002: /* EN SPACE */
2276     case 0x2003: /* EM SPACE */
2277     case 0x2004: /* THREE-PER-EM SPACE */
2278     case 0x2005: /* FOUR-PER-EM SPACE */
2279     case 0x2006: /* SIX-PER-EM SPACE */
2280     case 0x2007: /* FIGURE SPACE */
2281     case 0x2008: /* PUNCTUATION SPACE */
2282     case 0x2009: /* THIN SPACE */
2283     case 0x200A: /* HAIR SPACE */
2284     case 0x202f: /* NARROW NO-BREAK SPACE */
2285     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2286     case 0x3000: /* IDEOGRAPHIC SPACE */
2287 ph10 510 MRRETURN(MATCH_NOMATCH);
2288 ph10 178 }
2289     ecode++;
2290     break;
2291    
2292     case OP_HSPACE:
2293 ph10 443 if (eptr >= md->end_subject)
2294 ph10 428 {
2295 ph10 443 SCHECK_PARTIAL();
2296 ph10 510 MRRETURN(MATCH_NOMATCH);
2297 ph10 443 }
2298 ph10 178 GETCHARINCTEST(c, eptr);
2299     switch(c)
2300     {
2301 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2302 ph10 178 case 0x09: /* HT */
2303     case 0x20: /* SPACE */
2304     case 0xa0: /* NBSP */
2305     case 0x1680: /* OGHAM SPACE MARK */
2306     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2307     case 0x2000: /* EN QUAD */
2308     case 0x2001: /* EM QUAD */
2309     case 0x2002: /* EN SPACE */
2310     case 0x2003: /* EM SPACE */
2311     case 0x2004: /* THREE-PER-EM SPACE */
2312     case 0x2005: /* FOUR-PER-EM SPACE */
2313     case 0x2006: /* SIX-PER-EM SPACE */
2314     case 0x2007: /* FIGURE SPACE */
2315     case 0x2008: /* PUNCTUATION SPACE */
2316     case 0x2009: /* THIN SPACE */
2317     case 0x200A: /* HAIR SPACE */
2318     case 0x202f: /* NARROW NO-BREAK SPACE */
2319     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2320     case 0x3000: /* IDEOGRAPHIC SPACE */
2321     break;
2322     }
2323     ecode++;
2324     break;
2325    
2326     case OP_NOT_VSPACE:
2327 ph10 443 if (eptr >= md->end_subject)
2328 ph10 428 {
2329 ph10 443 SCHECK_PARTIAL();
2330 ph10 510 MRRETURN(MATCH_NOMATCH);
2331 ph10 443 }
2332 ph10 178 GETCHARINCTEST(c, eptr);
2333     switch(c)
2334     {
2335     default: break;
2336     case 0x0a: /* LF */
2337     case 0x0b: /* VT */
2338     case 0x0c: /* FF */
2339     case 0x0d: /* CR */
2340     case 0x85: /* NEL */
2341     case 0x2028: /* LINE SEPARATOR */
2342     case 0x2029: /* PARAGRAPH SEPARATOR */
2343 ph10 510 MRRETURN(MATCH_NOMATCH);
2344 ph10 178 }
2345     ecode++;
2346     break;
2347    
2348     case OP_VSPACE:
2349 ph10 443 if (eptr >= md->end_subject)
2350 ph10 428 {
2351 ph10 443 SCHECK_PARTIAL();
2352 ph10 510 MRRETURN(MATCH_NOMATCH);
2353 ph10 443 }
2354 ph10 178 GETCHARINCTEST(c, eptr);
2355     switch(c)
2356     {
2357 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2358 ph10 178 case 0x0a: /* LF */
2359     case 0x0b: /* VT */
2360     case 0x0c: /* FF */
2361     case 0x0d: /* CR */
2362     case 0x85: /* NEL */
2363     case 0x2028: /* LINE SEPARATOR */
2364     case 0x2029: /* PARAGRAPH SEPARATOR */
2365     break;
2366     }
2367     ecode++;
2368     break;
2369    
2370 nigel 77 #ifdef SUPPORT_UCP
2371     /* Check the next character by Unicode property. We will get here only
2372     if the support is in the binary; otherwise a compile-time error occurs. */
2373    
2374     case OP_PROP:
2375     case OP_NOTPROP:
2376 ph10 443 if (eptr >= md->end_subject)
2377 ph10 428 {
2378 ph10 443 SCHECK_PARTIAL();
2379 ph10 510 MRRETURN(MATCH_NOMATCH);
2380 ph10 443 }
2381 nigel 77 GETCHARINCTEST(c, eptr);
2382     {
2383 ph10 384 const ucd_record *prop = GET_UCD(c);
2384 nigel 77
2385 nigel 87 switch(ecode[1])
2386     {
2387     case PT_ANY:
2388 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2389 nigel 87 break;
2390 nigel 77
2391 nigel 87 case PT_LAMP:
2392 ph10 349 if ((prop->chartype == ucp_Lu ||
2393     prop->chartype == ucp_Ll ||
2394     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2395 ph10 510 MRRETURN(MATCH_NOMATCH);
2396 ph10 517 break;
2397 nigel 87
2398     case PT_GC:
2399 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2400 ph10 510 MRRETURN(MATCH_NOMATCH);
2401 nigel 87 break;
2402    
2403     case PT_PC:
2404 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2405 ph10 510 MRRETURN(MATCH_NOMATCH);
2406 nigel 87 break;
2407    
2408     case PT_SC:
2409 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2410 ph10 510 MRRETURN(MATCH_NOMATCH);
2411 nigel 87 break;
2412 ph10 527
2413 ph10 517 /* These are specials */
2414 ph10 527
2415 ph10 517 case PT_ALNUM:
2416     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2417     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2418     MRRETURN(MATCH_NOMATCH);
2419 ph10 527 break;
2420    
2421 ph10 517 case PT_SPACE: /* Perl space */
2422     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2423     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2424     == (op == OP_NOTPROP))
2425     MRRETURN(MATCH_NOMATCH);
2426 ph10 527 break;
2427    
2428 ph10 517 case PT_PXSPACE: /* POSIX space */
2429     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2430 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2431 ph10 517 c == CHAR_FF || c == CHAR_CR)
2432     == (op == OP_NOTPROP))
2433     MRRETURN(MATCH_NOMATCH);
2434 ph10 527 break;
2435 nigel 87
2436 ph10 527 case PT_WORD:
2437 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2438 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2439 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2440     MRRETURN(MATCH_NOMATCH);
2441 ph10 527 break;
2442    
2443 ph10 517 /* This should never occur */
2444    
2445 nigel 87 default:
2446     RRETURN(PCRE_ERROR_INTERNAL);
2447 nigel 77 }
2448 nigel 87
2449     ecode += 3;
2450 nigel 77 }
2451     break;
2452    
2453     /* Match an extended Unicode sequence. We will get here only if the support
2454     is in the binary; otherwise a compile-time error occurs. */
2455    
2456     case OP_EXTUNI:
2457 ph10 443 if (eptr >= md->end_subject)
2458 ph10 428 {
2459 ph10 443 SCHECK_PARTIAL();
2460 ph10 510 MRRETURN(MATCH_NOMATCH);
2461 ph10 443 }
2462 nigel 77 GETCHARINCTEST(c, eptr);
2463 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2464     while (eptr < md->end_subject)
2465 nigel 77 {
2466 ph10 623 int len = 1;
2467     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2468     if (UCD_CATEGORY(c) != ucp_M) break;
2469     eptr += len;
2470 nigel 77 }
2471     ecode++;
2472     break;
2473     #endif
2474    
2475    
2476     /* Match a back reference, possibly repeatedly. Look past the end of the
2477     item to see if there is repeat information following. The code is similar
2478     to that for character classes, but repeated for efficiency. Then obey
2479     similar code to character type repeats - written out again for speed.
2480     However, if the referenced string is the empty string, always treat
2481     it as matched, any number of times (otherwise there could be infinite
2482     loops). */
2483    
2484     case OP_REF:
2485 ph10 625 case OP_REFI:
2486     caseless = op == OP_REFI;
2487 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2488     ecode += 3;
2489 ph10 345
2490 ph10 595 /* If the reference is unset, there are two possibilities:
2491 ph10 345
2492 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2493     this ensures that every attempt at a match fails. We can't just fail
2494     here, because of the possibility of quantifiers with zero minima.
2495 ph10 345
2496 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2497     so that the back reference matches an empty string.
2498 ph10 345
2499 ph10 595 Otherwise, set the length to the length of what was matched by the
2500     referenced subpattern. */
2501 ph10 345
2502 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2503     length = (md->jscript_compat)? 0 : -1;
2504     else
2505     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2506 nigel 77
2507 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2508 nigel 77
2509 ph10 595 switch (*ecode)
2510     {
2511     case OP_CRSTAR:
2512     case OP_CRMINSTAR:
2513     case OP_CRPLUS:
2514     case OP_CRMINPLUS:
2515     case OP_CRQUERY:
2516     case OP_CRMINQUERY:
2517     c = *ecode++ - OP_CRSTAR;
2518     minimize = (c & 1) != 0;
2519     min = rep_min[c]; /* Pick up values from tables; */
2520     max = rep_max[c]; /* zero for max => infinity */
2521     if (max == 0) max = INT_MAX;
2522     break;
2523 nigel 77
2524 ph10 595 case OP_CRRANGE:
2525     case OP_CRMINRANGE:
2526     minimize = (*ecode == OP_CRMINRANGE);
2527     min = GET2(ecode, 1);
2528     max = GET2(ecode, 3);
2529     if (max == 0) max = INT_MAX;
2530     ecode += 5;
2531     break;
2532 nigel 77
2533 ph10 595 default: /* No repeat follows */
2534 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2535 ph10 595 {
2536     CHECK_PARTIAL();
2537     MRRETURN(MATCH_NOMATCH);
2538 nigel 77 }
2539 ph10 595 eptr += length;
2540     continue; /* With the main loop */
2541     }
2542 nigel 77
2543 ph10 595 /* Handle repeated back references. If the length of the reference is
2544     zero, just continue with the main loop. */
2545 ph10 443
2546 ph10 595 if (length == 0) continue;
2547 nigel 77
2548 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2549     the length of the reference string explicitly rather than passing the
2550     address of eptr, so that eptr can be a register variable. */
2551 nigel 77
2552 ph10 595 for (i = 1; i <= min; i++)
2553     {
2554 ph10 625 int slength;
2555 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2556 nigel 77 {
2557 ph10 595 CHECK_PARTIAL();
2558     MRRETURN(MATCH_NOMATCH);
2559 nigel 77 }
2560 ph10 595 eptr += slength;
2561     }
2562 nigel 77
2563 ph10 595 /* If min = max, continue at the same level without recursion.
2564     They are not both allowed to be zero. */
2565 nigel 77
2566 ph10 595 if (min == max) continue;
2567 nigel 77
2568 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2569 nigel 77
2570 ph10 595 if (minimize)
2571     {
2572     for (fi = min;; fi++)
2573 nigel 77 {
2574 ph10 625 int slength;
2575 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2576 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2577     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2578 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2579 nigel 77 {
2580 ph10 595 CHECK_PARTIAL();
2581     MRRETURN(MATCH_NOMATCH);
2582 nigel 77 }
2583 ph10 595 eptr += slength;
2584 nigel 77 }
2585 ph10 595 /* Control never gets here */
2586     }
2587 nigel 77
2588 ph10 595 /* If maximizing, find the longest string and work backwards */
2589 nigel 77
2590 ph10 595 else
2591     {
2592     pp = eptr;
2593     for (i = min; i < max; i++)
2594 nigel 77 {
2595 ph10 625 int slength;
2596 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2597 nigel 77 {
2598 ph10 595 CHECK_PARTIAL();
2599     break;
2600 nigel 77 }
2601 ph10 595 eptr += slength;
2602 nigel 77 }
2603 ph10 595 while (eptr >= pp)
2604     {
2605 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2606 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2607     eptr -= length;
2608     }
2609     MRRETURN(MATCH_NOMATCH);
2610 nigel 77 }
2611     /* Control never gets here */
2612    
2613     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2614     used when all the characters in the class have values in the range 0-255,
2615     and either the matching is caseful, or the characters are in the range
2616     0-127 when UTF-8 processing is enabled. The only difference between
2617     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2618     encountered.
2619    
2620     First, look past the end of the item to see if there is repeat information
2621     following. Then obey similar code to character type repeats - written out
2622     again for speed. */
2623    
2624     case OP_NCLASS:
2625     case OP_CLASS:
2626     {
2627     data = ecode + 1; /* Save for matching */
2628     ecode += 33; /* Advance past the item */
2629    
2630     switch (*ecode)
2631     {
2632     case OP_CRSTAR:
2633     case OP_CRMINSTAR:
2634     case OP_CRPLUS:
2635     case OP_CRMINPLUS:
2636     case OP_CRQUERY:
2637     case OP_CRMINQUERY:
2638     c = *ecode++ - OP_CRSTAR;
2639     minimize = (c & 1) != 0;
2640     min = rep_min[c]; /* Pick up values from tables; */
2641     max = rep_max[c]; /* zero for max => infinity */
2642     if (max == 0) max = INT_MAX;
2643     break;
2644    
2645     case OP_CRRANGE:
2646     case OP_CRMINRANGE:
2647     minimize = (*ecode == OP_CRMINRANGE);
2648     min = GET2(ecode, 1);
2649     max = GET2(ecode, 3);
2650     if (max == 0) max = INT_MAX;
2651     ecode += 5;
2652     break;
2653    
2654     default: /* No repeat follows */
2655     min = max = 1;
2656     break;
2657     }
2658    
2659     /* First, ensure the minimum number of matches are present. */
2660    
2661     #ifdef SUPPORT_UTF8
2662     /* UTF-8 mode */
2663     if (utf8)
2664     {
2665     for (i = 1; i <= min; i++)
2666     {
2667 ph10 427 if (eptr >= md->end_subject)
2668 ph10 426 {
2669 ph10 428 SCHECK_PARTIAL();
2670 ph10 510 MRRETURN(MATCH_NOMATCH);
2671 ph10 427 }
2672 nigel 77 GETCHARINC(c, eptr);
2673     if (c > 255)
2674     {
2675 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2676 nigel 77 }
2677     else
2678     {
2679 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2680 nigel 77 }
2681     }
2682     }
2683     else
2684     #endif
2685     /* Not UTF-8 mode */
2686     {
2687     for (i = 1; i <= min; i++)
2688     {
2689 ph10 427 if (eptr >= md->end_subject)
2690 ph10 426 {
2691 ph10 428 SCHECK_PARTIAL();
2692 ph10 510 MRRETURN(MATCH_NOMATCH);
2693 ph10 427 }
2694 nigel 77 c = *eptr++;
2695 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2696 nigel 77 }
2697     }
2698    
2699     /* If max == min we can continue with the main loop without the
2700     need to recurse. */
2701    
2702     if (min == max) continue;
2703    
2704     /* If minimizing, keep testing the rest of the expression and advancing
2705     the pointer while it matches the class. */
2706    
2707     if (minimize)
2708     {
2709     #ifdef SUPPORT_UTF8
2710     /* UTF-8 mode */
2711     if (utf8)
2712     {
2713     for (fi = min;; fi++)
2714     {
2715 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2716 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2717 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2718 ph10 427 if (eptr >= md->end_subject)
2719 ph10 426 {
2720 ph10 427 SCHECK_PARTIAL();
2721 ph10 510 MRRETURN(MATCH_NOMATCH);
2722 ph10 427 }
2723 nigel 77 GETCHARINC(c, eptr);
2724     if (c > 255)
2725     {
2726 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2727 nigel 77 }
2728     else
2729     {
2730 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2731 nigel 77 }
2732     }
2733     }
2734     else
2735     #endif
2736     /* Not UTF-8 mode */
2737     {
2738     for (fi = min;; fi++)
2739     {
2740 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2741 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2742 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2743 ph10 427 if (eptr >= md->end_subject)
2744 ph10 426 {
2745 ph10 427 SCHECK_PARTIAL();
2746 ph10 510 MRRETURN(MATCH_NOMATCH);
2747 ph10 427 }
2748 nigel 77 c = *eptr++;
2749 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2750 nigel 77 }
2751     }
2752     /* Control never gets here */
2753     }
2754    
2755     /* If maximizing, find the longest possible run, then work backwards. */
2756    
2757     else
2758     {
2759     pp = eptr;
2760    
2761     #ifdef SUPPORT_UTF8
2762     /* UTF-8 mode */
2763     if (utf8)
2764     {
2765     for (i = min; i < max; i++)
2766     {
2767     int len = 1;
2768 ph10 463 if (eptr >= md->end_subject)
2769 ph10 462 {
2770 ph10 463 SCHECK_PARTIAL();
2771 ph10 462 break;
2772 ph10 463 }
2773 nigel 77 GETCHARLEN(c, eptr, len);
2774     if (c > 255)
2775     {
2776     if (op == OP_CLASS) break;
2777     }
2778     else
2779     {
2780     if ((data[c/8] & (1 << (c&7))) == 0) break;
2781     }
2782     eptr += len;
2783     }
2784     for (;;)
2785     {
2786 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2787 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2788     if (eptr-- == pp) break; /* Stop if tried at original pos */
2789     BACKCHAR(eptr);
2790     }
2791     }
2792     else
2793     #endif
2794     /* Not UTF-8 mode */
2795     {
2796     for (i = min; i < max; i++)
2797     {
2798 ph10 463 if (eptr >= md->end_subject)
2799 ph10 462 {
2800 ph10 463 SCHECK_PARTIAL();
2801 ph10 462 break;
2802 ph10 463 }
2803 nigel 77 c = *eptr;
2804     if ((data[c/8] & (1 << (c&7))) == 0) break;
2805     eptr++;
2806     }
2807     while (eptr >= pp)
2808     {
2809 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2810 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2811 nigel 77 eptr--;
2812     }
2813     }
2814    
2815 ph10 510 MRRETURN(MATCH_NOMATCH);
2816 nigel 77 }
2817     }
2818     /* Control never gets here */
2819    
2820    
2821     /* Match an extended character class. This opcode is encountered only
2822 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2823     mode, because Unicode properties are supported in non-UTF-8 mode. */
2824 nigel 77
2825     #ifdef SUPPORT_UTF8
2826     case OP_XCLASS:
2827     {
2828     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2829     ecode += GET(ecode, 1); /* Advance past the item */
2830    
2831     switch (*ecode)
2832     {
2833     case OP_CRSTAR:
2834     case OP_CRMINSTAR:
2835     case OP_CRPLUS:
2836     case OP_CRMINPLUS:
2837     case OP_CRQUERY:
2838     case OP_CRMINQUERY:
2839     c = *ecode++ - OP_CRSTAR;
2840     minimize = (c & 1) != 0;
2841     min = rep_min[c]; /* Pick up values from tables; */
2842     max = rep_max[c]; /* zero for max => infinity */
2843     if (max == 0) max = INT_MAX;
2844     break;
2845    
2846     case OP_CRRANGE:
2847     case OP_CRMINRANGE:
2848     minimize = (*ecode == OP_CRMINRANGE);
2849     min = GET2(ecode, 1);
2850     max = GET2(ecode, 3);
2851     if (max == 0) max = INT_MAX;
2852     ecode += 5;
2853     break;
2854    
2855     default: /* No repeat follows */
2856     min = max = 1;
2857     break;
2858     }
2859    
2860     /* First, ensure the minimum number of matches are present. */
2861    
2862     for (i = 1; i <= min; i++)
2863     {
2864 ph10 427 if (eptr >= md->end_subject)
2865 ph10 426 {
2866     SCHECK_PARTIAL();
2867 ph10 510 MRRETURN(MATCH_NOMATCH);
2868 ph10 427 }
2869 ph10 384 GETCHARINCTEST(c, eptr);
2870 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2871 nigel 77 }
2872    
2873     /* If max == min we can continue with the main loop without the
2874     need to recurse. */
2875    
2876     if (min == max) continue;
2877    
2878     /* If minimizing, keep testing the rest of the expression and advancing
2879     the pointer while it matches the class. */
2880    
2881     if (minimize)
2882     {
2883     for (fi = min;; fi++)
2884     {
2885 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2886 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2887 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2888 ph10 427 if (eptr >= md->end_subject)
2889 ph10 426 {
2890 ph10 427 SCHECK_PARTIAL();
2891 ph10 510 MRRETURN(MATCH_NOMATCH);
2892 ph10 427 }
2893 ph10 384 GETCHARINCTEST(c, eptr);
2894 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2895 nigel 77 }
2896     /* Control never gets here */
2897     }
2898    
2899     /* If maximizing, find the longest possible run, then work backwards. */
2900    
2901     else
2902     {
2903     pp = eptr;
2904     for (i = min; i < max; i++)
2905     {
2906     int len = 1;
2907 ph10 463 if (eptr >= md->end_subject)
2908 ph10 462 {
2909 ph10 463 SCHECK_PARTIAL();
2910 ph10 462 break;
2911 ph10 463 }
2912 ph10 384 GETCHARLENTEST(c, eptr, len);
2913 nigel 77 if (!_pcre_xclass(c, data)) break;
2914     eptr += len;
2915     }
2916     for(;;)
2917     {
2918 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2919 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2920     if (eptr-- == pp) break; /* Stop if tried at original pos */
2921 ph10 214 if (utf8) BACKCHAR(eptr);
2922 nigel 77 }
2923 ph10 510 MRRETURN(MATCH_NOMATCH);
2924 nigel 77 }
2925    
2926     /* Control never gets here */
2927     }
2928     #endif /* End of XCLASS */
2929    
2930     /* Match a single character, casefully */
2931    
2932     case OP_CHAR:
2933     #ifdef SUPPORT_UTF8
2934     if (utf8)
2935     {
2936     length = 1;
2937     ecode++;
2938     GETCHARLEN(fc, ecode, length);
2939 ph10 443 if (length > md->end_subject - eptr)
2940 ph10 428 {
2941     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2942 ph10 510 MRRETURN(MATCH_NOMATCH);
2943 ph10 443 }
2944 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2945 nigel 77 }
2946     else
2947     #endif
2948    
2949     /* Non-UTF-8 mode */
2950     {
2951 ph10 443 if (md->end_subject - eptr < 1)
2952 ph10 428 {
2953     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2954 ph10 510 MRRETURN(MATCH_NOMATCH);
2955 ph10 443 }
2956 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2957 nigel 77 ecode += 2;
2958     }
2959     break;
2960    
2961     /* Match a single character, caselessly */
2962    
2963 ph10 602 case OP_CHARI:
2964 nigel 77 #ifdef SUPPORT_UTF8
2965     if (utf8)
2966     {
2967     length = 1;
2968     ecode++;
2969     GETCHARLEN(fc, ecode, length);
2970    
2971 ph10 443 if (length > md->end_subject - eptr)
2972 ph10 428 {
2973     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2974 ph10 510 MRRETURN(MATCH_NOMATCH);
2975 ph10 443 }
2976 nigel 77
2977     /* If the pattern character's value is < 128, we have only one byte, and
2978     can use the fast lookup table. */
2979    
2980     if (fc < 128)
2981     {
2982 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2983 nigel 77 }
2984    
2985     /* Otherwise we must pick up the subject character */
2986    
2987     else
2988     {
2989 nigel 93 unsigned int dc;
2990 nigel 77 GETCHARINC(dc, eptr);
2991     ecode += length;
2992    
2993     /* If we have Unicode property support, we can use it to test the other
2994 nigel 87 case of the character, if there is one. */
2995 nigel 77
2996     if (fc != dc)
2997     {
2998     #ifdef SUPPORT_UCP
2999 ph10 349 if (dc != UCD_OTHERCASE(fc))
3000 nigel 77 #endif
3001 ph10 510 MRRETURN(MATCH_NOMATCH);
3002 nigel 77 }
3003     }
3004     }
3005     else
3006     #endif /* SUPPORT_UTF8 */
3007    
3008     /* Non-UTF-8 mode */
3009     {
3010 ph10 443 if (md->end_subject - eptr < 1)
3011 ph10 428 {
3012 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3013 ph10 510 MRRETURN(MATCH_NOMATCH);
3014 ph10 443 }
3015 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3016 nigel 77 ecode += 2;
3017     }
3018     break;
3019    
3020 nigel 93 /* Match a single character repeatedly. */
3021 nigel 77
3022     case OP_EXACT:
3023 ph10 602 case OP_EXACTI:
3024 nigel 77 min = max = GET2(ecode, 1);
3025     ecode += 3;
3026     goto REPEATCHAR;
3027    
3028 nigel 93 case OP_POSUPTO:
3029 ph10 602 case OP_POSUPTOI:
3030 nigel 93 possessive = TRUE;
3031     /* Fall through */
3032    
3033 nigel 77 case OP_UPTO:
3034 ph10 602 case OP_UPTOI:
3035 nigel 77 case OP_MINUPTO:
3036 ph10 602 case OP_MINUPTOI:
3037 nigel 77 min = 0;
3038     max = GET2(ecode, 1);
3039 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3040 nigel 77 ecode += 3;
3041     goto REPEATCHAR;
3042    
3043 nigel 93 case OP_POSSTAR:
3044 ph10 602 case OP_POSSTARI:
3045 nigel 93 possessive = TRUE;
3046     min = 0;
3047     max = INT_MAX;
3048     ecode++;
3049     goto REPEATCHAR;
3050    
3051     case OP_POSPLUS:
3052 ph10 602 case OP_POSPLUSI:
3053 nigel 93 possessive = TRUE;
3054     min = 1;
3055     max = INT_MAX;
3056     ecode++;
3057     goto REPEATCHAR;
3058    
3059     case OP_POSQUERY:
3060 ph10 602 case OP_POSQUERYI:
3061 nigel 93 possessive = TRUE;
3062     min = 0;
3063     max = 1;
3064     ecode++;
3065     goto REPEATCHAR;
3066    
3067 nigel 77 case OP_STAR:
3068 ph10 602 case OP_STARI:
3069 nigel 77 case OP_MINSTAR:
3070 ph10 602 case OP_MINSTARI:
3071 nigel 77 case OP_PLUS:
3072 ph10 602 case OP_PLUSI:
3073 nigel 77 case OP_MINPLUS:
3074 ph10 602 case OP_MINPLUSI:
3075 nigel 77 case OP_QUERY:
3076 ph10 602 case OP_QUERYI:
3077 nigel 77 case OP_MINQUERY:
3078 ph10 602 case OP_MINQUERYI:
3079     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3080 nigel 77 minimize = (c & 1) != 0;
3081     min = rep_min[c]; /* Pick up values from tables; */
3082     max = rep_max[c]; /* zero for max => infinity */
3083     if (max == 0) max = INT_MAX;
3084    
3085 ph10 426 /* Common code for all repeated single-character matches. */
3086 nigel 77
3087     REPEATCHAR:
3088     #ifdef SUPPORT_UTF8
3089     if (utf8)
3090     {
3091     length = 1;
3092     charptr = ecode;
3093     GETCHARLEN(fc, ecode, length);
3094     ecode += length;
3095    
3096     /* Handle multibyte character matching specially here. There is
3097     support for caseless matching if UCP support is present. */
3098    
3099     if (length > 1)
3100     {
3101     #ifdef SUPPORT_UCP
3102 nigel 93 unsigned int othercase;
3103 ph10 602 if (op >= OP_STARI && /* Caseless */
3104 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3105 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3106 ph10 115 else oclength = 0;
3107 nigel 77 #endif /* SUPPORT_UCP */
3108    
3109     for (i = 1; i <= min; i++)
3110     {
3111 ph10 426 if (eptr <= md->end_subject - length &&
3112     memcmp(eptr, charptr, length) == 0) eptr += length;
3113 ph10 123 #ifdef SUPPORT_UCP
3114 ph10 426 else if (oclength > 0 &&
3115     eptr <= md->end_subject - oclength &&
3116     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3117     #endif /* SUPPORT_UCP */
3118 nigel 77 else
3119     {
3120 ph10 426 CHECK_PARTIAL();
3121 ph10 510 MRRETURN(MATCH_NOMATCH);
3122 nigel 77 }
3123     }
3124    
3125     if (min == max) continue;
3126    
3127     if (minimize)
3128     {
3129     for (fi = min;; fi++)
3130     {
3131 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3132 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3133 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3134 ph10 426 if (eptr <= md->end_subject - length &&
3135     memcmp(eptr, charptr, length) == 0) eptr += length;
3136 ph10 123 #ifdef SUPPORT_UCP
3137 ph10 426 else if (oclength > 0 &&
3138     eptr <= md->end_subject - oclength &&
3139     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3140     #endif /* SUPPORT_UCP */
3141 nigel 77 else
3142     {
3143 ph10 426 CHECK_PARTIAL();
3144 ph10 510 MRRETURN(MATCH_NOMATCH);
3145 nigel 77 }
3146     }
3147     /* Control never gets here */
3148     }
3149 nigel 93
3150     else /* Maximize */
3151 nigel 77 {
3152     pp = eptr;
3153     for (i = min; i < max; i++)
3154     {
3155 ph10 426 if (eptr <= md->end_subject - length &&
3156     memcmp(eptr, charptr, length) == 0) eptr += length;
3157 ph10 123 #ifdef SUPPORT_UCP
3158 ph10 426 else if (oclength > 0 &&
3159     eptr <= md->end_subject - oclength &&
3160     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3161     #endif /* SUPPORT_UCP */
3162 ph10 463 else
3163 ph10 462 {
3164 ph10 463 CHECK_PARTIAL();
3165 ph10 462 break;
3166 ph10 463 }
3167 nigel 77 }
3168 nigel 93
3169     if (possessive) continue;
3170 ph10 427
3171 ph10 120 for(;;)
3172 ph10 426 {
3173 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3174 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3175 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3176 ph10 115 #ifdef SUPPORT_UCP
3177 ph10 426 eptr--;
3178     BACKCHAR(eptr);
3179 ph10 123 #else /* without SUPPORT_UCP */
3180 ph10 426 eptr -= length;
3181 ph10 123 #endif /* SUPPORT_UCP */
3182 ph10 426 }
3183 nigel 77 }
3184     /* Control never gets here */
3185     }
3186    
3187     /* If the length of a UTF-8 character is 1, we fall through here, and
3188     obey the code as for non-UTF-8 characters below, though in this case the
3189     value of fc will always be < 128. */
3190     }
3191     else
3192     #endif /* SUPPORT_UTF8 */
3193    
3194     /* When not in UTF-8 mode, load a single-byte character. */
3195    
3196 ph10 426 fc = *ecode++;
3197 ph10 443
3198 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3199     may not be in UTF-8 mode. The code is duplicated for the caseless and
3200     caseful cases, for speed, since matching characters is likely to be quite
3201     common. First, ensure the minimum number of matches are present. If min =
3202     max, continue at the same level without recursing. Otherwise, if
3203     minimizing, keep trying the rest of the expression and advancing one
3204     matching character if failing, up to the maximum. Alternatively, if
3205     maximizing, find the maximum number of characters and work backwards. */
3206    
3207     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3208     max, eptr));
3209    
3210 ph10 602 if (op >= OP_STARI) /* Caseless */
3211 nigel 77 {
3212     fc = md->lcc[fc];
3213     for (i = 1; i <= min; i++)
3214 ph10 426 {
3215     if (eptr >= md->end_subject)
3216     {
3217     SCHECK_PARTIAL();
3218 ph10 510 MRRETURN(MATCH_NOMATCH);
3219 ph10 426 }
3220 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3221 ph10 426 }
3222 nigel 77 if (min == max) continue;
3223     if (minimize)
3224     {
3225     for (fi = min;; fi++)
3226     {
3227 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3228 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3230 ph10 426 if (eptr >= md->end_subject)
3231     {
3232 ph10 427 SCHECK_PARTIAL();
3233 ph10 510 MRRETURN(MATCH_NOMATCH);
3234 ph10 426 }
3235 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3236 nigel 77 }
3237     /* Control never gets here */
3238     }
3239 nigel 93 else /* Maximize */
3240 nigel 77 {
3241     pp = eptr;
3242     for (i = min; i < max; i++)
3243     {
3244 ph10 463 if (eptr >= md->end_subject)
3245 ph10 462 {
3246     SCHECK_PARTIAL();
3247     break;
3248 ph10 463 }
3249 ph10 462 if (fc != md->lcc[*eptr]) break;
3250 nigel 77 eptr++;
3251     }
3252 ph10 427
3253 nigel 93 if (possessive) continue;
3254 ph10 427
3255 nigel 77 while (eptr >= pp)
3256     {
3257 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3258 nigel 77 eptr--;
3259     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260     }
3261 ph10 510 MRRETURN(MATCH_NOMATCH);
3262 nigel 77 }
3263     /* Control never gets here */
3264     }
3265    
3266     /* Caseful comparisons (includes all multi-byte characters) */
3267    
3268     else
3269     {
3270 ph10 427 for (i = 1; i <= min; i++)
3271 ph10 426 {
3272     if (eptr >= md->end_subject)
3273     {
3274     SCHECK_PARTIAL();
3275 ph10 510 MRRETURN(MATCH_NOMATCH);
3276 ph10 426 }
3277 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3278 ph10 427 }
3279 ph10 443
3280 nigel 77 if (min == max) continue;
3281 ph10 443
3282 nigel 77 if (minimize)
3283     {
3284     for (fi = min;; fi++)
3285     {
3286 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3287 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3288 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3289 ph10 426 if (eptr >= md->end_subject)
3290 ph10 427 {
3291 ph10 426 SCHECK_PARTIAL();
3292 ph10 510 MRRETURN(MATCH_NOMATCH);
3293 ph10 427 }
3294 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3295 nigel 77 }
3296     /* Control never gets here */
3297     }
3298 nigel 93 else /* Maximize */
3299 nigel 77 {
3300     pp = eptr;
3301     for (i = min; i < max; i++)
3302     {
3303 ph10 463 if (eptr >= md->end_subject)
3304 ph10 462 {
3305 ph10 463 SCHECK_PARTIAL();
3306 ph10 462 break;
3307 ph10 463 }
3308 ph10 462 if (fc != *eptr) break;
3309 nigel 77 eptr++;
3310     }
3311 nigel 93 if (possessive) continue;
3312 ph10 443
3313 nigel 77 while (eptr >= pp)
3314     {
3315 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3316 nigel 77 eptr--;
3317     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3318     }
3319 ph10 510 MRRETURN(MATCH_NOMATCH);
3320 nigel 77 }
3321     }
3322     /* Control never gets here */
3323    
3324     /* Match a negated single one-byte character. The character we are
3325     checking can be multibyte. */
3326    
3327     case OP_NOT:
3328 ph10 625 case OP_NOTI:
3329 ph10 443 if (eptr >= md->end_subject)
3330 ph10 428 {
3331 ph10 443 SCHECK_PARTIAL();
3332 ph10 510 MRRETURN(MATCH_NOMATCH);
3333 ph10 443 }
3334 nigel 77 ecode++;
3335     GETCHARINCTEST(c, eptr);
3336 ph10 602 if (op == OP_NOTI) /* The caseless case */
3337 nigel 77 {
3338     #ifdef SUPPORT_UTF8
3339     if (c < 256)
3340     #endif
3341     c = md->lcc[c];
3342 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3343 nigel 77 }
3344 ph10 602 else /* Caseful */
3345 nigel 77 {
3346 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3347 nigel 77 }
3348     break;
3349    
3350     /* Match a negated single one-byte character repeatedly. This is almost a
3351     repeat of the code for a repeated single character, but I haven't found a
3352     nice way of commoning these up that doesn't require a test of the
3353     positive/negative option for each character match. Maybe that wouldn't add
3354     very much to the time taken, but character matching *is* what this is all
3355     about... */
3356    
3357     case OP_NOTEXACT:
3358 ph10 602 case OP_NOTEXACTI:
3359 nigel 77 min = max = GET2(ecode, 1);
3360     ecode += 3;
3361     goto REPEATNOTCHAR;
3362    
3363     case OP_NOTUPTO:
3364 ph10 602 case OP_NOTUPTOI:
3365 nigel 77 case OP_NOTMINUPTO:
3366 ph10 602 case OP_NOTMINUPTOI:
3367 nigel 77 min = 0;
3368     max = GET2(ecode, 1);
3369 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3370 nigel 77 ecode += 3;
3371     goto REPEATNOTCHAR;
3372    
3373 nigel 93 case OP_NOTPOSSTAR:
3374 ph10 602 case OP_NOTPOSSTARI:
3375 nigel 93 possessive = TRUE;
3376     min = 0;
3377     max = INT_MAX;
3378     ecode++;
3379     goto REPEATNOTCHAR;
3380    
3381     case OP_NOTPOSPLUS:
3382 ph10 602 case OP_NOTPOSPLUSI:
3383 nigel 93 possessive = TRUE;
3384     min = 1;
3385     max = INT_MAX;
3386     ecode++;
3387     goto REPEATNOTCHAR;
3388    
3389     case OP_NOTPOSQUERY:
3390 ph10 602 case OP_NOTPOSQUERYI:
3391 nigel 93 possessive = TRUE;
3392     min = 0;
3393     max = 1;
3394     ecode++;
3395     goto REPEATNOTCHAR;
3396    
3397     case OP_NOTPOSUPTO:
3398 ph10 602 case OP_NOTPOSUPTOI:
3399 nigel 93 possessive = TRUE;
3400     min = 0;
3401     max = GET2(ecode, 1);
3402     ecode += 3;
3403     goto REPEATNOTCHAR;
3404    
3405 nigel 77 case OP_NOTSTAR:
3406 ph10 602 case OP_NOTSTARI:
3407 nigel 77 case OP_NOTMINSTAR:
3408 ph10 602 case OP_NOTMINSTARI:
3409 nigel 77 case OP_NOTPLUS:
3410 ph10 602 case OP_NOTPLUSI:
3411 nigel 77 case OP_NOTMINPLUS:
3412 ph10 602 case OP_NOTMINPLUSI:
3413 nigel 77 case OP_NOTQUERY:
3414 ph10 602 case OP_NOTQUERYI:
3415 nigel 77 case OP_NOTMINQUERY:
3416 ph10 602 case OP_NOTMINQUERYI:
3417     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3418 nigel 77 minimize = (c & 1) != 0;
3419     min = rep_min[c]; /* Pick up values from tables; */
3420     max = rep_max[c]; /* zero for max => infinity */
3421     if (max == 0) max = INT_MAX;
3422    
3423 ph10 426 /* Common code for all repeated single-byte matches. */
3424 nigel 77
3425     REPEATNOTCHAR:
3426     fc = *ecode++;
3427    
3428     /* The code is duplicated for the caseless and caseful cases, for speed,
3429     since matching characters is likely to be quite common. First, ensure the
3430     minimum number of matches are present. If min = max, continue at the same
3431     level without recursing. Otherwise, if minimizing, keep trying the rest of
3432     the expression and advancing one matching character if failing, up to the
3433     maximum. Alternatively, if maximizing, find the maximum number of
3434     characters and work backwards. */
3435    
3436     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3437     max, eptr));
3438    
3439 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3440 nigel 77 {
3441     fc = md->lcc[fc];
3442    
3443     #ifdef SUPPORT_UTF8
3444     /* UTF-8 mode */
3445     if (utf8)
3446     {
3447 nigel 93 register unsigned int d;
3448 nigel 77 for (i = 1; i <= min; i++)
3449     {
3450 ph10 426 if (eptr >= md->end_subject)
3451     {
3452     SCHECK_PARTIAL();
3453 ph10 510 MRRETURN(MATCH_NOMATCH);
3454 ph10 427 }
3455 nigel 77 GETCHARINC(d, eptr);
3456     if (d < 256) d = md->lcc[d];
3457 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3458 nigel 77 }
3459     }
3460     else
3461     #endif
3462    
3463     /* Not UTF-8 mode */
3464     {
3465     for (i = 1; i <= min; i++)
3466 ph10 426 {
3467     if (eptr >= md->end_subject)
3468     {
3469     SCHECK_PARTIAL();
3470 ph10 510 MRRETURN(MATCH_NOMATCH);
3471 ph10 427 }
3472 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3473 ph10 427 }
3474 nigel 77 }
3475    
3476     if (min == max) continue;
3477    
3478     if (minimize)
3479     {
3480     #ifdef SUPPORT_UTF8
3481     /* UTF-8 mode */
3482     if (utf8)
3483     {
3484 nigel 93 register unsigned int d;
3485 nigel 77 for (fi = min;; fi++)
3486     {
3487 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3488 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3489 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3490 ph10 427 if (eptr >= md->end_subject)
3491 ph10 426 {
3492 ph10 427 SCHECK_PARTIAL();
3493 ph10 510 MRRETURN(MATCH_NOMATCH);
3494 ph10 427 }
3495 nigel 77 GETCHARINC(d, eptr);
3496     if (d < 256) d = md->lcc[d];
3497 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3498 nigel 77 }
3499     }
3500     else
3501     #endif
3502     /* Not UTF-8 mode */
3503     {
3504     for (fi = min;; fi++)
3505     {
3506 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3507 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3509 ph10 426 if (eptr >= md->end_subject)
3510     {
3511     SCHECK_PARTIAL();
3512 ph10 510 MRRETURN(MATCH_NOMATCH);
3513 ph10 426 }
3514 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3515 nigel 77 }
3516     }
3517     /* Control never gets here */
3518     }
3519    
3520     /* Maximize case */
3521    
3522     else
3523     {
3524     pp = eptr;
3525    
3526     #ifdef SUPPORT_UTF8
3527     /* UTF-8 mode */
3528     if (utf8)
3529     {
3530 nigel 93 register unsigned int d;
3531 nigel 77 for (i = min; i < max; i++)
3532     {
3533     int len = 1;
3534 ph10 463 if (eptr >= md->end_subject)
3535 ph10 462 {
3536 ph10 463 SCHECK_PARTIAL();
3537 ph10 462 break;
3538 ph10 463 }
3539 nigel 77 GETCHARLEN(d, eptr, len);
3540     if (d < 256) d = md->lcc[d];
3541     if (fc == d) break;
3542     eptr += len;
3543     }
3544 nigel 93 if (possessive) continue;
3545     for(;;)
3546 nigel 77 {
3547 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3548 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3549     if (eptr-- == pp) break; /* Stop if tried at original pos */
3550     BACKCHAR(eptr);
3551     }
3552     }
3553     else
3554     #endif
3555     /* Not UTF-8 mode */
3556     {
3557     for (i = min; i < max; i++)
3558     {
3559 ph10 463 if (eptr >= md->end_subject)
3560 ph10 462 {
3561     SCHECK_PARTIAL();
3562     break;
3563 ph10 463 }
3564 ph10 462 if (fc == md->lcc[*eptr]) break;
3565 nigel 77 eptr++;
3566     }
3567 nigel 93 if (possessive) continue;
3568 nigel 77 while (eptr >= pp)
3569     {
3570 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3571 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3572     eptr--;
3573     }
3574     }
3575    
3576 ph10 510 MRRETURN(MATCH_NOMATCH);
3577 nigel 77 }
3578     /* Control never gets here */
3579     }
3580    
3581     /* Caseful comparisons */
3582    
3583     else
3584     {
3585     #ifdef SUPPORT_UTF8
3586     /* UTF-8 mode */
3587     if (utf8)
3588     {
3589 nigel 93 register unsigned int d;
3590 nigel 77 for (i = 1; i <= min; i++)
3591     {
3592 ph10 426 if (eptr >= md->end_subject)
3593     {
3594     SCHECK_PARTIAL();
3595 ph10 510 MRRETURN(MATCH_NOMATCH);
3596 ph10 427 }
3597 nigel 77 GETCHARINC(d, eptr);
3598 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3599 nigel 77 }
3600     }
3601     else
3602     #endif
3603     /* Not UTF-8 mode */
3604     {
3605     for (i = 1; i <= min; i++)
3606 ph10 426 {
3607     if (eptr >= md->end_subject)
3608     {
3609     SCHECK_PARTIAL();
3610 ph10 510 MRRETURN(MATCH_NOMATCH);
3611 ph10 427 }
3612 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3613 ph10 427 }
3614 nigel 77 }
3615    
3616     if (min == max) continue;
3617    
3618     if (minimize)
3619     {
3620     #ifdef SUPPORT_UTF8
3621     /* UTF-8 mode */
3622     if (utf8)
3623     {
3624 nigel 93 register unsigned int d;
3625 nigel 77 for (fi = min;; fi++)
3626     {
3627 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3628 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3630 ph10 427 if (eptr >= md->end_subject)
3631 ph10 426 {
3632 ph10 427 SCHECK_PARTIAL();
3633 ph10 510 MRRETURN(MATCH_NOMATCH);
3634 ph10 427 }
3635 nigel 77 GETCHARINC(d, eptr);
3636 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3637 nigel 77 }
3638     }
3639     else
3640     #endif
3641     /* Not UTF-8 mode */
3642     {
3643     for (fi = min;; fi++)
3644     {
3645 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3646 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3647 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3648 ph10 426 if (eptr >= md->end_subject)
3649     {
3650     SCHECK_PARTIAL();
3651 ph10 510 MRRETURN(MATCH_NOMATCH);
3652 ph10 427 }
3653 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3654 nigel 77 }
3655     }
3656     /* Control never gets here */
3657     }
3658    
3659     /* Maximize case */
3660    
3661     else
3662     {
3663     pp = eptr;
3664    
3665     #ifdef SUPPORT_UTF8
3666     /* UTF-8 mode */
3667     if (utf8)
3668     {
3669 nigel 93 register unsigned int d;
3670 nigel 77 for (i = min; i < max; i++)
3671     {
3672     int len = 1;
3673 ph10 463 if (eptr >= md->end_subject)
3674 ph10 462 {
3675 ph10 463 SCHECK_PARTIAL();
3676 ph10 462 break;
3677 ph10 463 }
3678 nigel 77 GETCHARLEN(d, eptr, len);
3679     if (fc == d) break;
3680     eptr += len;
3681     }
3682 nigel 93 if (possessive) continue;
3683 nigel 77 for(;;)
3684     {
3685 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3686 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3687     if (eptr-- == pp) break; /* Stop if tried at original pos */
3688     BACKCHAR(eptr);
3689     }
3690     }
3691     else
3692     #endif
3693     /* Not UTF-8 mode */
3694     {
3695     for (i = min; i < max; i++)
3696     {
3697 ph10 463 if (eptr >= md->end_subject)
3698 ph10 462 {
3699 ph10 463 SCHECK_PARTIAL();
3700 ph10 462 break;
3701 ph10 463 }
3702 ph10 462 if (fc == *eptr) break;
3703 nigel 77 eptr++;
3704     }
3705 nigel 93 if (possessive) continue;
3706 nigel 77 while (eptr >= pp)
3707     {
3708 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3709 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3710     eptr--;
3711     }
3712     }
3713    
3714 ph10 510 MRRETURN(MATCH_NOMATCH);
3715 nigel 77 }
3716     }
3717     /* Control never gets here */
3718    
3719     /* Match a single character type repeatedly; several different opcodes
3720     share code. This is very similar to the code for single characters, but we
3721     repeat it in the interests of efficiency. */
3722    
3723     case OP_TYPEEXACT:
3724     min = max = GET2(ecode, 1);
3725     minimize = TRUE;
3726     ecode += 3;
3727     goto REPEATTYPE;
3728    
3729     case OP_TYPEUPTO:
3730     case OP_TYPEMINUPTO:
3731     min = 0;
3732     max = GET2(ecode, 1);
3733     minimize = *ecode == OP_TYPEMINUPTO;
3734     ecode += 3;
3735     goto REPEATTYPE;
3736    
3737 nigel 93 case OP_TYPEPOSSTAR:
3738     possessive = TRUE;
3739     min = 0;
3740     max = INT_MAX;
3741     ecode++;
3742     goto REPEATTYPE;
3743    
3744     case OP_TYPEPOSPLUS:
3745     possessive = TRUE;
3746     min = 1;
3747     max = INT_MAX;
3748     ecode++;
3749     goto REPEATTYPE;
3750    
3751     case OP_TYPEPOSQUERY:
3752     possessive = TRUE;
3753     min = 0;
3754     max = 1;
3755     ecode++;
3756     goto REPEATTYPE;
3757    
3758     case OP_TYPEPOSUPTO:
3759     possessive = TRUE;
3760     min = 0;
3761     max = GET2(ecode, 1);
3762     ecode += 3;
3763     goto REPEATTYPE;
3764    
3765 nigel 77 case OP_TYPESTAR:
3766     case OP_TYPEMINSTAR:
3767     case OP_TYPEPLUS:
3768     case OP_TYPEMINPLUS:
3769     case OP_TYPEQUERY:
3770     case OP_TYPEMINQUERY:
3771     c = *ecode++ - OP_TYPESTAR;
3772     minimize = (c & 1) != 0;
3773     min = rep_min[c]; /* Pick up values from tables; */
3774     max = rep_max[c]; /* zero for max => infinity */
3775     if (max == 0) max = INT_MAX;
3776    
3777     /* Common code for all repeated single character type matches. Note that
3778     in UTF-8 mode, '.' matches a character of any length, but for the other
3779     character types, the valid characters are all one byte long. */
3780    
3781     REPEATTYPE:
3782     ctype = *ecode++; /* Code for the character type */
3783    
3784     #ifdef SUPPORT_UCP
3785     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3786     {
3787     prop_fail_result = ctype == OP_NOTPROP;
3788     prop_type = *ecode++;
3789 nigel 87 prop_value = *ecode++;
3790 nigel 77 }
3791     else prop_type = -1;
3792     #endif
3793    
3794     /* First, ensure the minimum number of matches is present. Use inline
3795     code for maximizing the speed, and do the type test once at the start
3796 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3797 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3798     and single-bytes. */
3799    
3800     if (min > 0)
3801     {
3802     #ifdef SUPPORT_UCP
3803 nigel 87 if (prop_type >= 0)
3804 nigel 77 {
3805 nigel 87 switch(prop_type)
3806 nigel 77 {
3807 nigel 87 case PT_ANY:
3808 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3809 nigel 87 for (i = 1; i <= min; i++)
3810     {
3811 ph10 427 if (eptr >= md->end_subject)
3812 ph10 426 {
3813 ph10 427 SCHECK_PARTIAL();
3814 ph10 510 MRRETURN(MATCH_NOMATCH);
3815 ph10 427 }
3816 ph10 184 GETCHARINCTEST(c, eptr);
3817 nigel 87 }
3818     break;
3819    
3820     case PT_LAMP:
3821     for (i = 1; i <= min; i++)
3822     {
3823 ph10 625 int chartype;
3824 ph10 427 if (eptr >= md->end_subject)
3825 ph10 426 {
3826 ph10 427 SCHECK_PARTIAL();
3827 ph10 510 MRRETURN(MATCH_NOMATCH);
3828 ph10 427 }
3829 ph10 184 GETCHARINCTEST(c, eptr);
3830 ph10 623 chartype = UCD_CHARTYPE(c);
3831     if ((chartype == ucp_Lu ||
3832     chartype == ucp_Ll ||
3833     chartype == ucp_Lt) == prop_fail_result)
3834 ph10 510 MRRETURN(MATCH_NOMATCH);
3835 nigel 87 }
3836     break;
3837    
3838     case PT_GC:
3839     for (i = 1; i <= min; i++)
3840     {
3841 ph10 427 if (eptr >= md->end_subject)
3842 ph10 426 {
3843 ph10 427 SCHECK_PARTIAL();
3844 ph10 510 MRRETURN(MATCH_NOMATCH);
3845 ph10 427 }
3846 ph10 184 GETCHARINCTEST(c, eptr);
3847 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3848 ph10 510 MRRETURN(MATCH_NOMATCH);
3849 nigel 87 }
3850     break;
3851    
3852     case PT_PC:
3853     for (i = 1; i <= min; i++)
3854     {
3855 ph10 427 if (eptr >= md->end_subject)
3856 ph10