/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 888 - (hide annotations) (download)
Tue Jan 17 14:43:23 2012 UTC (2 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 211788 byte(s)
Fix MARK bug for assertions.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. match() resets the field to 0 once it has
acted on MATCH_CBEGROUP. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. MATCH_COMMIT, MATCH_PRUNE,
MATCH_SKIP, MATCH_SKIP_ARG and MATCH_THEN are the values returned by the
OP_COMMIT, OP_PRUNE(_ARG), OP_SKIP, OP_SKIP_ARG and OP_THEN(_ARG) cases of
match() when the rest of the pattern fails to match. For MATCH_SKIP,
MATCH_SKIP_ARG and MATCH_THEN, extra information is passed back by overloading
md->start_match_ptr. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
86     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87     because the offset vector is always a multiple of 3 long. This value is the
dimension of the stacksave[] array, both as a local of match() and as the
Xstacksave member of the heapframe structure. */
88    
89     #define REC_STACK_SAVE_MAX 30
90    
91     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* The six entries pair up as (min,max) = (0,inf) (0,inf) (1,inf) (1,inf) (0,1)
(0,1). NOTE(review): presumably indexed by the offset of a repeat opcode from
the first opcode of its group (star, minstar, plus, minplus, query, minquery);
the use sites are not visible here - confirm. */
92    
93     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95    
96    
97    
98 ph10 475 #ifdef PCRE_DEBUG
99 nigel 77 /*************************************************
100     * Debugging function to print chars *
101     *************************************************/
102    
103     /* Print a sequence of chars in printable format, stopping at the end of the
104     subject if the requested.
105    
106     Arguments:
107     p points to characters
108     length number to print
109     is_subject TRUE if printing from within md->start_subject
110     md pointer to matching data block, if is_subject is TRUE
111    
112     Returns: nothing
113     */
114    
115     static void
116 ph10 836 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 nigel 77 {
118 nigel 93 unsigned int c;
119 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120     while (length-- > 0)
121     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122     }
123     #endif
124    
125    
126    
127     /*************************************************
128     * Match a back-reference *
129     *************************************************/
130    
131 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
132     negative, so the match always fails. However, in JavaScript compatibility mode,
133 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 ph10 595 subject bytes matched may be different to the number of reference bytes.
135 nigel 77
136     Arguments:
137     offset index into the offset vector
138 ph10 595 eptr pointer into the subject
139     length length of reference to be matched (number of bytes)
140 nigel 77 md points to match data block
141 ph10 602 caseless TRUE if caseless
142 nigel 77
143 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 nigel 77 */
145    
146 ph10 595 static int
147 ph10 836 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 ph10 602 BOOL caseless)
149 nigel 77 {
150 ph10 836 PCRE_PUCHAR eptr_start = eptr;
151     register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 nigel 77
153 ph10 475 #ifdef PCRE_DEBUG
154 nigel 77 if (eptr >= md->end_subject)
155     printf("matching subject <null>");
156     else
157     {
158     printf("matching subject ");
159     pchars(eptr, length, TRUE, md);
160     }
161     printf(" against backref ");
162     pchars(p, length, FALSE, md);
163     printf("\n");
164     #endif
165    
166 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
167 nigel 77
168 ph10 595 if (length < 0) return -1;
169 nigel 77
170 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171     properly if Unicode properties are supported. Otherwise, we can check only
172     ASCII characters. */
173 nigel 77
174 ph10 602 if (caseless)
175 nigel 77 {
176 ph10 836 #ifdef SUPPORT_UTF
177 ph10 354 #ifdef SUPPORT_UCP
178 ph10 836 if (md->utf)
179 ph10 354 {
180 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
181 ph10 595 bytes matched may differ, because there are some characters whose upper and
182     lower case versions code as different numbers of bytes. For example, U+023A
183     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 ph10 625 the latter. It is important, therefore, to check the length along the
186 ph10 595 reference, not along the subject (earlier code did this wrong). */
187 ph10 625
188 ph10 836 PCRE_PUCHAR endptr = p + length;
189 ph10 595 while (p < endptr)
190 ph10 354 {
191 ph10 358 int c, d;
192 ph10 597 if (eptr >= md->end_subject) return -1;
193 ph10 354 GETCHARINC(c, eptr);
194     GETCHARINC(d, p);
195 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 ph10 358 }
197     }
198 ph10 354 else
199     #endif
200     #endif
201    
202     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203     is no UCP support. */
204 ph10 597 {
205 ph10 625 if (eptr + length > md->end_subject) return -1;
206 ph10 597 while (length-- > 0)
207 ph10 836 {
208     if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209     p++;
210     eptr++;
211     }
212 ph10 625 }
213 nigel 77 }
214 ph10 358
215 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
216     are in UTF-8 mode. */
217 ph10 358
218 nigel 77 else
219 ph10 625 {
220     if (eptr + length > md->end_subject) return -1;
221     while (length-- > 0) if (*p++ != *eptr++) return -1;
222 ph10 597 }
223 nigel 77
224 ph10 836 return (int)(eptr - eptr_start);
225 nigel 77 }
226    
227    
228    
229     /***************************************************************************
230     ****************************************************************************
231     RECURSION IN THE match() FUNCTION
232    
233 nigel 87 The match() function is highly recursive, though not every recursive call
234     increases the recursive depth. Nevertheless, some regular expressions can cause
235     it to recurse to a great depth. I was writing for Unix, so I just let it call
236     itself recursively. This uses the stack for saving everything that has to be
237     saved for a recursive call. On Unix, the stack can be large, and this works
238     fine.
239 nigel 77
240 nigel 87 It turns out that on some non-Unix-like systems there are problems with
241     programs that use a lot of stack. (This despite the fact that every last chip
242     has oodles of memory these days, and techniques for extending the stack have
243     been known for decades.) So....
244 nigel 77
245     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246     calls by keeping local variables that need to be preserved in blocks of memory
247 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
248 nigel 77 achieve this so that the actual code doesn't look very different to what it
249     always used to.
250 ph10 164
251 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
252 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
253     Switzer, the use of longjmp() has been abolished, at the cost of having to
254     provide a unique number for each call to RMATCH. There is no way of generating
255     a sequence of numbers at compile time in C. I have given them names, to make
256     them stand out more clearly.
257    
258     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
261     don't have indeterminate values; this has meant that the frame size can be
262 ph10 164 reduced because the result can be "passed back" by straight setting of the
263     variable instead of being passed in the frame.
264 nigel 77 ****************************************************************************
265     ***************************************************************************/
266    
267 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268     below must be updated in sync. The enumerators run consecutively from RM1 = 1
to RM66 = 66. Each one is passed as the last ("rw") argument of one RMATCH
call; in the NO_RECURSE versions of the macros RMATCH stores it in
frame->Xwhere and pastes it into the name of the label (L_RMn) that marks the
point to resume at, so every RMATCH call needs its own distinct value. */
269 nigel 77
270 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 ph10 723 RM61, RM62, RM63, RM64, RM65, RM66 };
277 ph10 164
278 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
279 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 ph10 501 actually used in this definition. */
281 nigel 77
282     #ifndef NO_RECURSE
283     #define REGISTER register
284 ph10 164
285 ph10 475 #ifdef PCRE_DEBUG
/* Debugging form: a real recursive call to match(), bracketed by trace output.
The result is left in the caller's local variable rrc. mstart and rdepth are
taken from the calling scope rather than from the macro arguments. */
286 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 nigel 87 { \
288     printf("match() called in line %d\n", __LINE__); \
289 ph10 836 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 nigel 87 printf("to line %d\n", __LINE__); \
291     }
/* Debugging form of return. Note that "ra" is expanded twice (once for the
trace output and once for the return statement). */
292     #define RRETURN(ra) \
293     { \
294     printf("match() returned %d from line %d ", ra, __LINE__); \
295     return ra; \
296     }
297     #else
/* Production form: RMATCH is a plain recursive call whose result is assigned
to rrc, and RRETURN is a plain return. */
298 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 ph10 836 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 nigel 77 #define RRETURN(ra) return ra
301 nigel 87 #endif
302    
303 nigel 77 #else
304    
305    
306 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
307     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308     argument of match(), which never changes. */
309 nigel 77
310     #define REGISTER
311    
/* Heap form of a "recursive call". A new heapframe is obtained from
PUBL(stack_malloc); if that fails, RRETURN(PCRE_ERROR_NOMEMORY) is executed.
Otherwise the resume point "rw" is recorded in the current frame's Xwhere, the
argument values are copied into the new frame, the new frame is linked to the
current one through Xprevframe and becomes the current frame, and control jumps
to HEAP_RECURSE. The label L_<rw> that follows the goto is the place at which
execution continues after the "call" has returned. */
312 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 nigel 77 {\
314 ph10 836 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 ph10 164 frame->Xwhere = rw; \
317     newframe->Xeptr = ra;\
318     newframe->Xecode = rb;\
319 ph10 168 newframe->Xmstart = mstart;\
320 ph10 164 newframe->Xoffset_top = rc;\
321 ph10 602 newframe->Xeptrb = re;\
322 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
323     newframe->Xprevframe = frame;\
324     frame = newframe;\
325     DPRINTF(("restarting from line %d\n", __LINE__));\
326     goto HEAP_RECURSE;\
327     L_##rw:\
328     DPRINTF(("jumped back to line %d\n", __LINE__));\
329 nigel 77 }
330    
/* Heap form of "return". The current frame is released with PUBL(stack_free)
and the previous frame becomes current. If there is a previous frame, the
result is placed in rrc and control jumps to HEAP_RETURN; if the released frame
was the top-level one (Xprevframe == NULL), this is a genuine return from
match().
NOTE(review): "ra" is expanded only after "frame" has been switched back to the
previous frame and the old frame has been freed, so an argument that names one
of the frame-> macro variables would be read from the caller's frame - confirm
that all uses pass rrc or a constant. */
331     #define RRETURN(ra)\
332     {\
333 ph10 527 heapframe *oldframe = frame;\
334     frame = oldframe->Xprevframe;\
335 ph10 836 (PUBL(stack_free))(oldframe);\
336 nigel 77 if (frame != NULL)\
337     {\
338 ph10 164 rrc = ra;\
339     goto HEAP_RETURN;\
340 nigel 77 }\
341     return ra;\
342     }
343    
344    
345     /* Structure for remembering the local variables in a private frame */
/* One heapframe stands for one level of "recursion" of match() when
NO_RECURSE is defined. Inside match() each member is reached through a macro
with the same name minus the leading X (for example, eptr expands to
frame->Xeptr), so these names must stay in step with those macro definitions. */
346    
347     typedef struct heapframe {
/* The frame that issued the RMATCH; NULL marks the top-level frame. */
348     struct heapframe *Xprevframe;
349    
350     /* Function arguments that may change */
/* Filled in from the arguments of match() for the top-level frame, and by
RMATCH for every frame below it. */
351    
352 ph10 836 PCRE_PUCHAR Xeptr;
353     const pcre_uchar *Xecode;
354     PCRE_PUCHAR Xmstart;
355 nigel 77 int Xoffset_top;
356     eptrblock *Xeptrb;
357 nigel 91 unsigned int Xrdepth;
358 nigel 77
359     /* Function local variables */
/* These mirror the block of local declarations used by match() when
NO_RECURSE is not defined. */
360    
361 ph10 836 PCRE_PUCHAR Xcallpat;
362     #ifdef SUPPORT_UTF
363     PCRE_PUCHAR Xcharptr;
364 ph10 406 #endif
365 ph10 836 PCRE_PUCHAR Xdata;
366     PCRE_PUCHAR Xnext;
367     PCRE_PUCHAR Xpp;
368     PCRE_PUCHAR Xprev;
369     PCRE_PUCHAR Xsaved_eptr;
370 nigel 77
371     recursion_info Xnew_recursive;
372    
/* match() also reaches these three flags under alternative names:
allow_zero (cur_is_word), cbegroup and condassert (condition), and
matched_once (prev_is_word). */
373     BOOL Xcur_is_word;
374     BOOL Xcondition;
375     BOOL Xprev_is_word;
376    
377     #ifdef SUPPORT_UCP
378     int Xprop_type;
379 nigel 87 int Xprop_value;
380 nigel 77 int Xprop_fail_result;
381 ph10 123 int Xoclength;
382 ph10 836 pcre_uchar Xocchars[6];
383 nigel 77 #endif
384    
385 ph10 403 int Xcodelink;
386 nigel 77 int Xctype;
387 nigel 93 unsigned int Xfc;
388 nigel 77 int Xfi;
389     int Xlength;
390     int Xmax;
391     int Xmin;
392     int Xnumber;
393     int Xoffset;
394     int Xop;
395     int Xsave_capture_last;
396     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397     int Xstacksave[REC_STACK_SAVE_MAX];
398    
/* Block that match() links onto the eptrb chain when it is entered with
md->match_function_type set to MATCH_CBEGROUP. */
399     eptrblock Xnewptrb;
400    
401 ph10 164 /* Where to jump back to */
/* Holds the RMn value written by RMATCH just before it switches to the new
frame. NOTE(review): presumably examined at HEAP_RETURN to select the L_RMn
label; that code is outside the visible part of the file. */
402 nigel 77
403 ph10 164 int Xwhere;
404 ph10 165
405 nigel 77 } heapframe;
406    
407     #endif
408    
409    
410     /***************************************************************************
411     ***************************************************************************/
412    
413    
414    
415     /*************************************************
416     * Match from current position *
417     *************************************************/
418    
419 nigel 93 /* This function is called recursively in many circumstances. Whenever it
420 nigel 77 returns a negative (error) response, the outer incarnation must also return the
421 ph10 426 same response. */
422 nigel 77
423 ph10 426 /* These macros pack up tests that are used for partial matching, and which
424 ph10 836 appear several times in the code. We set the "hit end" flag if the pointer is
425 ph10 426 at the end of the subject and also past the start of the subject (i.e.
426 ph10 427 something has been matched). For hard partial matching, we then return
427     immediately. The second one is used when we already know we are past the end of
428     the subject. */
429 ph10 426
/* CHECK_PARTIAL: if partial matching is enabled (md->partial != 0), the
current subject pointer has reached md->end_subject, and it is beyond
md->start_used_ptr, record the fact in md->hitend; when md->partial > 1, give up
at once with PCRE_ERROR_PARTIAL. Both macros rely on eptr and md being in scope
and use RRETURN, so they can be used only inside match(). Each one expands to a
bare "if" statement, so an "else" written directly after an invocation would
attach to that "if". */
430     #define CHECK_PARTIAL()\
431 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
432     eptr > md->start_used_ptr) \
433     { \
434     md->hitend = TRUE; \
435 ph10 836 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 ph10 427 }
437 ph10 426
/* SCHECK_PARTIAL: the same, but without the test against md->end_subject, for
places where the code already knows that the end of the subject has been
passed. */
438     #define SCHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
440     { \
441     md->hitend = TRUE; \
442 ph10 836 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 ph10 427 }
444 ph10 426
445 ph10 427
446 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
447 ph10 836 the md structure (e.g. utf, end_subject) into individual variables to improve
448 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449     made performance worse.
450    
451     Arguments:
452 nigel 93 eptr pointer to current character in subject
453     ecode pointer to current position in compiled code
454 ph10 168 mstart pointer to the current match start position (can be modified
455 ph10 172 by encountering \K)
456 nigel 77 offset_top current top pointer
457     md pointer to "static" info for the match
458     eptrb pointer to chain of blocks containing eptr at start of
459     brackets - for testing for empty matches
460 nigel 87 rdepth the recursion depth
461 nigel 77
462     Returns: MATCH_MATCH if matched ) these values are >= 0
463     MATCH_NOMATCH if failed to match )
464 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 nigel 87 (e.g. stopped by repeated call or recursion limit)
467 nigel 77 */
468    
469     static int
470 ph10 836 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 ph10 842 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 ph10 835 unsigned int rdepth)
473 nigel 77 {
474     /* These variables do not need to be preserved over recursion in this function,
475 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
476     "register" because they are used a lot in loops. */
477 nigel 77
478 nigel 91 register int rrc; /* Returns from recursive calls */
479     register int i; /* Used for loops not involving calls to RMATCH() */
480 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 ph10 836 register BOOL utf; /* Local copy of UTF flag for speed */
482 nigel 77
483 nigel 93 BOOL minimize, possessive; /* Quantifier options */
484 ph10 602 BOOL caseless;
485 ph10 403 int condcode;
486 nigel 93
487 nigel 77 /* When recursion is not being used, all "local" variables that have to be
488     preserved over calls to RMATCH() are part of a "frame" which is obtained from
489     heap storage. Set up the top-level frame here; others are obtained from the
490     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491    
492     #ifdef NO_RECURSE
493 ph10 836 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
496    
497     /* Copy in the original argument variables */
498    
499     frame->Xeptr = eptr;
500     frame->Xecode = ecode;
501 ph10 168 frame->Xmstart = mstart;
502 nigel 77 frame->Xoffset_top = offset_top;
503     frame->Xeptrb = eptrb;
504 nigel 87 frame->Xrdepth = rdepth;
505 nigel 77
506     /* This is where control jumps back to in order to effect "recursion" */
507    
508     HEAP_RECURSE:
509    
510     /* Macros make the argument variables come from the current frame */
511    
512     #define eptr frame->Xeptr
513     #define ecode frame->Xecode
514 ph10 168 #define mstart frame->Xmstart
515 nigel 77 #define offset_top frame->Xoffset_top
516     #define eptrb frame->Xeptrb
517 nigel 87 #define rdepth frame->Xrdepth
518 nigel 77
519     /* Ditto for the local variables */
520    
521 ph10 836 #ifdef SUPPORT_UTF
522 nigel 77 #define charptr frame->Xcharptr
523     #endif
524     #define callpat frame->Xcallpat
525 ph10 403 #define codelink frame->Xcodelink
526 nigel 77 #define data frame->Xdata
527     #define next frame->Xnext
528     #define pp frame->Xpp
529     #define prev frame->Xprev
530     #define saved_eptr frame->Xsaved_eptr
531    
532     #define new_recursive frame->Xnew_recursive
533    
534     #define cur_is_word frame->Xcur_is_word
535     #define condition frame->Xcondition
536     #define prev_is_word frame->Xprev_is_word
537    
538     #ifdef SUPPORT_UCP
539     #define prop_type frame->Xprop_type
540 nigel 87 #define prop_value frame->Xprop_value
541 nigel 77 #define prop_fail_result frame->Xprop_fail_result
542 ph10 115 #define oclength frame->Xoclength
543     #define occhars frame->Xocchars
544 nigel 77 #endif
545    
546     #define ctype frame->Xctype
547     #define fc frame->Xfc
548     #define fi frame->Xfi
549     #define length frame->Xlength
550     #define max frame->Xmax
551     #define min frame->Xmin
552     #define number frame->Xnumber
553     #define offset frame->Xoffset
554     #define op frame->Xop
555     #define save_capture_last frame->Xsave_capture_last
556     #define save_offset1 frame->Xsave_offset1
557     #define save_offset2 frame->Xsave_offset2
558     #define save_offset3 frame->Xsave_offset3
559     #define stacksave frame->Xstacksave
560    
561     #define newptrb frame->Xnewptrb
562    
563     /* When recursion is being used, local variables are allocated on the stack and
564     get preserved during recursion in the normal way. In this environment, fi and
565     i, and fc and c, can be the same variables. */
566    
567 nigel 93 #else /* NO_RECURSE not defined */
568 nigel 77 #define fi i
569     #define fc c
570    
571 ph10 604 /* Many of the following variables are used only in small blocks of the code.
572     My normal style of coding would have declared them within each of those blocks.
573     However, in order to accommodate the version of this code that uses an external
574     "stack" implemented on the heap, it is easier to declare them all here, so the
575     declarations can be cut out in a block. The only declarations within blocks
576     below are for variables that do not have to be preserved over a recursive call
577     to RMATCH(). */
578 nigel 77
579 ph10 836 #ifdef SUPPORT_UTF
580     const pcre_uchar *charptr;
581 ph10 625 #endif
582 ph10 836 const pcre_uchar *callpat;
583     const pcre_uchar *data;
584     const pcre_uchar *next;
585     PCRE_PUCHAR pp;
586     const pcre_uchar *prev;
587     PCRE_PUCHAR saved_eptr;
588 ph10 625
589     recursion_info new_recursive;
590    
591     BOOL cur_is_word;
592 nigel 87 BOOL condition;
593 nigel 77 BOOL prev_is_word;
594    
595     #ifdef SUPPORT_UCP
596     int prop_type;
597 nigel 87 int prop_value;
598 nigel 77 int prop_fail_result;
599 ph10 115 int oclength;
600 ph10 836 pcre_uchar occhars[6];
601 nigel 77 #endif
602    
603 ph10 399 int codelink;
604 nigel 77 int ctype;
605     int length;
606     int max;
607     int min;
608     int number;
609     int offset;
610     int op;
611     int save_capture_last;
612     int save_offset1, save_offset2, save_offset3;
613     int stacksave[REC_STACK_SAVE_MAX];
614    
615     eptrblock newptrb;
616 nigel 93 #endif /* NO_RECURSE */
617 nigel 77
618 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
619     of the local variables that are used only in localised parts of the code, but
620     still need to be preserved over recursive calls of match(). These macros define
621 ph10 604 the alternative names that are used. */
622    
623     #define allow_zero cur_is_word
624     #define cbegroup condition
625     #define code_offset codelink
626     #define condassert condition
627     #define matched_once prev_is_word
628 ph10 836 #define foc number
629 ph10 882 #define save_mark data
630 ph10 604
631 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
632     variables. */
633    
634     #ifdef SUPPORT_UCP
635 nigel 87 prop_value = 0;
636 nigel 77 prop_fail_result = 0;
637     #endif
638    
639 nigel 93
640 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
641     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
642     used. Thanks to Ian Taylor for noticing this possibility and sending the
643     original patch. */
644    
645     TAIL_RECURSE:
646    
647 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
648     are specified by the macro RMATCH and RRETURN is used to return. When
649     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
650 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
651 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
652     complicated macro. It has to be used in one particular way. This shouldn't,
653     however, impact performance when true recursion is being used. */
654 nigel 77
655 ph10 836 #ifdef SUPPORT_UTF
656     utf = md->utf; /* Local copy of the flag */
657 ph10 164 #else
658 ph10 836 utf = FALSE;
659 ph10 164 #endif
660    
661 nigel 87 /* First check that we haven't called match() too many times, or that we
662     haven't exceeded the recursive call limit. */
663    
664 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
665 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
666 nigel 77
667 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
668 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
669     done this way to save having to use another function argument, which would take
670 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
671 nigel 77
672 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
673     such remembered pointers, to be checked when we hit the closing ket, in order
674     to break infinite loops that match no characters. When match() is called in
675     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
676     NOT be used with tail recursion, because the memory block that is used is on
677     the stack, so a new one may be required for each match(). */
678    
679     if (md->match_function_type == MATCH_CBEGROUP)
680 nigel 77 {
681 ph10 197 newptrb.epb_saved_eptr = eptr;
682     newptrb.epb_prev = eptrb;
683     eptrb = &newptrb;
684 ph10 604 md->match_function_type = 0;
685 nigel 77 }
686    
687 nigel 93 /* Now start processing the opcodes. */
688 nigel 77
689     for (;;)
690     {
691 nigel 93 minimize = possessive = FALSE;
692 nigel 77 op = *ecode;
693 ph10 625
694 nigel 93 switch(op)
695     {
696 ph10 510 case OP_MARK:
697 ph10 836 md->nomatch_mark = ecode + 2;
698     md->mark = NULL; /* In case previously set by assertion */
699     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
700 ph10 604 eptrb, RM55);
701 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
702     md->mark == NULL) md->mark = ecode + 2;
703 ph10 512
704     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
705     argument, and we must check whether that argument matches this MARK's
706     argument. It is passed back in md->start_match_ptr (an overloading of that
707     variable). If it does match, we reset that variable to the current subject
708     position and return MATCH_SKIP. Otherwise, pass back the return code
709 ph10 510 unaltered. */
710 ph10 512
711 ph10 836 else if (rrc == MATCH_SKIP_ARG &&
712     STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
713 ph10 510 {
714     md->start_match_ptr = eptr;
715     RRETURN(MATCH_SKIP);
716     }
717     RRETURN(rrc);
718    
719 ph10 210 case OP_FAIL:
720 ph10 836 RRETURN(MATCH_NOMATCH);
721 ph10 211
722 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
723 ph10 553
724 ph10 510 case OP_COMMIT:
725 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
726 ph10 604 eptrb, RM52);
727 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
728 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
729     rrc != MATCH_THEN)
730 ph10 551 RRETURN(rrc);
731 ph10 836 RRETURN(MATCH_COMMIT);
732 ph10 510
733 ph10 551 /* PRUNE overrides THEN */
734 ph10 553
735 ph10 210 case OP_PRUNE:
736 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
737 ph10 604 eptrb, RM51);
738 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 ph10 836 RRETURN(MATCH_PRUNE);
740 ph10 211
741 ph10 510 case OP_PRUNE_ARG:
742 ph10 836 md->nomatch_mark = ecode + 2;
743     md->mark = NULL; /* In case previously set by assertion */
744     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
745 ph10 604 eptrb, RM56);
746 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
747     md->mark == NULL) md->mark = ecode + 2;
748 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
749 ph10 510 RRETURN(MATCH_PRUNE);
750 ph10 211
751 ph10 551 /* SKIP overrides PRUNE and THEN */
752 ph10 553
753 ph10 210 case OP_SKIP:
754 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
755 ph10 604 eptrb, RM53);
756 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
757 ph10 551 RRETURN(rrc);
758 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
759 ph10 836 RRETURN(MATCH_SKIP);
760 ph10 211
761 ph10 836 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
762     nomatch_mark. There is a flag that disables this opcode when re-matching a
763     pattern that ended with a SKIP for which there was not a matching MARK. */
764    
765 ph10 510 case OP_SKIP_ARG:
766 ph10 836 if (md->ignore_skip_arg)
767     {
768     ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
769     break;
770     }
771     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
772 ph10 604 eptrb, RM57);
773 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
774 ph10 551 RRETURN(rrc);
775 ph10 512
776     /* Pass back the current skip name by overloading md->start_match_ptr and
777     returning the special MATCH_SKIP_ARG return code. This will either be
778 ph10 836 caught by a matching MARK, or get to the top, where it causes a rematch
779     with the md->ignore_skip_arg flag set. */
780 ph10 512
781 ph10 510 md->start_match_ptr = ecode + 2;
782 ph10 512 RRETURN(MATCH_SKIP_ARG);
783 ph10 553
784 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
785     the branch in which it occurs can be determined. Overload the start of
786     match pointer to do this. */
787 ph10 512
788 ph10 210 case OP_THEN:
789 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 ph10 604 eptrb, RM54);
791 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 ph10 716 md->start_match_ptr = ecode;
793 ph10 836 RRETURN(MATCH_THEN);
794 ph10 510
795     case OP_THEN_ARG:
796 ph10 836 md->nomatch_mark = ecode + 2;
797     md->mark = NULL; /* In case previously set by assertion */
798     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
799 ph10 716 md, eptrb, RM58);
800 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
801     md->mark == NULL) md->mark = ecode + 2;
802 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 ph10 733 md->start_match_ptr = ecode;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 733
806 ph10 723 /* Handle an atomic group that does not contain any capturing parentheses.
807 ph10 733 This can be handled like an assertion. Prior to 8.13, all atomic groups
808     were handled this way. In 8.13, the code was changed as below for ONCE, so
809     that backups pass through the group and thereby reset captured values.
810     However, this uses a lot more stack, so in 8.20, atomic groups that do not
811     contain any captures generate OP_ONCE_NC, which can be handled in the old,
812 ph10 723 less stack intensive way.
813 ph10 211
814 ph10 723 Check the alternative branches in turn - the matching won't pass the KET
815     for this kind of subpattern. If any one branch matches, we carry on as at
816     the end of a normal bracket, leaving the subject pointer, but resetting
817     the start-of-match value in case it was changed by \K. */
818    
819     case OP_ONCE_NC:
820     prev = ecode;
821     saved_eptr = eptr;
822 ph10 882 save_mark = md->mark;
823 ph10 723 do
824     {
825     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
826     if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
827     {
828     mstart = md->start_match_ptr;
829     break;
830     }
831     if (rrc == MATCH_THEN)
832     {
833     next = ecode + GET(ecode,1);
834 ph10 733 if (md->start_match_ptr < next &&
835 ph10 723 (*ecode == OP_ALT || *next == OP_ALT))
836     rrc = MATCH_NOMATCH;
837 ph10 733 }
838    
839 ph10 723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
840     ecode += GET(ecode,1);
841 ph10 882 md->mark = save_mark;
842 ph10 723 }
843     while (*ecode == OP_ALT);
844    
845     /* If we hit the end of the group (which could be repeated), fail */
846    
847     if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
848    
849     /* Continue as from after the group, updating the offsets high water
850     mark, since extracts may have been taken. */
851    
852     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
853    
854     offset_top = md->end_offset_top;
855     eptr = md->end_match_ptr;
856    
857     /* For a non-repeating ket, just continue at this level. This also
858     happens for a repeating ket if no characters were matched in the group.
859     This is the forcible breaking of infinite loops as implemented in Perl
860     5.005. */
861    
862     if (*ecode == OP_KET || eptr == saved_eptr)
863     {
864     ecode += 1+LINK_SIZE;
865     break;
866     }
867    
868     /* The repeating kets try the rest of the pattern or restart from the
869     preceding bracket, in the appropriate order. The second "call" of match()
870     uses tail recursion, to avoid using another stack frame. */
871    
872     if (*ecode == OP_KETRMIN)
873     {
874     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
875     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
876     ecode = prev;
877     goto TAIL_RECURSE;
878     }
879     else /* OP_KETRMAX */
880     {
881 ph10 733 md->match_function_type = MATCH_CBEGROUP;
882 ph10 723 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
883     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
884     ecode += 1 + LINK_SIZE;
885     goto TAIL_RECURSE;
886     }
887     /* Control never gets here */
888    
889 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
890     unlimited repeat. If there is space in the offset vector, save the current
891     subject position in the working slot at the top of the vector. We mustn't
892     change the current values of the data slot, because they may be set from a
893     previous iteration of this group, and be referred to by a reference inside
894 ph10 625 the group. A failure to match might occur after the group has succeeded,
895 ph10 617 if something later on doesn't match. For this reason, we need to restore
896     the working value and also the values of the final offsets, in case they
897     were set by a previous iteration of the same bracket.
898 nigel 77
899 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
900     a non-capturing bracket. Don't worry about setting the flag for the error
901     case here; that is handled in the code for KET. */
902 nigel 77
903 nigel 93 case OP_CBRA:
904     case OP_SCBRA:
905     number = GET2(ecode, 1+LINK_SIZE);
906 nigel 77 offset = number << 1;
907 ph10 625
908 ph10 475 #ifdef PCRE_DEBUG
909 nigel 93 printf("start bracket %d\n", number);
910     printf("subject=");
911 nigel 77 pchars(eptr, 16, TRUE, md);
912     printf("\n");
913     #endif
914    
915     if (offset < md->offset_max)
916     {
917     save_offset1 = md->offset_vector[offset];
918     save_offset2 = md->offset_vector[offset+1];
919     save_offset3 = md->offset_vector[md->offset_end - number];
920     save_capture_last = md->capture_last;
921 ph10 882 save_mark = md->mark;
922 nigel 77
923     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
924 ph10 531 md->offset_vector[md->offset_end - number] =
925 ph10 530 (int)(eptr - md->start_subject);
926 nigel 77
927 ph10 604 for (;;)
928 nigel 77 {
929 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
930 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
931 ph10 604 eptrb, RM1);
932 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
933 ph10 733
934     /* If we backed up to a THEN, check whether it is within the current
935     branch by comparing the address of the THEN that is passed back with
936 ph10 716 the end of the branch. If it is within the current branch, and the
937     branch is one of two or more alternatives (it either starts or ends
938 ph10 733 with OP_ALT), we have reached the limit of THEN's action, so convert
939     the return code to NOMATCH, which will cause normal backtracking to
940 ph10 716 happen from now on. Otherwise, THEN is passed back to an outer
941 ph10 733 alternative. This implements Perl's treatment of parenthesized groups,
942     where a group not containing | does not affect the current alternative,
943 ph10 716 that is, (X) is NOT the same as (X|(*F)). */
944    
945     if (rrc == MATCH_THEN)
946     {
947     next = ecode + GET(ecode,1);
948 ph10 733 if (md->start_match_ptr < next &&
949 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
950     rrc = MATCH_NOMATCH;
951 ph10 733 }
952    
953 ph10 716 /* Anything other than NOMATCH is passed back. */
954    
955     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
956 nigel 77 md->capture_last = save_capture_last;
957     ecode += GET(ecode, 1);
958 ph10 882 md->mark = save_mark;
959 ph10 625 if (*ecode != OP_ALT) break;
960 nigel 77 }
961    
962     DPRINTF(("bracket %d failed\n", number));
963     md->offset_vector[offset] = save_offset1;
964     md->offset_vector[offset+1] = save_offset2;
965     md->offset_vector[md->offset_end - number] = save_offset3;
966 ph10 625
967 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
968 nigel 77
969 ph10 716 RRETURN(rrc);
970 nigel 77 }
971    
972 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
973     as a non-capturing bracket. */
974 nigel 77
975 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977    
978 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
979 nigel 77
980 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
981     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
982    
983 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
984 ph10 723 repeat and ONCE group with no captures. Loop for all the alternatives.
985 ph10 708
986 ph10 702 When we get to the final alternative within the brackets, we used to return
987     the result of a recursive call to match() whatever happened so it was
988     possible to reduce stack usage by turning this into a tail recursion,
989     except in the case of a possibly empty group. However, now that there is
990     the possibility of (*THEN) occurring in the final alternative, this
991     optimization is no longer always possible.
992 ph10 625
993 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
994     this is the best that can be done.
995    
996 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
997     reached, but subsequent matching fails. It passes back up the tree (causing
998     captured values to be reset) until the original atomic group level is
999 ph10 618 reached. This is tested by comparing md->once_target with the start of the
1000     group. At this point, the return is converted into MATCH_NOMATCH so that
1001     previous backup points can be taken. */
1002 nigel 77
1003 ph10 618 case OP_ONCE:
1004 nigel 93 case OP_BRA:
1005     case OP_SBRA:
1006     DPRINTF(("start non-capturing bracket\n"));
1007 ph10 618
1008 nigel 91 for (;;)
1009 nigel 77 {
1010 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1011 ph10 702
1012     /* If this is not a possibly empty group, and there are no (*THEN)s in
1013 ph10 708 the pattern, and this is the final alternative, optimize as described
1014 ph10 702 above. */
1015    
1016     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1017     {
1018 ph10 836 ecode += PRIV(OP_lengths)[*ecode];
1019 ph10 702 goto TAIL_RECURSE;
1020 ph10 708 }
1021 ph10 702
1022     /* In all other cases, we have to make another call to match(). */
1023    
1024 ph10 882 save_mark = md->mark;
1025 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1026 ph10 604 RM2);
1027 ph10 882
1028 ph10 716 /* See comment in the code for capturing groups above about handling
1029     THEN. */
1030    
1031     if (rrc == MATCH_THEN)
1032 ph10 625 {
1033 ph10 716 next = ecode + GET(ecode,1);
1034 ph10 733 if (md->start_match_ptr < next &&
1035 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1036     rrc = MATCH_NOMATCH;
1037 ph10 733 }
1038    
1039     if (rrc != MATCH_NOMATCH)
1040 ph10 716 {
1041 ph10 618 if (rrc == MATCH_ONCE)
1042     {
1043 ph10 836 const pcre_uchar *scode = ecode;
1044 ph10 618 if (*scode != OP_ONCE) /* If not at start, find it */
1045     {
1046     while (*scode == OP_ALT) scode += GET(scode, 1);
1047     scode -= GET(scode, 1);
1048 ph10 625 }
1049 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1050 ph10 625 }
1051 ph10 550 RRETURN(rrc);
1052 ph10 625 }
1053 nigel 77 ecode += GET(ecode, 1);
1054 ph10 882 md->mark = save_mark;
1055 ph10 625 if (*ecode != OP_ALT) break;
1056 nigel 77 }
1057 ph10 733
1058 ph10 609 RRETURN(MATCH_NOMATCH);
1059    
1060 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
1061 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1062     handled similarly to the normal case above. However, the matching is
1063     different. The end of these brackets will always be OP_KETRPOS, which
1064     returns MATCH_KETRPOS without going further in the pattern. By this means
1065     we can handle the group by iteration rather than recursion, thereby
1066     reducing the amount of stack needed. */
1067 ph10 625
1068 ph10 604 case OP_CBRAPOS:
1069     case OP_SCBRAPOS:
1070     allow_zero = FALSE;
1071 ph10 625
1072 ph10 604 POSSESSIVE_CAPTURE:
1073     number = GET2(ecode, 1+LINK_SIZE);
1074     offset = number << 1;
1075    
1076     #ifdef PCRE_DEBUG
1077     printf("start possessive bracket %d\n", number);
1078     printf("subject=");
1079     pchars(eptr, 16, TRUE, md);
1080     printf("\n");
1081     #endif
1082    
1083     if (offset < md->offset_max)
1084     {
1085     matched_once = FALSE;
1086 ph10 836 code_offset = (int)(ecode - md->start_code);
1087 ph10 604
1088     save_offset1 = md->offset_vector[offset];
1089     save_offset2 = md->offset_vector[offset+1];
1090     save_offset3 = md->offset_vector[md->offset_end - number];
1091     save_capture_last = md->capture_last;
1092    
1093     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1094 ph10 625
1095     /* Each time round the loop, save the current subject position for use
1096     when the group matches. For MATCH_MATCH, the group has matched, so we
1097     restart it with a new subject starting position, remembering that we had
1098     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1099     usual. If we haven't matched any alternatives in any iteration, check to
1100     see if a previous iteration matched. If so, the group has matched;
1101     continue from afterwards. Otherwise it has failed; restore the previous
1102 ph10 604 capture values before returning NOMATCH. */
1103 ph10 625
1104 ph10 604 for (;;)
1105     {
1106     md->offset_vector[md->offset_end - number] =
1107     (int)(eptr - md->start_subject);
1108 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1109 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1110 ph10 604 eptrb, RM63);
1111     if (rrc == MATCH_KETRPOS)
1112     {
1113     offset_top = md->end_offset_top;
1114     eptr = md->end_match_ptr;
1115 ph10 625 ecode = md->start_code + code_offset;
1116 ph10 604 save_capture_last = md->capture_last;
1117 ph10 625 matched_once = TRUE;
1118     continue;
1119     }
1120 ph10 733
1121 ph10 716 /* See comment in the code for capturing groups above about handling
1122     THEN. */
1123    
1124     if (rrc == MATCH_THEN)
1125     {
1126     next = ecode + GET(ecode,1);
1127 ph10 733 if (md->start_match_ptr < next &&
1128 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1129     rrc = MATCH_NOMATCH;
1130 ph10 733 }
1131 ph10 716
1132     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1133 ph10 604 md->capture_last = save_capture_last;
1134     ecode += GET(ecode, 1);
1135 ph10 625 if (*ecode != OP_ALT) break;
1136 ph10 604 }
1137 ph10 610
1138 ph10 604 if (!matched_once)
1139 ph10 625 {
1140 ph10 604 md->offset_vector[offset] = save_offset1;
1141     md->offset_vector[offset+1] = save_offset2;
1142     md->offset_vector[md->offset_end - number] = save_offset3;
1143     }
1144 ph10 625
1145 ph10 604 if (allow_zero || matched_once)
1146 ph10 625 {
1147 ph10 604 ecode += 1 + LINK_SIZE;
1148     break;
1149 ph10 625 }
1150    
1151 ph10 604 RRETURN(MATCH_NOMATCH);
1152     }
1153 ph10 625
1154 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1155     as a non-capturing bracket. */
1156    
1157     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1158     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1159    
1160     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1161    
1162     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1163     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1164    
1165 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1166 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1167     without the capturing complication. It is written out separately for speed
1168     and cleanliness. */
1169    
1170     case OP_BRAPOS:
1171     case OP_SBRAPOS:
1172 ph10 625 allow_zero = FALSE;
1173    
1174 ph10 604 POSSESSIVE_NON_CAPTURE:
1175     matched_once = FALSE;
1176 ph10 836 code_offset = (int)(ecode - md->start_code);
1177 ph10 604
1178     for (;;)
1179     {
1180 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1181 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1182 ph10 609 eptrb, RM48);
1183 ph10 604 if (rrc == MATCH_KETRPOS)
1184     {
1185 ph10 610 offset_top = md->end_offset_top;
1186 ph10 604 eptr = md->end_match_ptr;
1187 ph10 625 ecode = md->start_code + code_offset;
1188     matched_once = TRUE;
1189     continue;
1190     }
1191 ph10 733
1192 ph10 716 /* See comment in the code for capturing groups above about handling
1193     THEN. */
1194    
1195     if (rrc == MATCH_THEN)
1196     {
1197     next = ecode + GET(ecode,1);
1198 ph10 733 if (md->start_match_ptr < next &&
1199 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1200     rrc = MATCH_NOMATCH;
1201 ph10 733 }
1202 ph10 716
1203     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204 ph10 604 ecode += GET(ecode, 1);
1205 ph10 625 if (*ecode != OP_ALT) break;
1206 ph10 604 }
1207 ph10 625
1208     if (matched_once || allow_zero)
1209 ph10 604 {
1210     ecode += 1 + LINK_SIZE;
1211     break;
1212 ph10 625 }
1213 ph10 604 RRETURN(MATCH_NOMATCH);
1214    
1215     /* Control never reaches here. */
1216    
1217 nigel 77 /* Conditional group: compilation checked that there are no more than
1218     two branches. If the condition is false, skipping the first branch takes us
1219     past the end if there is only one branch, but that's OK because that is
1220 ph10 609 exactly what going to the ket would do. */
1221 nigel 77
1222     case OP_COND:
1223 nigel 93 case OP_SCOND:
1224 ph10 604 codelink = GET(ecode, 1);
1225 ph10 406
1226 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1227     inserted between OP_COND and an assertion condition. */
1228 ph10 392
1229 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1230     {
1231 ph10 836 if (PUBL(callout) != NULL)
1232 ph10 381 {
1233 zherczeg 850 PUBL(callout_block) cb;
1234 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1235 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1236     cb.offset_vector = md->offset_vector;
1237 zherczeg 852 #ifdef COMPILE_PCRE8
1238 ph10 381 cb.subject = (PCRE_SPTR)md->start_subject;
1239 zherczeg 852 #else
1240     cb.subject = (PCRE_SPTR16)md->start_subject;
1241     #endif
1242 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1243     cb.start_match = (int)(mstart - md->start_subject);
1244     cb.current_position = (int)(eptr - md->start_subject);
1245 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1246     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1247     cb.capture_top = offset_top/2;
1248     cb.capture_last = md->capture_last;
1249     cb.callout_data = md->callout_data;
1250 ph10 836 cb.mark = md->nomatch_mark;
1251     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1252 ph10 381 if (rrc < 0) RRETURN(rrc);
1253     }
1254 ph10 836 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1255 ph10 381 }
1256 ph10 392
1257 ph10 399 condcode = ecode[LINK_SIZE+1];
1258 ph10 406
1259 ph10 381 /* Now see what the actual condition is */
1260 ph10 392
1261 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1262 nigel 77 {
1263 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1264     {
1265 ph10 461 condition = FALSE;
1266     ecode += GET(ecode, 1);
1267     }
1268 ph10 459 else
1269 ph10 461 {
1270 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1271 ph10 751 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1272 ph10 461
1273 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1274     false, but the test was set up by name, scan the table to see if the
1275     name refers to any other numbers, and test them. The condition is true
1276     if any one is set. */
1277 ph10 461
1278 ph10 751 if (!condition && condcode == OP_NRREF)
1279 ph10 459 {
1280 ph10 836 pcre_uchar *slotA = md->name_table;
1281 ph10 459 for (i = 0; i < md->name_count; i++)
1282 ph10 461 {
1283     if (GET2(slotA, 0) == recno) break;
1284 ph10 459 slotA += md->name_entry_size;
1285     }
1286 ph10 461
1287 ph10 459 /* Found a name for the number - there can be only one; duplicate
1288     names for different numbers are allowed, but not vice versa. First
1289     scan down for duplicates. */
1290 ph10 461
1291 ph10 459 if (i < md->name_count)
1292 ph10 461 {
1293 ph10 836 pcre_uchar *slotB = slotA;
1294 ph10 459 while (slotB > md->name_table)
1295     {
1296     slotB -= md->name_entry_size;
1297 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1298 ph10 459 {
1299     condition = GET2(slotB, 0) == md->recursive->group_num;
1300 ph10 461 if (condition) break;
1301     }
1302 ph10 459 else break;
1303 ph10 461 }
1304    
1305 ph10 459 /* Scan up for duplicates */
1306 ph10 461
1307 ph10 459 if (!condition)
1308 ph10 461 {
1309 ph10 459 slotB = slotA;
1310     for (i++; i < md->name_count; i++)
1311     {
1312     slotB += md->name_entry_size;
1313 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1314 ph10 459 {
1315     condition = GET2(slotB, 0) == md->recursive->group_num;
1316     if (condition) break;
1317 ph10 461 }
1318 ph10 459 else break;
1319 ph10 461 }
1320     }
1321 ph10 459 }
1322 ph10 461 }
1323    
1324 ph10 459 /* Choose branch according to the condition */
1325 ph10 461
1326 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1327 ph10 459 }
1328 ph10 461 }
1329 nigel 93
1330 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1331 nigel 93 {
1332 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1333 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1334 ph10 461
1335 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1336 ph10 461 scan the table to see if the name refers to any other numbers, and test
1337     them. The condition is true if any one is set. This is tediously similar
1338     to the code above, but not close enough to try to amalgamate. */
1339    
1340 ph10 459 if (!condition && condcode == OP_NCREF)
1341     {
1342 ph10 461 int refno = offset >> 1;
1343 ph10 836 pcre_uchar *slotA = md->name_table;
1344 ph10 461
1345 ph10 459 for (i = 0; i < md->name_count; i++)
1346 ph10 461 {
1347     if (GET2(slotA, 0) == refno) break;
1348 ph10 459 slotA += md->name_entry_size;
1349     }
1350 ph10 461
1351     /* Found a name for the number - there can be only one; duplicate names
1352     for different numbers are allowed, but not vice versa. First scan down
1353 ph10 459 for duplicates. */
1354 ph10 461
1355 ph10 459 if (i < md->name_count)
1356 ph10 461 {
1357 ph10 836 pcre_uchar *slotB = slotA;
1358 ph10 459 while (slotB > md->name_table)
1359     {
1360     slotB -= md->name_entry_size;
1361 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1362 ph10 459 {
1363     offset = GET2(slotB, 0) << 1;
1364 ph10 461 condition = offset < offset_top &&
1365 ph10 459 md->offset_vector[offset] >= 0;
1366 ph10 461 if (condition) break;
1367     }
1368 ph10 459 else break;
1369 ph10 461 }
1370    
1371 ph10 459 /* Scan up for duplicates */
1372 ph10 461
1373 ph10 459 if (!condition)
1374 ph10 461 {
1375 ph10 459 slotB = slotA;
1376     for (i++; i < md->name_count; i++)
1377     {
1378     slotB += md->name_entry_size;
1379 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1380 ph10 459 {
1381     offset = GET2(slotB, 0) << 1;
1382 ph10 461 condition = offset < offset_top &&
1383 ph10 459 md->offset_vector[offset] >= 0;
1384 ph10 461 if (condition) break;
1385     }
1386 ph10 459 else break;
1387 ph10 461 }
1388     }
1389 ph10 459 }
1390 ph10 461 }
1391    
1392 ph10 459 /* Choose branch according to the condition */
1393    
1394 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1395 nigel 77 }
1396    
1397 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1398 nigel 93 {
1399     condition = FALSE;
1400     ecode += GET(ecode, 1);
1401     }
1402    
1403 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1404 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1405     an assertion. */
1406 nigel 77
1407     else
1408     {
1409 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1410 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1411 nigel 77 if (rrc == MATCH_MATCH)
1412     {
1413 ph10 619 if (md->end_offset_top > offset_top)
1414     offset_top = md->end_offset_top; /* Captures may have happened */
1415 nigel 93 condition = TRUE;
1416     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1417 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1418     }
1419 ph10 733
1420 ph10 716 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1421 ph10 733 assertion; it is therefore treated as NOMATCH. */
1422 ph10 716
1423 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1424 nigel 77 {
1425     RRETURN(rrc); /* Need braces because of following else */
1426     }
1427 nigel 93 else
1428     {
1429     condition = FALSE;
1430 ph10 399 ecode += codelink;
1431 nigel 93 }
1432     }
1433 nigel 91
1434 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1435     use tail recursion to avoid using another stack frame, except when there is
1436     unlimited repeat of a possibly empty group. In the latter case, a recursive
1437     call to match() is always required, unless the second alternative doesn't
1438     exist, in which case we can just plough on. Note that, for compatibility
1439     with Perl, the | in a conditional group is NOT treated as creating two
1440     alternatives. If a THEN is encountered in the branch, it propagates out to
1441     the enclosing alternative (unless nested in a deeper set of alternatives,
1442     of course). */
1443 nigel 91
1444 nigel 93 if (condition || *ecode == OP_ALT)
1445     {
1446 ph10 716 if (op != OP_SCOND)
1447 ph10 702 {
1448     ecode += 1 + LINK_SIZE;
1449     goto TAIL_RECURSE;
1450 ph10 708 }
1451 ph10 733
1452 ph10 716 md->match_function_type = MATCH_CBEGROUP;
1453 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1454     RRETURN(rrc);
1455 nigel 77 }
1456 ph10 708
1457 ph10 702 /* Condition false & no alternative; continue after the group. */
1458 ph10 708
1459 ph10 702 else
1460 nigel 93 {
1461     ecode += 1 + LINK_SIZE;
1462     }
1463     break;
1464 nigel 77
1465 ph10 461
1466 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1467     to close any currently open capturing brackets. */
1468 ph10 461
1469 ph10 447 case OP_CLOSE:
1470 ph10 461 number = GET2(ecode, 1);
1471 ph10 447 offset = number << 1;
1472 ph10 461
1473 ph10 475 #ifdef PCRE_DEBUG
1474 ph10 447 printf("end bracket %d at *ACCEPT", number);
1475     printf("\n");
1476     #endif
1477 nigel 77
1478 ph10 447 md->capture_last = number;
1479     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1480     {
1481     md->offset_vector[offset] =
1482     md->offset_vector[md->offset_end - number];
1483 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1484 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1485     }
1486 ph10 836 ecode += 1 + IMM2_SIZE;
1487 ph10 461 break;
1488 ph10 447
1489    
1490 ph10 619 /* End of the pattern, either real or forced. */
1491 nigel 77
1492 ph10 619 case OP_END:
1493 ph10 210 case OP_ACCEPT:
1494 ph10 625 case OP_ASSERT_ACCEPT:
1495    
1496 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1497     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1498 ph10 613 is set and we have matched at the start of the subject. In both cases,
1499     backtracking will then try other alternatives, if any. */
1500 ph10 443
1501 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1502 ph10 618 md->recursive == NULL &&
1503 ph10 619 (md->notempty ||
1504     (md->notempty_atstart &&
1505     mstart == md->start_subject + md->start_offset)))
1506 ph10 836 RRETURN(MATCH_NOMATCH);
1507 ph10 443
1508 ph10 442 /* Otherwise, we have a match. */
1509 ph10 625
1510 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1511     md->end_offset_top = offset_top; /* and how many extracts were taken */
1512 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1513 nigel 77
1514 ph10 512 /* For some reason, the macros don't work properly if an expression is
1515 ph10 836 given as the argument to RRETURN when the heap is in use. */
1516 ph10 512
1517     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1518 ph10 836 RRETURN(rrc);
1519 ph10 512
1520 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1521     matching won't pass the KET for an assertion. If any one branch matches,
1522     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1523     start of each branch to move the current point backwards, so the code at
1524 ph10 625 this level is identical to the lookahead case. When the assertion is part
1525     of a condition, we want to return immediately afterwards. The caller of
1526     this incarnation of the match() function will have set MATCH_CONDASSERT in
1527     md->match_function_type, and one of these opcodes will be the first opcode
1528     that is processed. We use a local variable that is preserved over calls to
1529 ph10 604 match() to remember this case. */
1530 nigel 77
1531     case OP_ASSERT:
1532     case OP_ASSERTBACK:
1533 ph10 888 save_mark = md->mark;
1534 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1535     {
1536     condassert = TRUE;
1537     md->match_function_type = 0;
1538     }
1539 ph10 625 else condassert = FALSE;
1540    
1541 nigel 77 do
1542     {
1543 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1544 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1545 ph10 500 {
1546     mstart = md->start_match_ptr; /* In case \K reset it */
1547     break;
1548 ph10 501 }
1549 ph10 733
1550     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1551 ph10 716 as NOMATCH. */
1552 ph10 733
1553 ph10 716 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1554 nigel 77 ecode += GET(ecode, 1);
1555 ph10 888 md->mark = save_mark;
1556 nigel 77 }
1557     while (*ecode == OP_ALT);
1558 ph10 625
1559 ph10 836 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1560 nigel 77
1561     /* If checking an assertion for a condition, return MATCH_MATCH. */
1562    
1563 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1564 nigel 77
1565     /* Continue from after the assertion, updating the offsets high water
1566     mark, since extracts may have been taken during the assertion. */
1567    
1568     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1569     ecode += 1 + LINK_SIZE;
1570     offset_top = md->end_offset_top;
1571     continue;
1572    
1573 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1574 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1575 ph10 473 branches. */
1576 nigel 77
1577     case OP_ASSERT_NOT:
1578     case OP_ASSERTBACK_NOT:
1579 ph10 888 save_mark = md->mark;
1580 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1581     {
1582     condassert = TRUE;
1583     md->match_function_type = 0;
1584     }
1585 ph10 625 else condassert = FALSE;
1586 ph10 604
1587 nigel 77 do
1588     {
1589 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1590 ph10 888 md->mark = save_mark;
1591 ph10 836 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1592 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1593     {
1594     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1595 ph10 482 break;
1596     }
1597 ph10 716
1598 ph10 733 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1599 ph10 716 as NOMATCH. */
1600    
1601     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1602 nigel 77 ecode += GET(ecode,1);
1603     }
1604     while (*ecode == OP_ALT);
1605    
1606 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1607 ph10 625
1608 nigel 77 ecode += 1 + LINK_SIZE;
1609     continue;
1610    
1611     /* Move the subject pointer back. This occurs only at the start of
1612     each branch of a lookbehind assertion. If we are too close to the start to
1613     move back, this match function fails. When working with UTF-8 we move
1614     back a number of characters, not bytes. */
1615    
1616     case OP_REVERSE:
1617 ph10 836 #ifdef SUPPORT_UTF
1618     if (utf)
1619 nigel 77 {
1620 nigel 93 i = GET(ecode, 1);
1621     while (i-- > 0)
1622 nigel 77 {
1623     eptr--;
1624 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1625 ph10 207 BACKCHAR(eptr);
1626 nigel 77 }
1627     }
1628     else
1629     #endif
1630    
1631     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1632    
1633     {
1634 nigel 93 eptr -= GET(ecode, 1);
1635 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1636 nigel 77 }
1637    
1638 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1639 nigel 77
1640 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1641 nigel 77 ecode += 1 + LINK_SIZE;
1642     break;
1643    
1644     /* The callout item calls an external function, if one is provided, passing
1645     details of the match so far. This is mainly for debugging, though the
1646     function is able to force a failure. */
1647    
1648     case OP_CALLOUT:
1649 ph10 836 if (PUBL(callout) != NULL)
1650 nigel 77 {
1651 zherczeg 850 PUBL(callout_block) cb;
1652 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1653 nigel 77 cb.callout_number = ecode[1];
1654     cb.offset_vector = md->offset_vector;
1655 zherczeg 852 #ifdef COMPILE_PCRE8
1656 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1657 zherczeg 852 #else
1658     cb.subject = (PCRE_SPTR16)md->start_subject;
1659     #endif
1660 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1661     cb.start_match = (int)(mstart - md->start_subject);
1662     cb.current_position = (int)(eptr - md->start_subject);
1663 nigel 77 cb.pattern_position = GET(ecode, 2);
1664     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1665     cb.capture_top = offset_top/2;
1666     cb.capture_last = md->capture_last;
1667     cb.callout_data = md->callout_data;
1668 ph10 836 cb.mark = md->nomatch_mark;
1669     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1670 nigel 77 if (rrc < 0) RRETURN(rrc);
1671     }
1672     ecode += 2 + 2*LINK_SIZE;
1673     break;
1674    
1675     /* Recursion either matches the current regex, or some subexpression. The
1676     offset data is the offset to the starting bracket from the start of the
1677     whole pattern. (This is so that it works from duplicated subpatterns.)
1678 ph10 625
1679 ph10 618 The state of the capturing groups is preserved over recursion, and
1680 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1681 ph10 618 finished (offset_top records the completed total) so we just have to save
1682     all the potential data. There may be up to 65535 such values, which is too
1683     large to put on the stack, but using malloc for small numbers seems
1684     expensive. As a compromise, the stack is used when there are no more than
1685     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1686 nigel 77
1687     There are also other values that have to be saved. We use a chained
1688     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1689 ph10 625 for the original version of this logic. It has, however, been hacked around
1690 ph10 618 a lot, so he is not to blame for the current way it works. */
1691 nigel 77
1692     case OP_RECURSE:
1693     {
1694 ph10 642 recursion_info *ri;
1695     int recno;
1696 ph10 654
1697 nigel 77 callpat = md->start_code + GET(ecode, 1);
1698 ph10 642 recno = (callpat == md->start_code)? 0 :
1699 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1700    
1701     /* Check for repeating a recursion without advancing the subject pointer.
1702 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1703 ph10 654 caught at compile time.) */
1704    
1705 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1706 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1707 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1708 nigel 77
1709     /* Add to "recursing stack" */
1710    
1711 ph10 642 new_recursive.group_num = recno;
1712     new_recursive.subject_position = eptr;
1713 nigel 77 new_recursive.prevrec = md->recursive;
1714     md->recursive = &new_recursive;
1715    
1716 ph10 618 /* Where to continue from afterwards */
1717 nigel 77
1718     ecode += 1 + LINK_SIZE;
1719    
1720 ph10 618 /* Now save the offset data */
1721 nigel 77
1722     new_recursive.saved_max = md->offset_end;
1723     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1724     new_recursive.offset_save = stacksave;
1725     else
1726     {
1727     new_recursive.offset_save =
1728 ph10 836 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1729 nigel 77 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1730     }
1731     memcpy(new_recursive.offset_save, md->offset_vector,
1732     new_recursive.saved_max * sizeof(int));
1733 ph10 625
1734 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1735 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1736 ph10 618 might be changed, so reset it before looping. */
1737 nigel 77
1738     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1739 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1740 nigel 77 do
1741     {
1742 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1743 ph10 836 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1744 ph10 604 md, eptrb, RM6);
1745 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1746     new_recursive.saved_max * sizeof(int));
1747 ph10 681 md->recursive = new_recursive.prevrec;
1748 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1749 nigel 77 {
1750 nigel 87 DPRINTF(("Recursion matched\n"));
1751 nigel 77 if (new_recursive.offset_save != stacksave)
1752 ph10 836 (PUBL(free))(new_recursive.offset_save);
1753 ph10 618
1754     /* Set where we got to in the subject, and reset the start in case
1755 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1756     for Perl compatibility. */
1757    
1758 ph10 618 eptr = md->end_match_ptr;
1759     mstart = md->start_match_ptr;
1760     goto RECURSION_MATCHED; /* Exit loop; end processing */
1761 nigel 77 }
1762 ph10 716
1763     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1764     as NOMATCH. */
1765    
1766 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1767 nigel 87 {
1768     DPRINTF(("Recursion gave error %d\n", rrc));
1769 ph10 400 if (new_recursive.offset_save != stacksave)
1770 ph10 836 (PUBL(free))(new_recursive.offset_save);
1771 nigel 87 RRETURN(rrc);
1772     }
1773 nigel 77
1774     md->recursive = &new_recursive;
1775     callpat += GET(callpat, 1);
1776     }
1777     while (*callpat == OP_ALT);
1778    
1779     DPRINTF(("Recursion didn't match\n"));
1780     md->recursive = new_recursive.prevrec;
1781     if (new_recursive.offset_save != stacksave)
1782 ph10 836 (PUBL(free))(new_recursive.offset_save);
1783     RRETURN(MATCH_NOMATCH);
1784 nigel 77 }
1785 ph10 625
1786 ph10 618 RECURSION_MATCHED:
1787     break;
1788 nigel 77
1789     /* An alternation is the end of a branch; scan along to find the end of the
1790     bracketed group and go to there. */
1791    
1792     case OP_ALT:
1793     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1794     break;
1795    
1796 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1797     indicating that it may occur zero times. It may repeat infinitely, or not
1798     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1799     with fixed upper repeat limits are compiled as a number of copies, with the
1800     optional ones preceded by BRAZERO or BRAMINZERO. */
1801 ph10 625
1802 nigel 77 case OP_BRAZERO:
1803 ph10 604 next = ecode + 1;
1804     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1805     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1806     do next += GET(next, 1); while (*next == OP_ALT);
1807     ecode = next + 1 + LINK_SIZE;
1808 nigel 77 break;
1809 ph10 625
1810 nigel 77 case OP_BRAMINZERO:
1811 ph10 604 next = ecode + 1;
1812     do next += GET(next, 1); while (*next == OP_ALT);
1813     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1814     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815     ecode++;
1816 nigel 77 break;
1817    
1818 ph10 335 case OP_SKIPZERO:
1819 ph10 604 next = ecode+1;
1820     do next += GET(next,1); while (*next == OP_ALT);
1821     ecode = next + 1 + LINK_SIZE;
1822 ph10 335 break;
1823 ph10 625
1824 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1825     here; just jump to the group, with allow_zero set TRUE. */
1826 ph10 625
1827 ph10 604 case OP_BRAPOSZERO:
1828 ph10 625 op = *(++ecode);
1829 ph10 604 allow_zero = TRUE;
1830     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1831     goto POSSESSIVE_NON_CAPTURE;
1832 ph10 335
1833 nigel 93 /* End of a group, repeated or non-repeating. */
1834 nigel 77
1835     case OP_KET:
1836     case OP_KETRMIN:
1837     case OP_KETRMAX:
1838 ph10 625 case OP_KETRPOS:
1839 nigel 91 prev = ecode - GET(ecode, 1);
1840 ph10 625
1841 nigel 93 /* If this was a group that remembered the subject start, in order to break
1842     infinite repeats of empty string matches, retrieve the subject start from
1843     the chain. Otherwise, set it NULL. */
1844 nigel 77
1845 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1846 nigel 93 {
1847     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1848     eptrb = eptrb->epb_prev; /* Backup to previous group */
1849     }
1850     else saved_eptr = NULL;
1851 nigel 77
1852 ph10 733 /* If we are at the end of an assertion group or a non-capturing atomic
1853 ph10 723 group, stop matching and return MATCH_MATCH, but record the current high
1854     water mark for use by positive assertions. We also need to record the match
1855     start in case it was changed by \K. */
1856 nigel 93
1857 ph10 723 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1858 ph10 733 *prev == OP_ONCE_NC)
1859 nigel 91 {
1860 ph10 723 md->end_match_ptr = eptr; /* For ONCE_NC */
1861 nigel 91 md->end_offset_top = offset_top;
1862 ph10 500 md->start_match_ptr = mstart;
1863 ph10 836 RRETURN(MATCH_MATCH); /* Sets md->mark */
1864 nigel 91 }
1865 nigel 77
1866 nigel 93 /* For capturing groups we have to check the group number back at the start
1867     and if necessary complete handling an extraction by setting the offsets and
1868 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1869     into group 0, so it won't be picked up here. Instead, we catch it when the
1870     OP_END is reached. Other recursion is handled here. We just have to record
1871     the current subject position and start match pointer and give a MATCH
1872     return. */
1873 nigel 77
1874 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1875     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1876 nigel 91 {
1877 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1878 nigel 91 offset = number << 1;
1879 ph10 461
1880 ph10 475 #ifdef PCRE_DEBUG
1881 nigel 91 printf("end bracket %d", number);
1882     printf("\n");
1883 nigel 77 #endif
1884    
1885 ph10 618 /* Handle a recursively called group. */
1886    
1887     if (md->recursive != NULL && md->recursive->group_num == number)
1888     {
1889     md->end_match_ptr = eptr;
1890     md->start_match_ptr = mstart;
1891     RRETURN(MATCH_MATCH);
1892     }
1893    
1894     /* Deal with capturing */
1895    
1896 nigel 93 md->capture_last = number;
1897     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1898 nigel 91 {
1899 ph10 625 /* If offset is greater than offset_top, it means that we are
1900     "skipping" a capturing group, and that group's offsets must be marked
1901     unset. In earlier versions of PCRE, all the offsets were unset at the
1902     start of matching, but this doesn't work because atomic groups and
1903 ph10 615 assertions can cause a value to be set that should later be unset.
1904     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1905 ph10 625 part of the atomic group, but this is not on the final matching path,
1906     so must be unset when 2 is set. (If there is no group 2, there is no
1907 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1908 ph10 625
1909 ph10 615 if (offset > offset_top)
1910     {
1911     register int *iptr = md->offset_vector + offset_top;
1912     register int *iend = md->offset_vector + offset;
1913     while (iptr < iend) *iptr++ = -1;
1914 ph10 625 }
1915    
1916 ph10 615 /* Now make the extraction */
1917    
1918 nigel 93 md->offset_vector[offset] =
1919     md->offset_vector[md->offset_end - number];
1920 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1921 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1922     }
1923 nigel 91 }
1924 nigel 77
1925 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1926     also happens for a repeating ket if no characters were matched in the
1927     group. This is the forcible breaking of infinite loops as implemented in
1928 ph10 723 Perl 5.005. For a non-repeating atomic group that includes captures,
1929     establish a backup point by processing the rest of the pattern at a lower
1930     level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1931     original OP_ONCE level, thereby bypassing intermediate backup points, but
1932     resetting any captures that happened along the way. */
1933 nigel 77
1934 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1935     {
1936 ph10 618 if (*prev == OP_ONCE)
1937     {
1938     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1939     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1940     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1941 ph10 625 RRETURN(MATCH_ONCE);
1942     }
1943 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1944 nigel 91 break;
1945     }
1946 ph10 625
1947     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1948 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1949     at a time from the outer level, thus saving stack. */
1950 ph10 625
1951 ph10 604 if (*ecode == OP_KETRPOS)
1952 ph10 625 {
1953 ph10 604 md->end_match_ptr = eptr;
1954 ph10 625 md->end_offset_top = offset_top;
1955 ph10 604 RRETURN(MATCH_KETRPOS);
1956 ph10 625 }
1957 nigel 77
1958 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1959     the preceding bracket, in the appropriate order. In the second case, we can
1960     use tail recursion to avoid using another stack frame, unless we have an
1961 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1962     string. */
1963 nigel 77
1964 nigel 91 if (*ecode == OP_KETRMIN)
1965     {
1966 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1967 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 ph10 618 if (*prev == OP_ONCE)
1969     {
1970 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1971 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1973 ph10 625 RRETURN(MATCH_ONCE);
1974     }
1975 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1976 ph10 197 {
1977 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1978 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1979 ph10 197 RRETURN(rrc);
1980     }
1981 nigel 91 ecode = prev;
1982     goto TAIL_RECURSE;
1983 nigel 77 }
1984 nigel 91 else /* OP_KETRMAX */
1985     {
1986 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1987 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1988 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1989 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1990 ph10 618 if (*prev == OP_ONCE)
1991     {
1992 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1993 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1994     md->once_target = prev;
1995 ph10 625 RRETURN(MATCH_ONCE);
1996     }
1997 nigel 91 ecode += 1 + LINK_SIZE;
1998     goto TAIL_RECURSE;
1999     }
2000     /* Control never gets here */
2001 nigel 77
2002 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
2003 nigel 77
2004     case OP_CIRC:
2005 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2006 ph10 625
2007 nigel 77 /* Start of subject assertion */
2008    
2009     case OP_SOD:
2010 ph10 836 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2011 nigel 77 ecode++;
2012     break;
2013 ph10 625
2014 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
2015 nigel 77
2016 ph10 602 case OP_CIRCM:
2017 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2018 ph10 602 if (eptr != md->start_subject &&
2019     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2020 ph10 836 RRETURN(MATCH_NOMATCH);
2021 ph10 602 ecode++;
2022     break;
2023    
2024 nigel 77 /* Start of match assertion */
2025    
2026     case OP_SOM:
2027 ph10 836 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2028 nigel 77 ecode++;
2029     break;
2030 ph10 172
2031 ph10 168 /* Reset the start of match point */
2032 ph10 172
2033 ph10 168 case OP_SET_SOM:
2034     mstart = eptr;
2035 ph10 172 ecode++;
2036     break;
2037 nigel 77
2038 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
2039     unless noteol is set. */
2040 nigel 77
2041 ph10 602 case OP_DOLLM:
2042     if (eptr < md->end_subject)
2043 ph10 836 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2044 ph10 602 else
2045 nigel 77 {
2046 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2047 ph10 602 SCHECK_PARTIAL();
2048 nigel 77 }
2049 ph10 602 ecode++;
2050     break;
2051 ph10 579
2052 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
2053 ph10 602 subject unless noteol is set. */
2054    
2055     case OP_DOLL:
2056 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2057 ph10 602 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2058    
2059 nigel 91 /* ... else fall through for endonly */
2060 nigel 77
2061     /* End of subject assertion (\z) */
2062    
2063     case OP_EOD:
2064 ph10 836 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2065 ph10 553 SCHECK_PARTIAL();
2066 nigel 77 ecode++;
2067     break;
2068    
2069     /* End of subject or ending \n assertion (\Z) */
2070    
2071     case OP_EODN:
2072 ph10 553 ASSERT_NL_OR_EOS:
2073     if (eptr < md->end_subject &&
2074 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2075 ph10 836 RRETURN(MATCH_NOMATCH);
2076 ph10 579
2077 ph10 553 /* Either at end of string or \n before end. */
2078 ph10 579
2079 ph10 553 SCHECK_PARTIAL();
2080 nigel 77 ecode++;
2081     break;
2082    
2083     /* Word boundary assertions */
2084    
2085     case OP_NOT_WORD_BOUNDARY:
2086     case OP_WORD_BOUNDARY:
2087     {
2088    
2089     /* Find out if the previous and current characters are "word" characters.
2090     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2091 ph10 443 be "non-word" characters. Remember the earliest consulted character for
2092 ph10 435 partial matching. */
2093 nigel 77
2094 ph10 836 #ifdef SUPPORT_UTF
2095     if (utf)
2096 nigel 77 {
2097 ph10 518 /* Get status of previous character */
2098 ph10 527
2099 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
2100     {
2101 ph10 836 PCRE_PUCHAR lastptr = eptr - 1;
2102     BACKCHAR(lastptr);
2103 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2104 nigel 77 GETCHAR(c, lastptr);
2105 ph10 527 #ifdef SUPPORT_UCP
2106 ph10 518 if (md->use_ucp)
2107     {
2108     if (c == '_') prev_is_word = TRUE; else
2109 ph10 527 {
2110 ph10 518 int cat = UCD_CATEGORY(c);
2111     prev_is_word = (cat == ucp_L || cat == ucp_N);
2112 ph10 527 }
2113     }
2114     else
2115     #endif
2116 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2117     }
2118 ph10 527
2119 ph10 518 /* Get status of next character */
2120 ph10 527
2121 ph10 443 if (eptr >= md->end_subject)
2122 nigel 77 {
2123 ph10 443 SCHECK_PARTIAL();
2124     cur_is_word = FALSE;
2125 ph10 428 }
2126     else
2127     {
2128 nigel 77 GETCHAR(c, eptr);
2129 ph10 527 #ifdef SUPPORT_UCP
2130 ph10 518 if (md->use_ucp)
2131     {
2132     if (c == '_') cur_is_word = TRUE; else
2133 ph10 527 {
2134 ph10 518 int cat = UCD_CATEGORY(c);
2135     cur_is_word = (cat == ucp_L || cat == ucp_N);
2136 ph10 527 }
2137     }
2138     else
2139     #endif
2140 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2141     }
2142     }
2143     else
2144     #endif
2145    
2146 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2147 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2148 nigel 77
2149     {
2150 ph10 518 /* Get status of previous character */
2151 ph10 527
2152 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2153     {
2154 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2155 ph10 527 #ifdef SUPPORT_UCP
2156 ph10 518 if (md->use_ucp)
2157     {
2158 ph10 527 c = eptr[-1];
2159 ph10 518 if (c == '_') prev_is_word = TRUE; else
2160 ph10 527 {
2161 ph10 518 int cat = UCD_CATEGORY(c);
2162     prev_is_word = (cat == ucp_L || cat == ucp_N);
2163 ph10 527 }
2164     }
2165     else
2166     #endif
2167 ph10 836 prev_is_word = MAX_255(eptr[-1])
2168     && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2169 ph10 435 }
2170 ph10 527
2171 ph10 518 /* Get status of next character */
2172 ph10 527
2173 ph10 443 if (eptr >= md->end_subject)
2174 ph10 428 {
2175 ph10 443 SCHECK_PARTIAL();
2176     cur_is_word = FALSE;
2177 ph10 428 }
2178 ph10 527 else
2179     #ifdef SUPPORT_UCP
2180 ph10 518 if (md->use_ucp)
2181     {
2182 ph10 527 c = *eptr;
2183 ph10 518 if (c == '_') cur_is_word = TRUE; else
2184 ph10 527 {
2185 ph10 518 int cat = UCD_CATEGORY(c);
2186     cur_is_word = (cat == ucp_L || cat == ucp_N);
2187 ph10 527 }
2188     }
2189     else
2190     #endif
2191 ph10 836 cur_is_word = MAX_255(*eptr)
2192     && ((md->ctypes[*eptr] & ctype_word) != 0);
2193 nigel 77 }
2194    
2195     /* Now see if the situation is what we want */
2196    
2197     if ((*ecode++ == OP_WORD_BOUNDARY)?
2198     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2199 ph10 836 RRETURN(MATCH_NOMATCH);
2200 nigel 77 }
2201     break;
2202    
2203     /* Match a single character type; inline for speed */
2204    
2205     case OP_ANY:
2206 ph10 836 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2207 ph10 345 /* Fall through */
2208    
2209 ph10 341 case OP_ALLANY:
2210 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2211     { /* not be updated before SCHECK_PARTIAL. */
2212 ph10 443 SCHECK_PARTIAL();
2213 ph10 836 RRETURN(MATCH_NOMATCH);
2214 ph10 443 }
2215 ph10 648 eptr++;
2216 ph10 836 #ifdef SUPPORT_UTF
2217     if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2218     #endif
2219 nigel 77 ecode++;
2220     break;
2221    
2222     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2223     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2224    
2225     case OP_ANYBYTE:
2226 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2227     { /* not be updated before SCHECK_PARTIAL. */
2228 ph10 443 SCHECK_PARTIAL();
2229 ph10 836 RRETURN(MATCH_NOMATCH);
2230 ph10 443 }
2231 ph10 654 eptr++;
2232 nigel 77 ecode++;
2233     break;
2234    
2235     case OP_NOT_DIGIT:
2236 ph10 443 if (eptr >= md->end_subject)
2237 ph10 428 {
2238 ph10 443 SCHECK_PARTIAL();
2239 ph10 836 RRETURN(MATCH_NOMATCH);
2240 ph10 443 }
2241 nigel 77 GETCHARINCTEST(c, eptr);
2242     if (
2243 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2244 nigel 77 c < 256 &&
2245     #endif
2246     (md->ctypes[c] & ctype_digit) != 0
2247     )
2248 ph10 836 RRETURN(MATCH_NOMATCH);
2249 nigel 77 ecode++;
2250     break;
2251    
2252     case OP_DIGIT:
2253 ph10 443 if (eptr >= md->end_subject)
2254 ph10 428 {
2255 ph10 443 SCHECK_PARTIAL();
2256 ph10 836 RRETURN(MATCH_NOMATCH);
2257 ph10 443 }
2258 nigel 77 GETCHARINCTEST(c, eptr);
2259     if (
2260 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2261     c > 255 ||
2262 nigel 77 #endif
2263     (md->ctypes[c] & ctype_digit) == 0
2264     )
2265 ph10 836 RRETURN(MATCH_NOMATCH);
2266 nigel 77 ecode++;
2267     break;
2268    
2269     case OP_NOT_WHITESPACE:
2270 ph10 443 if (eptr >= md->end_subject)
2271 ph10 428 {
2272 ph10 443 SCHECK_PARTIAL();
2273 ph10 836 RRETURN(MATCH_NOMATCH);
2274 ph10 443 }
2275 nigel 77 GETCHARINCTEST(c, eptr);
2276     if (
2277 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2278 nigel 77 c < 256 &&
2279     #endif
2280     (md->ctypes[c] & ctype_space) != 0
2281     )
2282 ph10 836 RRETURN(MATCH_NOMATCH);
2283 nigel 77 ecode++;
2284     break;
2285    
2286     case OP_WHITESPACE:
2287 ph10 443 if (eptr >= md->end_subject)
2288 ph10 428 {
2289 ph10 443 SCHECK_PARTIAL();
2290 ph10 836 RRETURN(MATCH_NOMATCH);
2291 ph10 443 }
2292 nigel 77 GETCHARINCTEST(c, eptr);
2293     if (
2294 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2295     c > 255 ||
2296 nigel 77 #endif
2297     (md->ctypes[c] & ctype_space) == 0
2298     )
2299 ph10 836 RRETURN(MATCH_NOMATCH);
2300 nigel 77 ecode++;
2301     break;
2302    
2303     case OP_NOT_WORDCHAR:
2304 ph10 443 if (eptr >= md->end_subject)
2305 ph10 428 {
2306 ph10 443 SCHECK_PARTIAL();
2307 ph10 836 RRETURN(MATCH_NOMATCH);
2308 ph10 443 }
2309 nigel 77 GETCHARINCTEST(c, eptr);
2310     if (
2311 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2312 nigel 77 c < 256 &&
2313     #endif
2314     (md->ctypes[c] & ctype_word) != 0
2315     )
2316 ph10 836 RRETURN(MATCH_NOMATCH);
2317 nigel 77 ecode++;
2318     break;
2319    
2320     case OP_WORDCHAR:
2321 ph10 443 if (eptr >= md->end_subject)
2322 ph10 428 {
2323 ph10 443 SCHECK_PARTIAL();
2324 ph10 836 RRETURN(MATCH_NOMATCH);
2325 ph10 443 }
2326 nigel 77 GETCHARINCTEST(c, eptr);
2327     if (
2328 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2329     c > 255 ||
2330 nigel 77 #endif
2331     (md->ctypes[c] & ctype_word) == 0
2332     )
2333 ph10 836 RRETURN(MATCH_NOMATCH);
2334 nigel 77 ecode++;
2335     break;
2336    
2337 nigel 93 case OP_ANYNL:
2338 ph10 443 if (eptr >= md->end_subject)
2339 ph10 428 {
2340 ph10 443 SCHECK_PARTIAL();
2341 ph10 836 RRETURN(MATCH_NOMATCH);
2342 ph10 443 }
2343 nigel 93 GETCHARINCTEST(c, eptr);
2344     switch(c)
2345     {
2346 ph10 836 default: RRETURN(MATCH_NOMATCH);
2347 ph10 625
2348 nigel 93 case 0x000d:
2349     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2350     break;
2351 ph10 231
2352 nigel 93 case 0x000a:
2353 ph10 231 break;
2354    
2355 nigel 93 case 0x000b:
2356     case 0x000c:
2357     case 0x0085:
2358     case 0x2028:
2359     case 0x2029:
2360 ph10 836 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2361 nigel 93 break;
2362     }
2363     ecode++;
2364     break;
2365    
2366 ph10 178 case OP_NOT_HSPACE:
2367 ph10 443 if (eptr >= md->end_subject)
2368 ph10 428 {
2369 ph10 443 SCHECK_PARTIAL();
2370 ph10 836 RRETURN(MATCH_NOMATCH);
2371 ph10 443 }
2372 ph10 178 GETCHARINCTEST(c, eptr);
2373     switch(c)
2374     {
2375     default: break;
2376     case 0x09: /* HT */
2377     case 0x20: /* SPACE */
2378     case 0xa0: /* NBSP */
2379     case 0x1680: /* OGHAM SPACE MARK */
2380     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2381     case 0x2000: /* EN QUAD */
2382     case 0x2001: /* EM QUAD */
2383     case 0x2002: /* EN SPACE */
2384     case 0x2003: /* EM SPACE */
2385     case 0x2004: /* THREE-PER-EM SPACE */
2386     case 0x2005: /* FOUR-PER-EM SPACE */
2387     case 0x2006: /* SIX-PER-EM SPACE */
2388     case 0x2007: /* FIGURE SPACE */
2389     case 0x2008: /* PUNCTUATION SPACE */
2390     case 0x2009: /* THIN SPACE */
2391     case 0x200A: /* HAIR SPACE */
2392     case 0x202f: /* NARROW NO-BREAK SPACE */
2393     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2394     case 0x3000: /* IDEOGRAPHIC SPACE */
2395 ph10 836 RRETURN(MATCH_NOMATCH);
2396 ph10 178 }
2397     ecode++;
2398     break;
2399    
2400     case OP_HSPACE:
2401 ph10 443 if (eptr >= md->end_subject)
2402 ph10 428 {
2403 ph10 443 SCHECK_PARTIAL();
2404 ph10 836 RRETURN(MATCH_NOMATCH);
2405 ph10 443 }
2406 ph10 178 GETCHARINCTEST(c, eptr);
2407     switch(c)
2408     {
2409 ph10 836 default: RRETURN(MATCH_NOMATCH);
2410 ph10 178 case 0x09: /* HT */
2411     case 0x20: /* SPACE */
2412     case 0xa0: /* NBSP */
2413     case 0x1680: /* OGHAM SPACE MARK */
2414     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2415     case 0x2000: /* EN QUAD */
2416     case 0x2001: /* EM QUAD */
2417     case 0x2002: /* EN SPACE */
2418     case 0x2003: /* EM SPACE */
2419     case 0x2004: /* THREE-PER-EM SPACE */
2420     case 0x2005: /* FOUR-PER-EM SPACE */
2421     case 0x2006: /* SIX-PER-EM SPACE */
2422     case 0x2007: /* FIGURE SPACE */
2423     case 0x2008: /* PUNCTUATION SPACE */
2424     case 0x2009: /* THIN SPACE */
2425     case 0x200A: /* HAIR SPACE */
2426     case 0x202f: /* NARROW NO-BREAK SPACE */
2427     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2428     case 0x3000: /* IDEOGRAPHIC SPACE */
2429     break;
2430     }
2431     ecode++;
2432     break;
2433    
2434     case OP_NOT_VSPACE:
2435 ph10 443 if (eptr >= md->end_subject)
2436 ph10 428 {
2437 ph10 443 SCHECK_PARTIAL();
2438 ph10 836 RRETURN(MATCH_NOMATCH);
2439 ph10 443 }
2440 ph10 178 GETCHARINCTEST(c, eptr);
2441     switch(c)
2442     {
2443     default: break;
2444     case 0x0a: /* LF */
2445     case 0x0b: /* VT */
2446     case 0x0c: /* FF */
2447     case 0x0d: /* CR */
2448     case 0x85: /* NEL */
2449     case 0x2028: /* LINE SEPARATOR */
2450     case 0x2029: /* PARAGRAPH SEPARATOR */
2451 ph10 836 RRETURN(MATCH_NOMATCH);
2452 ph10 178 }
2453     ecode++;
2454     break;
2455    
2456     case OP_VSPACE:
2457 ph10 443 if (eptr >= md->end_subject)
2458 ph10 428 {
2459 ph10 443 SCHECK_PARTIAL();
2460 ph10 836 RRETURN(MATCH_NOMATCH);
2461 ph10 443 }
2462 ph10 178 GETCHARINCTEST(c, eptr);
2463     switch(c)
2464     {
2465 ph10 836 default: RRETURN(MATCH_NOMATCH);
2466 ph10 178 case 0x0a: /* LF */
2467     case 0x0b: /* VT */
2468     case 0x0c: /* FF */
2469     case 0x0d: /* CR */
2470     case 0x85: /* NEL */
2471     case 0x2028: /* LINE SEPARATOR */
2472     case 0x2029: /* PARAGRAPH SEPARATOR */
2473     break;
2474     }
2475     ecode++;
2476     break;
2477    
2478 nigel 77 #ifdef SUPPORT_UCP
2479     /* Check the next character by Unicode property. We will get here only
2480     if the support is in the binary; otherwise a compile-time error occurs. */
2481    
2482     case OP_PROP:
2483     case OP_NOTPROP:
2484 ph10 443 if (eptr >= md->end_subject)
2485 ph10 428 {
2486 ph10 443 SCHECK_PARTIAL();
2487 ph10 836 RRETURN(MATCH_NOMATCH);
2488 ph10 443 }
2489 nigel 77 GETCHARINCTEST(c, eptr);
2490     {
2491 ph10 384 const ucd_record *prop = GET_UCD(c);
2492 nigel 77
2493 nigel 87 switch(ecode[1])
2494     {
2495     case PT_ANY:
2496 ph10 836 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2497 nigel 87 break;
2498 nigel 77
2499 nigel 87 case PT_LAMP:
2500 ph10 349 if ((prop->chartype == ucp_Lu ||
2501     prop->chartype == ucp_Ll ||
2502     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2503 ph10 836 RRETURN(MATCH_NOMATCH);
2504 ph10 517 break;
2505 nigel 87
2506     case PT_GC:
2507 ph10 836 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2508     RRETURN(MATCH_NOMATCH);
2509 nigel 87 break;
2510    
2511     case PT_PC:
2512 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2513 ph10 836 RRETURN(MATCH_NOMATCH);
2514 nigel 87 break;
2515    
2516     case PT_SC:
2517 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2518 ph10 836 RRETURN(MATCH_NOMATCH);
2519 nigel 87 break;
2520 ph10 527
2521 ph10 517 /* These are specials */
2522 ph10 527
2523 ph10 517 case PT_ALNUM:
2524 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2525     PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2526     RRETURN(MATCH_NOMATCH);
2527 ph10 527 break;
2528    
2529 ph10 517 case PT_SPACE: /* Perl space */
2530 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2531 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2532     == (op == OP_NOTPROP))
2533 ph10 836 RRETURN(MATCH_NOMATCH);
2534 ph10 527 break;
2535    
2536 ph10 517 case PT_PXSPACE: /* POSIX space */
2537 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2538 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2539 ph10 517 c == CHAR_FF || c == CHAR_CR)
2540     == (op == OP_NOTPROP))
2541 ph10 836 RRETURN(MATCH_NOMATCH);
2542 ph10 527 break;
2543 nigel 87
2544 ph10 527 case PT_WORD:
2545 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2546     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2547 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2548 ph10 836 RRETURN(MATCH_NOMATCH);
2549 ph10 527 break;
2550    
2551 ph10 517 /* This should never occur */
2552    
2553 nigel 87 default:
2554     RRETURN(PCRE_ERROR_INTERNAL);
2555 nigel 77 }
2556 nigel 87
2557     ecode += 3;
2558 nigel 77 }
2559     break;
2560    
2561     /* Match an extended Unicode sequence. We will get here only if the support
2562     is in the binary; otherwise a compile-time error occurs. */
2563    
2564     case OP_EXTUNI:
2565 ph10 443 if (eptr >= md->end_subject)
2566 ph10 428 {
2567 ph10 443 SCHECK_PARTIAL();
2568 ph10 836 RRETURN(MATCH_NOMATCH);
2569 ph10 443 }
2570 nigel 77 GETCHARINCTEST(c, eptr);
2571 ph10 836 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2572 ph10 623 while (eptr < md->end_subject)
2573 nigel 77 {
2574 ph10 623 int len = 1;
2575 ph10 836 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2576 ph10 623 if (UCD_CATEGORY(c) != ucp_M) break;
2577     eptr += len;
2578 nigel 77 }
2579     ecode++;
2580     break;
2581     #endif
2582    
2583    
2584     /* Match a back reference, possibly repeatedly. Look past the end of the
2585     item to see if there is repeat information following. The code is similar
2586     to that for character classes, but repeated for efficiency. Then obey
2587     similar code to character type repeats - written out again for speed.
2588     However, if the referenced string is the empty string, always treat
2589     it as matched, any number of times (otherwise there could be infinite
2590     loops). */
2591    
2592     case OP_REF:
2593 ph10 625 case OP_REFI:
2594     caseless = op == OP_REFI;
2595 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2596 ph10 836 ecode += 1 + IMM2_SIZE;
2597 ph10 345
2598 ph10 595 /* If the reference is unset, there are two possibilities:
2599 ph10 345
2600 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2601     this ensures that every attempt at a match fails. We can't just fail
2602     here, because of the possibility of quantifiers with zero minima.
2603 ph10 345
2604 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2605     so that the back reference matches an empty string.
2606 ph10 345
2607 ph10 595 Otherwise, set the length to the length of what was matched by the
2608     referenced subpattern. */
2609 ph10 345
2610 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2611     length = (md->jscript_compat)? 0 : -1;
2612     else
2613     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2614 nigel 77
2615 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2616 nigel 77
2617 ph10 595 switch (*ecode)
2618     {
2619     case OP_CRSTAR:
2620     case OP_CRMINSTAR:
2621     case OP_CRPLUS:
2622     case OP_CRMINPLUS:
2623     case OP_CRQUERY:
2624     case OP_CRMINQUERY:
2625     c = *ecode++ - OP_CRSTAR;
2626     minimize = (c & 1) != 0;
2627     min = rep_min[c]; /* Pick up values from tables; */
2628     max = rep_max[c]; /* zero for max => infinity */
2629     if (max == 0) max = INT_MAX;
2630     break;
2631 nigel 77
2632 ph10 595 case OP_CRRANGE:
2633     case OP_CRMINRANGE:
2634     minimize = (*ecode == OP_CRMINRANGE);
2635     min = GET2(ecode, 1);
2636 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2637 ph10 595 if (max == 0) max = INT_MAX;
2638 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2639 ph10 595 break;
2640 nigel 77
2641 ph10 595 default: /* No repeat follows */
2642 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2643 ph10 595 {
2644     CHECK_PARTIAL();
2645 ph10 836 RRETURN(MATCH_NOMATCH);
2646 nigel 77 }
2647 ph10 595 eptr += length;
2648     continue; /* With the main loop */
2649     }
2650 nigel 77
2651 ph10 595 /* Handle repeated back references. If the length of the reference is
2652 ph10 836 zero, just continue with the main loop. If the length is negative, it
2653 ph10 842 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2654     zero, we can continue at the same level without recursion. For any other
2655 ph10 836 minimum, carrying on will result in NOMATCH. */
2656 ph10 443
2657 ph10 595 if (length == 0) continue;
2658 ph10 836 if (length < 0 && min == 0) continue;
2659 nigel 77
2660 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2661     the length of the reference string explicitly rather than passing the
2662     address of eptr, so that eptr can be a register variable. */
2663 nigel 77
2664 ph10 595 for (i = 1; i <= min; i++)
2665     {
2666 ph10 625 int slength;
2667 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2668 nigel 77 {
2669 ph10 595 CHECK_PARTIAL();
2670 ph10 836 RRETURN(MATCH_NOMATCH);
2671 nigel 77 }
2672 ph10 595 eptr += slength;
2673     }
2674 nigel 77
2675 ph10 595 /* If min = max, continue at the same level without recursion.
2676     They are not both allowed to be zero. */
2677 nigel 77
2678 ph10 595 if (min == max) continue;
2679 nigel 77
2680 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2681 nigel 77
2682 ph10 595 if (minimize)
2683     {
2684     for (fi = min;; fi++)
2685 nigel 77 {
2686 ph10 625 int slength;
2687 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2688 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2690 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2691 nigel 77 {
2692 ph10 595 CHECK_PARTIAL();
2693 ph10 836 RRETURN(MATCH_NOMATCH);
2694 nigel 77 }
2695 ph10 595 eptr += slength;
2696 nigel 77 }
2697 ph10 595 /* Control never gets here */
2698     }
2699 nigel 77
2700 ph10 595 /* If maximizing, find the longest string and work backwards */
2701 nigel 77
2702 ph10 595 else
2703     {
2704     pp = eptr;
2705     for (i = min; i < max; i++)
2706 nigel 77 {
2707 ph10 625 int slength;
2708 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2709 nigel 77 {
2710 ph10 595 CHECK_PARTIAL();
2711     break;
2712 nigel 77 }
2713 ph10 595 eptr += slength;
2714 nigel 77 }
2715 ph10 595 while (eptr >= pp)
2716     {
2717 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2718 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2719     eptr -= length;
2720     }
2721 ph10 836 RRETURN(MATCH_NOMATCH);
2722 nigel 77 }
2723     /* Control never gets here */
2724    
2725     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2726     used when all the characters in the class have values in the range 0-255,
2727     and either the matching is caseful, or the characters are in the range
2728     0-127 when UTF-8 processing is enabled. The only difference between
2729     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2730     encountered.
2731    
2732     First, look past the end of the item to see if there is repeat information
2733     following. Then obey similar code to character type repeats - written out
2734     again for speed. */
2735    
2736     case OP_NCLASS:
2737     case OP_CLASS:
2738     {
2739 ph10 836 /* The data variable is saved across frames, so the byte map needs to
2740     be stored there. */
2741     #define BYTE_MAP ((pcre_uint8 *)data)
2742 nigel 77 data = ecode + 1; /* Save for matching */
2743 ph10 836 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2744 nigel 77
2745     switch (*ecode)
2746     {
2747     case OP_CRSTAR:
2748     case OP_CRMINSTAR:
2749     case OP_CRPLUS:
2750     case OP_CRMINPLUS:
2751     case OP_CRQUERY:
2752     case OP_CRMINQUERY:
2753     c = *ecode++ - OP_CRSTAR;
2754     minimize = (c & 1) != 0;
2755     min = rep_min[c]; /* Pick up values from tables; */
2756     max = rep_max[c]; /* zero for max => infinity */
2757     if (max == 0) max = INT_MAX;
2758     break;
2759    
2760     case OP_CRRANGE:
2761     case OP_CRMINRANGE:
2762     minimize = (*ecode == OP_CRMINRANGE);
2763     min = GET2(ecode, 1);
2764 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2765 nigel 77 if (max == 0) max = INT_MAX;
2766 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2767 nigel 77 break;
2768    
2769     default: /* No repeat follows */
2770     min = max = 1;
2771     break;
2772     }
2773    
2774     /* First, ensure the minimum number of matches are present. */
2775    
2776 ph10 836 #ifdef SUPPORT_UTF
2777     if (utf)
2778 nigel 77 {
2779     for (i = 1; i <= min; i++)
2780     {
2781 ph10 427 if (eptr >= md->end_subject)
2782 ph10 426 {
2783 ph10 428 SCHECK_PARTIAL();
2784 ph10 836 RRETURN(MATCH_NOMATCH);
2785 ph10 427 }
2786 nigel 77 GETCHARINC(c, eptr);
2787     if (c > 255)
2788     {
2789 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2790 nigel 77 }
2791     else
2792 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2793 nigel 77 }
2794     }
2795     else
2796     #endif
2797 ph10 836 /* Not UTF mode */
2798 nigel 77 {
2799     for (i = 1; i <= min; i++)
2800     {
2801 ph10 427 if (eptr >= md->end_subject)
2802 ph10 426 {
2803 ph10 428 SCHECK_PARTIAL();
2804 ph10 836 RRETURN(MATCH_NOMATCH);
2805 ph10 427 }
2806 nigel 77 c = *eptr++;
2807 ph10 836 #ifndef COMPILE_PCRE8
2808     if (c > 255)
2809     {
2810     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2811     }
2812     else
2813     #endif
2814     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2815 nigel 77 }
2816     }
2817    
2818     /* If max == min we can continue with the main loop without the
2819     need to recurse. */
2820    
2821     if (min == max) continue;
2822    
2823     /* If minimizing, keep testing the rest of the expression and advancing
2824     the pointer while it matches the class. */
2825    
2826     if (minimize)
2827     {
2828 ph10 836 #ifdef SUPPORT_UTF
2829     if (utf)
2830 nigel 77 {
2831     for (fi = min;; fi++)
2832     {
2833 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2834 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2835 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2836 ph10 427 if (eptr >= md->end_subject)
2837 ph10 426 {
2838 ph10 427 SCHECK_PARTIAL();
2839 ph10 836 RRETURN(MATCH_NOMATCH);
2840 ph10 427 }
2841 nigel 77 GETCHARINC(c, eptr);
2842     if (c > 255)
2843     {
2844 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2845 nigel 77 }
2846     else
2847 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2848 nigel 77 }
2849     }
2850     else
2851     #endif
2852 ph10 836 /* Not UTF mode */
2853 nigel 77 {
2854     for (fi = min;; fi++)
2855     {
2856 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2857 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2858 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2859 ph10 427 if (eptr >= md->end_subject)
2860 ph10 426 {
2861 ph10 427 SCHECK_PARTIAL();
2862 ph10 836 RRETURN(MATCH_NOMATCH);
2863 ph10 427 }
2864 nigel 77 c = *eptr++;
2865 ph10 836 #ifndef COMPILE_PCRE8
2866     if (c > 255)
2867     {
2868     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2869     }
2870     else
2871     #endif
2872     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2873 nigel 77 }
2874     }
2875     /* Control never gets here */
2876     }
2877    
2878     /* If maximizing, find the longest possible run, then work backwards. */
2879    
2880     else
2881     {
2882     pp = eptr;
2883    
2884 ph10 836 #ifdef SUPPORT_UTF
2885     if (utf)
2886 nigel 77 {
2887     for (i = min; i < max; i++)
2888     {
2889     int len = 1;
2890 ph10 463 if (eptr >= md->end_subject)
2891 ph10 462 {
2892 ph10 463 SCHECK_PARTIAL();
2893 ph10 462 break;
2894 ph10 463 }
2895 nigel 77 GETCHARLEN(c, eptr, len);
2896     if (c > 255)
2897     {
2898     if (op == OP_CLASS) break;
2899     }
2900     else
2901 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2902 nigel 77 eptr += len;
2903     }
2904     for (;;)
2905     {
2906 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2907 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2908     if (eptr-- == pp) break; /* Stop if tried at original pos */
2909     BACKCHAR(eptr);
2910     }
2911     }
2912     else
2913     #endif
2914 ph10 836 /* Not UTF mode */
2915 nigel 77 {
2916     for (i = min; i < max; i++)
2917     {
2918 ph10 463 if (eptr >= md->end_subject)
2919 ph10 462 {
2920 ph10 463 SCHECK_PARTIAL();
2921 ph10 462 break;
2922 ph10 463 }
2923 nigel 77 c = *eptr;
2924 ph10 836 #ifndef COMPILE_PCRE8
2925     if (c > 255)
2926     {
2927     if (op == OP_CLASS) break;
2928     }
2929     else
2930     #endif
2931     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2932 nigel 77 eptr++;
2933     }
2934     while (eptr >= pp)
2935     {
2936 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2937 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2938 nigel 77 eptr--;
2939     }
2940     }
2941    
2942 ph10 836 RRETURN(MATCH_NOMATCH);
2943 nigel 77 }
2944 ph10 836 #undef BYTE_MAP
2945 nigel 77 }
2946     /* Control never gets here */
2947    
2948    
2949     /* Match an extended character class. This opcode is encountered only
2950 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2951     mode, because Unicode properties are supported in non-UTF-8 mode. */
2952 nigel 77
2953 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2954 nigel 77 case OP_XCLASS:
2955     {
2956     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2957     ecode += GET(ecode, 1); /* Advance past the item */
2958    
2959     switch (*ecode)
2960     {
2961     case OP_CRSTAR:
2962     case OP_CRMINSTAR:
2963     case OP_CRPLUS:
2964     case OP_CRMINPLUS:
2965     case OP_CRQUERY:
2966     case OP_CRMINQUERY:
2967     c = *ecode++ - OP_CRSTAR;
2968     minimize = (c & 1) != 0;
2969     min = rep_min[c]; /* Pick up values from tables; */
2970     max = rep_max[c]; /* zero for max => infinity */
2971     if (max == 0) max = INT_MAX;
2972     break;
2973    
2974     case OP_CRRANGE:
2975     case OP_CRMINRANGE:
2976     minimize = (*ecode == OP_CRMINRANGE);
2977     min = GET2(ecode, 1);
2978 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2979 nigel 77 if (max == 0) max = INT_MAX;
2980 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2981 nigel 77 break;
2982    
2983     default: /* No repeat follows */
2984     min = max = 1;
2985     break;
2986     }
2987    
2988     /* First, ensure the minimum number of matches are present. */
2989    
2990     for (i = 1; i <= min; i++)
2991     {
2992 ph10 427 if (eptr >= md->end_subject)
2993 ph10 426 {
2994     SCHECK_PARTIAL();
2995 ph10 836 RRETURN(MATCH_NOMATCH);
2996 ph10 427 }
2997 ph10 384 GETCHARINCTEST(c, eptr);
2998 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2999 nigel 77 }
3000    
3001     /* If max == min we can continue with the main loop without the
3002     need to recurse. */
3003    
3004     if (min == max) continue;
3005    
3006     /* If minimizing, keep testing the rest of the expression and advancing
3007     the pointer while it matches the class. */
3008    
3009     if (minimize)
3010     {
3011     for (fi = min;; fi++)
3012     {
3013 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3014 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3016 ph10 427 if (eptr >= md->end_subject)
3017 ph10 426 {
3018 ph10 427 SCHECK_PARTIAL();
3019 ph10 836 RRETURN(MATCH_NOMATCH);
3020 ph10 427 }
3021 ph10 384 GETCHARINCTEST(c, eptr);
3022 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3023 nigel 77 }
3024     /* Control never gets here */
3025     }
3026    
3027     /* If maximizing, find the longest possible run, then work backwards. */
3028    
3029     else
3030     {
3031     pp = eptr;
3032     for (i = min; i < max; i++)
3033     {
3034     int len = 1;
3035 ph10 463 if (eptr >= md->end_subject)
3036 ph10 462 {
3037 ph10 463 SCHECK_PARTIAL();
3038 ph10 462 break;
3039 ph10 463 }
3040 ph10 836 #ifdef SUPPORT_UTF
3041 ph10 384 GETCHARLENTEST(c, eptr, len);
3042 ph10 836 #else
3043     c = *eptr;
3044     #endif
3045     if (!PRIV(xclass)(c, data, utf)) break;
3046 nigel 77 eptr += len;
3047     }
3048     for(;;)
3049     {
3050 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3051 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3052     if (eptr-- == pp) break; /* Stop if tried at original pos */
3053 ph10 836 #ifdef SUPPORT_UTF
3054     if (utf) BACKCHAR(eptr);
3055     #endif
3056 nigel 77 }
3057 ph10 836 RRETURN(MATCH_NOMATCH);
3058 nigel 77 }
3059    
3060     /* Control never gets here */
3061     }
3062     #endif /* End of XCLASS */
3063    
3064     /* Match a single character, casefully */
3065    
3066     case OP_CHAR:
3067 ph10 836 #ifdef SUPPORT_UTF
3068     if (utf)
3069 nigel 77 {
3070     length = 1;
3071     ecode++;
3072     GETCHARLEN(fc, ecode, length);
3073 ph10 443 if (length > md->end_subject - eptr)
3074 ph10 428 {
3075     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3076 ph10 836 RRETURN(MATCH_NOMATCH);
3077 ph10 443 }
3078 ph10 836 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3079 nigel 77 }
3080     else
3081     #endif
3082 ph10 836 /* Not UTF mode */
3083 nigel 77 {
3084 ph10 443 if (md->end_subject - eptr < 1)
3085 ph10 428 {
3086     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3087 ph10 836 RRETURN(MATCH_NOMATCH);
3088 ph10 443 }
3089 ph10 836 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3090 nigel 77 ecode += 2;
3091     }
3092     break;
3093    
3094 ph10 836 /* Match a single character, caselessly. If we are at the end of the
3095     subject, give up immediately. */
3096 nigel 77
3097 ph10 602 case OP_CHARI:
3098 ph10 836 if (eptr >= md->end_subject)
3099 nigel 77 {
3100 ph10 836 SCHECK_PARTIAL();
3101     RRETURN(MATCH_NOMATCH);
3102     }
3103    
3104     #ifdef SUPPORT_UTF
3105     if (utf)
3106     {
3107 nigel 77 length = 1;
3108     ecode++;
3109     GETCHARLEN(fc, ecode, length);
3110 ph10 788
3111 nigel 77 /* If the pattern character's value is < 128, we have only one byte, and
3112 ph10 836 we know that its other case must also be one byte long, so we can use the
3113     fast lookup table. We know that there is at least one byte left in the
3114     subject. */
3115 nigel 77
3116     if (fc < 128)
3117     {
3118 ph10 836 if (md->lcc[fc]
3119     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3120     ecode++;
3121     eptr++;
3122 nigel 77 }
3123    
3124 ph10 836 /* Otherwise we must pick up the subject character. Note that we cannot
3125     use the value of "length" to check for sufficient bytes left, because the
3126     other case of the character may have more or fewer bytes. */
3127 nigel 77
3128     else
3129     {
3130 nigel 93 unsigned int dc;
3131 nigel 77 GETCHARINC(dc, eptr);
3132     ecode += length;
3133    
3134     /* If we have Unicode property support, we can use it to test the other
3135 nigel 87 case of the character, if there is one. */
3136 nigel 77
3137     if (fc != dc)
3138     {
3139     #ifdef SUPPORT_UCP
3140 ph10 349 if (dc != UCD_OTHERCASE(fc))
3141 nigel 77 #endif
3142 ph10 836 RRETURN(MATCH_NOMATCH);
3143 nigel 77 }
3144     }
3145     }
3146     else
3147 ph10 836 #endif /* SUPPORT_UTF */
3148 nigel 77
3149 ph10 836 /* Not UTF mode */
3150 nigel 77 {
3151 ph10 836 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3152     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3153     eptr++;
3154 nigel 77 ecode += 2;
3155     }
3156     break;
3157    
3158 nigel 93 /* Match a single character repeatedly. */
3159 nigel 77
3160     case OP_EXACT:
3161 ph10 602 case OP_EXACTI:
3162 nigel 77 min = max = GET2(ecode, 1);
3163 ph10 836 ecode += 1 + IMM2_SIZE;
3164 nigel 77 goto REPEATCHAR;
3165    
3166 nigel 93 case OP_POSUPTO:
3167 ph10 602 case OP_POSUPTOI:
3168 nigel 93 possessive = TRUE;
3169     /* Fall through */
3170    
3171 nigel 77 case OP_UPTO:
3172 ph10 602 case OP_UPTOI:
3173 nigel 77 case OP_MINUPTO:
3174 ph10 602 case OP_MINUPTOI:
3175 nigel 77 min = 0;
3176     max = GET2(ecode, 1);
3177 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3178 ph10 836 ecode += 1 + IMM2_SIZE;
3179 nigel 77 goto REPEATCHAR;
3180    
3181 nigel 93 case OP_POSSTAR:
3182 ph10 602 case OP_POSSTARI:
3183 nigel 93 possessive = TRUE;
3184     min = 0;
3185     max = INT_MAX;
3186     ecode++;
3187     goto REPEATCHAR;
3188    
3189     case OP_POSPLUS:
3190 ph10 602 case OP_POSPLUSI:
3191 nigel 93 possessive = TRUE;
3192     min = 1;
3193     max = INT_MAX;
3194     ecode++;
3195     goto REPEATCHAR;
3196    
3197     case OP_POSQUERY:
3198 ph10 602 case OP_POSQUERYI:
3199 nigel 93 possessive = TRUE;
3200     min = 0;
3201     max = 1;
3202     ecode++;
3203     goto REPEATCHAR;
3204    
3205 nigel 77 case OP_STAR:
3206 ph10 602 case OP_STARI:
3207 nigel 77 case OP_MINSTAR:
3208 ph10 602 case OP_MINSTARI:
3209 nigel 77 case OP_PLUS:
3210 ph10 602 case OP_PLUSI:
3211 nigel 77 case OP_MINPLUS:
3212 ph10 602 case OP_MINPLUSI:
3213 nigel 77 case OP_QUERY:
3214 ph10 602 case OP_QUERYI:
3215 nigel 77 case OP_MINQUERY:
3216 ph10 602 case OP_MINQUERYI:
3217     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3218 nigel 77 minimize = (c & 1) != 0;
3219     min = rep_min[c]; /* Pick up values from tables; */
3220     max = rep_max[c]; /* zero for max => infinity */
3221     if (max == 0) max = INT_MAX;
3222    
3223 ph10 426 /* Common code for all repeated single-character matches. */
3224 nigel 77
3225     REPEATCHAR:
3226 ph10 836 #ifdef SUPPORT_UTF
3227     if (utf)
3228 nigel 77 {
3229     length = 1;
3230     charptr = ecode;
3231     GETCHARLEN(fc, ecode, length);
3232     ecode += length;
3233    
3234     /* Handle multibyte character matching specially here. There is
3235     support for caseless matching if UCP support is present. */
3236    
3237     if (length > 1)
3238     {
3239     #ifdef SUPPORT_UCP
3240 nigel 93 unsigned int othercase;
3241 ph10 602 if (op >= OP_STARI && /* Caseless */
3242 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3243 ph10 836 oclength = PRIV(ord2utf)(othercase, occhars);
3244 ph10 115 else oclength = 0;
3245 nigel 77 #endif /* SUPPORT_UCP */
3246    
3247     for (i = 1; i <= min; i++)
3248     {
3249 ph10 426 if (eptr <= md->end_subject - length &&
3250 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3251 ph10 123 #ifdef SUPPORT_UCP
3252 ph10 426 else if (oclength > 0 &&
3253     eptr <= md->end_subject - oclength &&
3254 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3255 ph10 426 #endif /* SUPPORT_UCP */
3256 nigel 77 else
3257     {
3258 ph10 426 CHECK_PARTIAL();
3259 ph10 836 RRETURN(MATCH_NOMATCH);
3260 nigel 77 }
3261     }
3262    
3263     if (min == max) continue;
3264    
3265     if (minimize)
3266     {
3267     for (fi = min;; fi++)
3268     {
3269 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3270 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3271 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3272 ph10 426 if (eptr <= md->end_subject - length &&
3273 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3274 ph10 123 #ifdef SUPPORT_UCP
3275 ph10 426 else if (oclength > 0 &&
3276     eptr <= md->end_subject - oclength &&
3277 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3278 ph10 426 #endif /* SUPPORT_UCP */
3279 nigel 77 else
3280     {
3281 ph10 426 CHECK_PARTIAL();
3282 ph10 836 RRETURN(MATCH_NOMATCH);
3283 nigel 77 }
3284     }
3285     /* Control never gets here */
3286     }
3287 nigel 93
3288     else /* Maximize */
3289 nigel 77 {
3290     pp = eptr;
3291     for (i = min; i < max; i++)
3292     {
3293 ph10 426 if (eptr <= md->end_subject - length &&
3294 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3295 ph10 123 #ifdef SUPPORT_UCP
3296 ph10 426 else if (oclength > 0 &&
3297     eptr <= md->end_subject - oclength &&
3298 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3299 ph10 426 #endif /* SUPPORT_UCP */
3300 ph10 463 else
3301 ph10 462 {
3302 ph10 463 CHECK_PARTIAL();
3303 ph10 462 break;
3304 ph10 463 }
3305 nigel 77 }
3306 nigel 93
3307     if (possessive) continue;
3308 ph10 427
3309 ph10 120 for(;;)
3310 ph10 426 {
3311 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3312 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3313 ph10 836 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3314 ph10 115 #ifdef SUPPORT_UCP
3315 ph10 426 eptr--;
3316     BACKCHAR(eptr);
3317 ph10 123 #else /* without SUPPORT_UCP */
3318 ph10 426 eptr -= length;
3319 ph10 123 #endif /* SUPPORT_UCP */
3320 ph10 426 }
3321 nigel 77 }
3322     /* Control never gets here */
3323     }
3324    
3325     /* If the length of a UTF-8 character is 1, we fall through here, and
3326     obey the code as for non-UTF-8 characters below, though in this case the
3327     value of fc will always be < 128. */
3328     }
3329     else
3330 ph10 836 #endif /* SUPPORT_UTF */
3331     /* When not in UTF-8 mode, load a single-byte character. */
3332     fc = *ecode++;
3333 nigel 77
3334 ph10 836 /* The value of fc at this point is always one character, though we may
3335     or may not be in UTF mode. The code is duplicated for the caseless and
3336 nigel 77 caseful cases, for speed, since matching characters is likely to be quite
3337     common. First, ensure the minimum number of matches are present. If min =
3338     max, continue at the same level without recursing. Otherwise, if
3339     minimizing, keep trying the rest of the expression and advancing one
3340     matching character if failing, up to the maximum. Alternatively, if
3341     maximizing, find the maximum number of characters and work backwards. */
3342    
3343     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3344     max, eptr));
3345    
3346 ph10 602 if (op >= OP_STARI) /* Caseless */
3347 nigel 77 {
3348 ph10 836 #ifdef COMPILE_PCRE8
3349     /* fc must be < 128 if UTF is enabled. */
3350     foc = md->fcc[fc];
3351     #else
3352     #ifdef SUPPORT_UTF
3353     #ifdef SUPPORT_UCP
3354     if (utf && fc > 127)
3355     foc = UCD_OTHERCASE(fc);
3356     #else
3357     if (utf && fc > 127)
3358     foc = fc;
3359     #endif /* SUPPORT_UCP */
3360     else
3361     #endif /* SUPPORT_UTF */
3362     foc = TABLE_GET(fc, md->fcc, fc);
3363     #endif /* COMPILE_PCRE8 */
3364    
3365 nigel 77 for (i = 1; i <= min; i++)
3366 ph10 426 {
3367     if (eptr >= md->end_subject)
3368     {
3369     SCHECK_PARTIAL();
3370 ph10 836 RRETURN(MATCH_NOMATCH);
3371 ph10 426 }
3372 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3373     eptr++;
3374 ph10 426 }
3375 nigel 77 if (min == max) continue;
3376     if (minimize)
3377     {
3378     for (fi = min;; fi++)
3379     {
3380 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3381 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3382 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3383 ph10 426 if (eptr >= md->end_subject)
3384     {
3385 ph10 427 SCHECK_PARTIAL();
3386 ph10 836 RRETURN(MATCH_NOMATCH);
3387 ph10 426 }
3388 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3389     eptr++;
3390 nigel 77 }
3391     /* Control never gets here */
3392     }
3393 nigel 93 else /* Maximize */
3394 nigel 77 {
3395     pp = eptr;
3396     for (i = min; i < max; i++)
3397     {
3398 ph10 463 if (eptr >= md->end_subject)
3399 ph10 462 {
3400     SCHECK_PARTIAL();
3401     break;
3402 ph10 463 }
3403 ph10 836 if (fc != *eptr && foc != *eptr) break;
3404 nigel 77 eptr++;
3405     }
3406 ph10 427
3407 nigel 93 if (possessive) continue;
3408 ph10 427
3409 nigel 77 while (eptr >= pp)
3410     {
3411 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3412 nigel 77 eptr--;
3413     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3414     }
3415 ph10 836 RRETURN(MATCH_NOMATCH);
3416 nigel 77 }
3417     /* Control never gets here */
3418     }
3419    
3420     /* Caseful comparisons (includes all multi-byte characters) */
3421    
3422     else
3423     {
3424 ph10 427 for (i = 1; i <= min; i++)
3425 ph10 426 {
3426     if (eptr >= md->end_subject)
3427     {
3428     SCHECK_PARTIAL();
3429 ph10 836 RRETURN(MATCH_NOMATCH);
3430 ph10 426 }
3431 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3432 ph10 427 }
3433 ph10 443
3434 nigel 77 if (min == max) continue;
3435 ph10 443
3436 nigel 77 if (minimize)
3437     {
3438     for (fi = min;; fi++)
3439     {
3440 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3441 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3443 ph10 426 if (eptr >= md->end_subject)
3444 ph10 427 {
3445 ph10 426 SCHECK_PARTIAL();
3446 ph10 836 RRETURN(MATCH_NOMATCH);
3447 ph10 427 }
3448 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3449 nigel 77 }
3450     /* Control never gets here */
3451     }
3452 nigel 93 else /* Maximize */
3453 nigel 77 {
3454     pp = eptr;
3455     for (i = min; i < max; i++)
3456     {
3457 ph10 463 if (eptr >= md->end_subject)
3458 ph10 462 {
3459 ph10 463 SCHECK_PARTIAL();
3460 ph10 462 break;
3461 ph10 463 }
3462 ph10 462 if (fc != *eptr) break;
3463 nigel 77 eptr++;
3464     }
3465 nigel 93 if (possessive) continue;
3466 ph10 443
3467 nigel 77 while (eptr >= pp)
3468     {
3469 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3470 nigel 77 eptr--;
3471     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3472     }
3473 ph10 836 RRETURN(MATCH_NOMATCH);
3474 nigel 77 }
3475     }
3476     /* Control never gets here */
3477    
3478     /* Match a negated single one-byte character. The character we are
3479     checking can be multibyte. */
3480    
3481     case OP_NOT:
3482 ph10 625 case OP_NOTI:
3483 ph10 443 if (eptr >= md->end_subject)
3484 ph10 428 {
3485 ph10 443 SCHECK_PARTIAL();
3486 ph10 836 RRETURN(MATCH_NOMATCH);
3487 ph10 443 }
3488 nigel 77 ecode++;
3489     GETCHARINCTEST(c, eptr);
3490 ph10 602 if (op == OP_NOTI) /* The caseless case */
3491 nigel 77 {
3492 ph10 836 register int ch, och;
3493     ch = *ecode++;
3494     #ifdef COMPILE_PCRE8
3495     /* ch must be < 128 if UTF is enabled. */
3496     och = md->fcc[ch];
3497     #else
3498     #ifdef SUPPORT_UTF
3499     #ifdef SUPPORT_UCP
3500     if (utf && ch > 127)
3501     och = UCD_OTHERCASE(ch);
3502     #else
3503     if (utf && ch > 127)
3504     och = ch;
3505     #endif /* SUPPORT_UCP */
3506     else
3507     #endif /* SUPPORT_UTF */
3508     och = TABLE_GET(ch, md->fcc, ch);
3509     #endif /* COMPILE_PCRE8 */
3510     if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3511 nigel 77 }
3512 ph10 602 else /* Caseful */
3513 nigel 77 {
3514 ph10 836 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3515 nigel 77 }
3516     break;
3517    
3518     /* Match a negated single one-byte character repeatedly. This is almost a
3519     repeat of the code for a repeated single character, but I haven't found a
3520     nice way of commoning these up that doesn't require a test of the
3521     positive/negative option for each character match. Maybe that wouldn't add
3522     very much to the time taken, but character matching *is* what this is all
3523     about... */
3524    
3525     case OP_NOTEXACT:
3526 ph10 602 case OP_NOTEXACTI:
3527 nigel 77 min = max = GET2(ecode, 1);
3528 ph10 836 ecode += 1 + IMM2_SIZE;
3529 nigel 77 goto REPEATNOTCHAR;
3530    
3531     case OP_NOTUPTO:
3532 ph10 602 case OP_NOTUPTOI:
3533 nigel 77 case OP_NOTMINUPTO:
3534 ph10 602 case OP_NOTMINUPTOI:
3535 nigel 77 min = 0;
3536     max = GET2(ecode, 1);
3537 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3538 ph10 836 ecode += 1 + IMM2_SIZE;
3539 nigel 77 goto REPEATNOTCHAR;
3540    
3541 nigel 93 case OP_NOTPOSSTAR:
3542 ph10 602 case OP_NOTPOSSTARI:
3543 nigel 93 possessive = TRUE;
3544     min = 0;
3545     max = INT_MAX;
3546     ecode++;
3547     goto REPEATNOTCHAR;
3548    
3549     case OP_NOTPOSPLUS:
3550 ph10 602 case OP_NOTPOSPLUSI:
3551 nigel 93 possessive = TRUE;
3552     min = 1;
3553     max = INT_MAX;
3554     ecode++;
3555     goto REPEATNOTCHAR;
3556    
3557     case OP_NOTPOSQUERY:
3558 ph10 602 case OP_NOTPOSQUERYI:
3559 nigel 93 possessive = TRUE;
3560     min = 0;
3561     max = 1;
3562     ecode++;
3563     goto REPEATNOTCHAR;
3564    
3565     case OP_NOTPOSUPTO:
3566 ph10 602 case OP_NOTPOSUPTOI:
3567 nigel 93 possessive = TRUE;
3568     min = 0;
3569     max = GET2(ecode, 1);
3570 ph10 836 ecode += 1 + IMM2_SIZE;
3571 nigel 93 goto REPEATNOTCHAR;
3572    
3573 nigel 77 case OP_NOTSTAR:
3574 ph10 602 case OP_NOTSTARI:
3575 nigel 77 case OP_NOTMINSTAR:
3576 ph10 602 case OP_NOTMINSTARI:
3577 nigel 77 case OP_NOTPLUS:
3578 ph10 602 case OP_NOTPLUSI:
3579 nigel 77 case OP_NOTMINPLUS:
3580 ph10 602 case OP_NOTMINPLUSI:
3581 nigel 77 case OP_NOTQUERY:
3582 ph10 602 case OP_NOTQUERYI:
3583 nigel 77 case OP_NOTMINQUERY:
3584 ph10 602 case OP_NOTMINQUERYI:
3585     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3586 nigel 77 minimize = (c & 1) != 0;
3587     min = rep_min[c]; /* Pick up values from tables; */
3588     max = rep_max[c]; /* zero for max => infinity */
3589     if (max == 0) max = INT_MAX;
3590    
3591 ph10 426 /* Common code for all repeated single-byte matches. */
3592 nigel 77
3593     REPEATNOTCHAR:
3594     fc = *ecode++;
3595    
3596     /* The code is duplicated for the caseless and caseful cases, for speed,
3597     since matching characters is likely to be quite common. First, ensure the
3598     minimum number of matches are present. If min = max, continue at the same
3599     level without recursing. Otherwise, if minimizing, keep trying the rest of
3600     the expression and advancing one matching character if failing, up to the
3601     maximum. Alternatively, if maximizing, find the maximum number of
3602     characters and work backwards. */
3603    
3604     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3605     max, eptr));
3606    
3607 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3608 nigel 77 {
3609 ph10 836 #ifdef COMPILE_PCRE8
3610     /* fc must be < 128 if UTF is enabled. */
3611     foc = md->fcc[fc];
3612     #else
3613     #ifdef SUPPORT_UTF
3614     #ifdef SUPPORT_UCP
3615     if (utf && fc > 127)
3616     foc = UCD_OTHERCASE(fc);
3617     #else
3618     if (utf && fc > 127)
3619     foc = fc;
3620     #endif /* SUPPORT_UCP */
3621     else
3622     #endif /* SUPPORT_UTF */
3623     foc = TABLE_GET(fc, md->fcc, fc);
3624     #endif /* COMPILE_PCRE8 */
3625 nigel 77
3626 ph10 836 #ifdef SUPPORT_UTF
3627     if (utf)
3628 nigel 77 {
3629 nigel 93 register unsigned int d;
3630 nigel 77 for (i = 1; i <= min; i++)
3631     {
3632 ph10 426 if (eptr >= md->end_subject)
3633     {
3634     SCHECK_PARTIAL();
3635 ph10 836 RRETURN(MATCH_NOMATCH);
3636 ph10 427 }
3637 nigel 77 GETCHARINC(d, eptr);
3638 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3639 nigel 77 }
3640     }
3641     else
3642     #endif
3643 ph10 836 /* Not UTF mode */
3644 nigel 77 {
3645     for (i = 1; i <= min; i++)
3646 ph10 426 {
3647     if (eptr >= md->end_subject)
3648     {
3649     SCHECK_PARTIAL();
3650 ph10 836 RRETURN(MATCH_NOMATCH);
3651 ph10 427 }
3652 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3653     eptr++;
3654 ph10 427 }
3655 nigel 77 }
3656    
3657     if (min == max) continue;
3658    
3659     if (minimize)
3660     {
3661 ph10 836 #ifdef SUPPORT_UTF
3662     if (utf)
3663 nigel 77 {
3664 nigel 93 register unsigned int d;
3665 nigel 77 for (fi = min;; fi++)
3666     {
3667 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3668 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3669 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3670 ph10 427 if (eptr >= md->end_subject)
3671 ph10 426 {
3672 ph10 427 SCHECK_PARTIAL();
3673 ph10 836 RRETURN(MATCH_NOMATCH);
3674 ph10 427 }
3675 nigel 77 GETCHARINC(d, eptr);
3676 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3677 nigel 77 }
3678     }
3679     else
3680     #endif
3681 ph10 836 /* Not UTF mode */
3682 nigel 77 {
3683     for (fi = min;; fi++)
3684     {
3685 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3686 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3687 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3688 ph10 426 if (eptr >= md->end_subject)
3689     {
3690     SCHECK_PARTIAL();
3691 ph10 836 RRETURN(MATCH_NOMATCH);
3692 ph10 426 }
3693 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3694     eptr++;
3695 nigel 77 }
3696     }
3697     /* Control never gets here */
3698     }
3699    
3700     /* Maximize case */
3701    
3702     else
3703     {
3704     pp = eptr;
3705    
3706 ph10 836 #ifdef SUPPORT_UTF
3707     if (utf)
3708 nigel 77 {
3709 nigel 93 register unsigned int d;
3710 nigel 77 for (i = min; i < max; i++)
3711     {
3712     int len = 1;
3713 ph10 463 if (eptr >= md->end_subject)
3714 ph10 462