/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 836 - (hide annotations) (download)
Wed Dec 28 17:16:11 2011 UTC (2 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 211364 byte(s)
Merging all the changes from the pcre16 branch into the trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
86     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87     because the offset vector is always a multiple of 3 long. */
88    
89     #define REC_STACK_SAVE_MAX 30
90    
/* Minimum and maximum repeat counts for the common repeat forms. The two
tables have the same number of entries (presumably indexed by the same
repeat-type value - confirm at the point of use). In rep_max, a value of 0
stands for "infinity", that is, no upper limit. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
95    
96    
97    
98 ph10 475 #ifdef PCRE_DEBUG
99 nigel 77 /*************************************************
100     * Debugging function to print chars *
101     *************************************************/
102    
103     /* Print a sequence of chars in printable format, stopping at the end of the
104     subject if the requested.
105    
106     Arguments:
107     p points to characters
108     length number to print
109     is_subject TRUE if printing from within md->start_subject
110     md pointer to matching data block, if is_subject is TRUE
111    
112     Returns: nothing
113     */
114    
115     static void
116 ph10 836 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 nigel 77 {
118 nigel 93 unsigned int c;
119 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120     while (length-- > 0)
121     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122     }
123     #endif
124    
125    
126    
127     /*************************************************
128     * Match a back-reference *
129     *************************************************/
130    
131 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
132     negative, so the match always fails. However, in JavaScript compatibility mode,
133 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 ph10 595 subject bytes matched may be different to the number of reference bytes.
135 nigel 77
136     Arguments:
137     offset index into the offset vector
138 ph10 595 eptr pointer into the subject
139     length length of reference to be matched (number of bytes)
140 nigel 77 md points to match data block
141 ph10 602 caseless TRUE if caseless
142 nigel 77
143 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 nigel 77 */
145    
146 ph10 595 static int
147 ph10 836 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 ph10 602 BOOL caseless)
149 nigel 77 {
150 ph10 836 PCRE_PUCHAR eptr_start = eptr;
151     register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 nigel 77
153 ph10 475 #ifdef PCRE_DEBUG
154 nigel 77 if (eptr >= md->end_subject)
155     printf("matching subject <null>");
156     else
157     {
158     printf("matching subject ");
159     pchars(eptr, length, TRUE, md);
160     }
161     printf(" against backref ");
162     pchars(p, length, FALSE, md);
163     printf("\n");
164     #endif
165    
166 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
167 nigel 77
168 ph10 595 if (length < 0) return -1;
169 nigel 77
170 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171     properly if Unicode properties are supported. Otherwise, we can check only
172     ASCII characters. */
173 nigel 77
174 ph10 602 if (caseless)
175 nigel 77 {
176 ph10 836 #ifdef SUPPORT_UTF
177 ph10 354 #ifdef SUPPORT_UCP
178 ph10 836 if (md->utf)
179 ph10 354 {
180 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
181 ph10 595 bytes matched may differ, because there are some characters whose upper and
182     lower case versions code as different numbers of bytes. For example, U+023A
183     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 ph10 625 the latter. It is important, therefore, to check the length along the
186 ph10 595 reference, not along the subject (earlier code did this wrong). */
187 ph10 625
188 ph10 836 PCRE_PUCHAR endptr = p + length;
189 ph10 595 while (p < endptr)
190 ph10 354 {
191 ph10 358 int c, d;
192 ph10 597 if (eptr >= md->end_subject) return -1;
193 ph10 354 GETCHARINC(c, eptr);
194     GETCHARINC(d, p);
195 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 ph10 358 }
197     }
198 ph10 354 else
199     #endif
200     #endif
201    
202     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203     is no UCP support. */
204 ph10 597 {
205 ph10 625 if (eptr + length > md->end_subject) return -1;
206 ph10 597 while (length-- > 0)
207 ph10 836 {
208     if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209     p++;
210     eptr++;
211     }
212 ph10 625 }
213 nigel 77 }
214 ph10 358
215 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
216     are in UTF-8 mode. */
217 ph10 358
218 nigel 77 else
219 ph10 625 {
220     if (eptr + length > md->end_subject) return -1;
221     while (length-- > 0) if (*p++ != *eptr++) return -1;
222 ph10 597 }
223 nigel 77
224 ph10 836 return (int)(eptr - eptr_start);
225 nigel 77 }
226    
227    
228    
229     /***************************************************************************
230     ****************************************************************************
231     RECURSION IN THE match() FUNCTION
232    
233 nigel 87 The match() function is highly recursive, though not every recursive call
234     increases the recursive depth. Nevertheless, some regular expressions can cause
235     it to recurse to a great depth. I was writing for Unix, so I just let it call
236     itself recursively. This uses the stack for saving everything that has to be
237     saved for a recursive call. On Unix, the stack can be large, and this works
238     fine.
239 nigel 77
240 nigel 87 It turns out that on some non-Unix-like systems there are problems with
241     programs that use a lot of stack. (This despite the fact that every last chip
242     has oodles of memory these days, and techniques for extending the stack have
243     been known for decades.) So....
244 nigel 77
245     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246     calls by keeping local variables that need to be preserved in blocks of memory
247 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
248 nigel 77 achieve this so that the actual code doesn't look very different to what it
249     always used to.
250 ph10 164
251 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
252 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
253     Switzer, the use of longjmp() has been abolished, at the cost of having to
254     provide a unique number for each call to RMATCH. There is no way of generating
255     a sequence of numbers at compile time in C. I have given them names, to make
256     them stand out more clearly.
257    
258     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
261     don't have indeterminate values; this has meant that the frame size can be
262 ph10 164 reduced because the result can be "passed back" by straight setting of the
263     variable instead of being passed in the frame.
264 nigel 77 ****************************************************************************
265     ***************************************************************************/
266    
/* Identifiers for the individual RMATCH call sites. One of these is passed as
the "rw" argument of every RMATCH; the heap-based version of that macro records
it in the frame and pastes it into the name of the label at which execution
resumes. The values start at 1 and are consecutive. Whenever this list is
changed, the code at HEAP_RETURN must be updated to match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10, RM11,
  RM12,    RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22,
  RM23,    RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33,
  RM34,    RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44,
  RM45,    RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55,
  RM56,    RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64, RM65, RM66
};
277 ph10 164
278 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
279 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 ph10 501 actually used in this definition. */
281 nigel 77
282     #ifndef NO_RECURSE
/* With real recursion, REGISTER expands to the "register" keyword; it is used
on the eptr and ecode arguments in the definition of match(). */
283     #define REGISTER register
284 ph10 164
285 ph10 475 #ifdef PCRE_DEBUG
/* Debugging version of RMATCH: call match() one level deeper (rdepth+1),
leaving the result in rrc, and print the source line number before and after
the call. The arguments ra, rb, rc, rd and re are passed as eptr, ecode,
offset_top, md and eptrb respectively; mstart is taken from the enclosing
scope. The rw argument is not used. */
286 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 nigel 87 { \
288     printf("match() called in line %d\n", __LINE__); \
289 ph10 836 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 nigel 87 printf("to line %d\n", __LINE__); \
291     }
/* Debugging version of RRETURN: print the value and the source line number,
then return the value. Note that ra appears twice in the expansion. */
292     #define RRETURN(ra) \
293     { \
294     printf("match() returned %d from line %d ", ra, __LINE__); \
295     return ra; \
296     }
297     #else
/* Production versions: a plain recursive call whose result is left in rrc,
and a plain return. The rw argument is not used. */
298 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 ph10 836 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 nigel 87 #define RRETURN(ra) return ra
301 nigel 87 #endif
302    
303 nigel 77 #else
304    
305    
306 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
307     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308     argument of match(), which never changes. */
309 nigel 77
/* When the heap is used instead of recursion, REGISTER expands to nothing. */
310     #define REGISTER
311    
/* Heap version of RMATCH. A new frame is obtained through stack_malloc; if
that fails, RRETURN is used to give PCRE_ERROR_NOMEMORY. Otherwise rw is
recorded in the current frame's Xwhere field, the new frame is filled in with
the argument values (ra, rb, the current mstart, rc, re) and a depth one
greater than the current one, it is linked back to the current frame, and it
becomes the current frame. Control then jumps to HEAP_RECURSE. The label L_<rw>
that is defined here marks the place at which execution continues afterwards.
The rd argument is not used. */
312 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 nigel 77 {\
314 ph10 836 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 ph10 164 frame->Xwhere = rw; \
317     newframe->Xeptr = ra;\
318     newframe->Xecode = rb;\
319 ph10 168 newframe->Xmstart = mstart;\
320 ph10 164 newframe->Xoffset_top = rc;\
321 ph10 602 newframe->Xeptrb = re;\
322 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
323     newframe->Xprevframe = frame;\
324     frame = newframe;\
325     DPRINTF(("restarting from line %d\n", __LINE__));\
326     goto HEAP_RECURSE;\
327     L_##rw:\
328     DPRINTF(("jumped back to line %d\n", __LINE__));\
329 nigel 77 }
330    
/* Heap version of RRETURN. The current frame is released through stack_free
and the previous frame becomes current. If there is a previous frame, the value
is placed in rrc and control jumps to HEAP_RETURN; if there is none, the value
is returned from the function. Note that the current frame pointer is
dereferenced, so it must not be NULL when this macro is used. */
331     #define RRETURN(ra)\
332     {\
333 ph10 527 heapframe *oldframe = frame;\
334     frame = oldframe->Xprevframe;\
335 ph10 836 (PUBL(stack_free))(oldframe);\
336 nigel 77 if (frame != NULL)\
337     {\
338 ph10 164 rrc = ra;\
339     goto HEAP_RETURN;\
340 nigel 77 }\
341     return ra;\
342     }
343    
344    
345     /* Structure for remembering the local variables in a private frame */
346    
347     typedef struct heapframe {
348     struct heapframe *Xprevframe;
349    
350     /* Function arguments that may change */
351    
352 ph10 836 PCRE_PUCHAR Xeptr;
353     const pcre_uchar *Xecode;
354     PCRE_PUCHAR Xmstart;
355 nigel 77 int Xoffset_top;
356     eptrblock *Xeptrb;
357 nigel 91 unsigned int Xrdepth;
358 nigel 77
359     /* Function local variables */
360    
361 ph10 836 PCRE_PUCHAR Xcallpat;
362     #ifdef SUPPORT_UTF
363     PCRE_PUCHAR Xcharptr;
364 ph10 406 #endif
365 ph10 836 PCRE_PUCHAR Xdata;
366     PCRE_PUCHAR Xnext;
367     PCRE_PUCHAR Xpp;
368     PCRE_PUCHAR Xprev;
369     PCRE_PUCHAR Xsaved_eptr;
370 nigel 77
371     recursion_info Xnew_recursive;
372    
373     BOOL Xcur_is_word;
374     BOOL Xcondition;
375     BOOL Xprev_is_word;
376    
377     #ifdef SUPPORT_UCP
378     int Xprop_type;
379 nigel 87 int Xprop_value;
380 nigel 77 int Xprop_fail_result;
381 ph10 123 int Xoclength;
382 ph10 836 pcre_uchar Xocchars[6];
383 nigel 77 #endif
384    
385 ph10 403 int Xcodelink;
386 nigel 77 int Xctype;
387 nigel 93 unsigned int Xfc;
388 nigel 77 int Xfi;
389     int Xlength;
390     int Xmax;
391     int Xmin;
392     int Xnumber;
393     int Xoffset;
394     int Xop;
395     int Xsave_capture_last;
396     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397     int Xstacksave[REC_STACK_SAVE_MAX];
398    
399     eptrblock Xnewptrb;
400    
401 ph10 164 /* Where to jump back to */
402 nigel 77
403 ph10 164 int Xwhere;
404 ph10 165
405 nigel 77 } heapframe;
406    
407     #endif
408    
409    
410     /***************************************************************************
411     ***************************************************************************/
412    
413    
414    
415     /*************************************************
416     * Match from current position *
417     *************************************************/
418    
419 nigel 93 /* This function is called recursively in many circumstances. Whenever it
420 nigel 77 returns a negative (error) response, the outer incarnation must also return the
421 ph10 426 same response. */
422 nigel 77
423 ph10 426 /* These macros pack up tests that are used for partial matching, and which
424 ph10 836 appear several times in the code. We set the "hit end" flag if the pointer is
425 ph10 426 at the end of the subject and also past the start of the subject (i.e.
426 ph10 427 something has been matched). For hard partial matching, we then return
427     immediately. The second one is used when we already know we are past the end of
428     the subject. */
429 ph10 426
/* CHECK_PARTIAL does nothing unless md->partial is non-zero, eptr is at or
beyond md->end_subject, and eptr is beyond md->start_used_ptr. When all three
hold, it sets md->hitend and, if md->partial is greater than 1, uses RRETURN to
give PCRE_ERROR_PARTIAL. The expansion is a bare "if" statement with a braced
body. */
430     #define CHECK_PARTIAL()\
431 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
432     eptr > md->start_used_ptr) \
433     { \
434     md->hitend = TRUE; \
435 ph10 836 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 ph10 427 }
437 ph10 426
/* SCHECK_PARTIAL is the same, except that it does not compare eptr with
md->end_subject. */
438     #define SCHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
440     { \
441     md->hitend = TRUE; \
442 ph10 836 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 ph10 427 }
444 ph10 426
445 ph10 427
446 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
447 ph10 836 the md structure (e.g. utf, end_subject) into individual variables to improve
448 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449     made performance worse.
450    
451     Arguments:
452 nigel 93 eptr pointer to current character in subject
453     ecode pointer to current position in compiled code
454 ph10 168 mstart pointer to the current match start position (can be modified
455 ph10 172 by encountering \K)
456 nigel 77 offset_top current top pointer
457     md pointer to "static" info for the match
458     eptrb pointer to chain of blocks containing eptr at start of
459     brackets - for testing for empty matches
460 nigel 87 rdepth the recursion depth
461 nigel 77
462     Returns: MATCH_MATCH if matched ) these values are >= 0
463     MATCH_NOMATCH if failed to match )
464 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 nigel 87 (e.g. stopped by repeated call or recursion limit)
467 nigel 77 */
468    
469     static int
470 ph10 836 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471     PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 ph10 835 unsigned int rdepth)
473 nigel 77 {
474     /* These variables do not need to be preserved over recursion in this function,
475 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
476     "register" because they are used a lot in loops. */
477 nigel 77
478 nigel 91 register int rrc; /* Returns from recursive calls */
479     register int i; /* Used for loops not involving calls to RMATCH() */
480 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 ph10 836 register BOOL utf; /* Local copy of UTF flag for speed */
482 nigel 77
483 nigel 93 BOOL minimize, possessive; /* Quantifier options */
484 ph10 602 BOOL caseless;
485 ph10 403 int condcode;
486 nigel 93
487 nigel 77 /* When recursion is not being used, all "local" variables that have to be
488     preserved over calls to RMATCH() are part of a "frame" which is obtained from
489     heap storage. Set up the top-level frame here; others are obtained from the
490     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491    
492     #ifdef NO_RECURSE
493 ph10 836 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
496    
497     /* Copy in the original argument variables */
498    
499     frame->Xeptr = eptr;
500     frame->Xecode = ecode;
501 ph10 168 frame->Xmstart = mstart;
502 nigel 77 frame->Xoffset_top = offset_top;
503     frame->Xeptrb = eptrb;
504 nigel 87 frame->Xrdepth = rdepth;
505 nigel 77
506     /* This is where control jumps back to in order to effect "recursion" */
507    
508     HEAP_RECURSE:
509    
510     /* Macros make the argument variables come from the current frame */
511    
512     #define eptr frame->Xeptr
513     #define ecode frame->Xecode
514 ph10 168 #define mstart frame->Xmstart
515 nigel 77 #define offset_top frame->Xoffset_top
516     #define eptrb frame->Xeptrb
517 nigel 87 #define rdepth frame->Xrdepth
518 nigel 77
519     /* Ditto for the local variables */
520    
521 ph10 836 #ifdef SUPPORT_UTF
522 nigel 77 #define charptr frame->Xcharptr
523     #endif
524     #define callpat frame->Xcallpat
525 ph10 403 #define codelink frame->Xcodelink
526 nigel 77 #define data frame->Xdata
527     #define next frame->Xnext
528     #define pp frame->Xpp
529     #define prev frame->Xprev
530     #define saved_eptr frame->Xsaved_eptr
531    
532     #define new_recursive frame->Xnew_recursive
533    
534     #define cur_is_word frame->Xcur_is_word
535     #define condition frame->Xcondition
536     #define prev_is_word frame->Xprev_is_word
537    
538     #ifdef SUPPORT_UCP
539     #define prop_type frame->Xprop_type
540 nigel 87 #define prop_value frame->Xprop_value
541 nigel 77 #define prop_fail_result frame->Xprop_fail_result
542 ph10 115 #define oclength frame->Xoclength
543     #define occhars frame->Xocchars
544 nigel 77 #endif
545    
546     #define ctype frame->Xctype
547     #define fc frame->Xfc
548     #define fi frame->Xfi
549     #define length frame->Xlength
550     #define max frame->Xmax
551     #define min frame->Xmin
552     #define number frame->Xnumber
553     #define offset frame->Xoffset
554     #define op frame->Xop
555     #define save_capture_last frame->Xsave_capture_last
556     #define save_offset1 frame->Xsave_offset1
557     #define save_offset2 frame->Xsave_offset2
558     #define save_offset3 frame->Xsave_offset3
559     #define stacksave frame->Xstacksave
560    
561     #define newptrb frame->Xnewptrb
562    
563     /* When recursion is being used, local variables are allocated on the stack and
564     get preserved during recursion in the normal way. In this environment, fi and
565     i, and fc and c, can be the same variables. */
566    
567 nigel 93 #else /* NO_RECURSE not defined */
568 nigel 77 #define fi i
569     #define fc c
570    
571 ph10 604 /* Many of the following variables are used only in small blocks of the code.
572     My normal style of coding would have declared them within each of those blocks.
573     However, in order to accommodate the version of this code that uses an external
574     "stack" implemented on the heap, it is easier to declare them all here, so the
575     declarations can be cut out in a block. The only declarations within blocks
576     below are for variables that do not have to be preserved over a recursive call
577     to RMATCH(). */
578 nigel 77
579 ph10 836 #ifdef SUPPORT_UTF
580     const pcre_uchar *charptr;
581 ph10 625 #endif
582 ph10 836 const pcre_uchar *callpat;
583     const pcre_uchar *data;
584     const pcre_uchar *next;
585     PCRE_PUCHAR pp;
586     const pcre_uchar *prev;
587     PCRE_PUCHAR saved_eptr;
588 ph10 625
589     recursion_info new_recursive;
590    
591     BOOL cur_is_word;
592 nigel 87 BOOL condition;
593 nigel 77 BOOL prev_is_word;
594    
595     #ifdef SUPPORT_UCP
596     int prop_type;
597 nigel 87 int prop_value;
598 nigel 77 int prop_fail_result;
599 ph10 115 int oclength;
600 ph10 836 pcre_uchar occhars[6];
601 nigel 77 #endif
602    
603 ph10 399 int codelink;
604 nigel 77 int ctype;
605     int length;
606     int max;
607     int min;
608     int number;
609     int offset;
610     int op;
611     int save_capture_last;
612     int save_offset1, save_offset2, save_offset3;
613     int stacksave[REC_STACK_SAVE_MAX];
614    
615     eptrblock newptrb;
616 nigel 93 #endif /* NO_RECURSE */
617 nigel 77
618 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
619     of the local variables that are used only in localised parts of the code, but
620     still need to be preserved over recursive calls of match(). These macros define
621 ph10 604 the alternative names that are used. */
622    
623     #define allow_zero cur_is_word
624     #define cbegroup condition
625     #define code_offset codelink
626     #define condassert condition
627     #define matched_once prev_is_word
628 ph10 836 #define foc number
629 ph10 604
630 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
631     variables. */
632    
633     #ifdef SUPPORT_UCP
634 nigel 87 prop_value = 0;
635 nigel 77 prop_fail_result = 0;
636     #endif
637    
638 nigel 93
639 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
640     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641     used. Thanks to Ian Taylor for noticing this possibility and sending the
642     original patch. */
643    
644     TAIL_RECURSE:
645    
646 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
647     are specified by the macro RMATCH and RRETURN is used to return. When
648     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
651     complicated macro. It has to be used in one particular way. This shouldn't,
652     however, impact performance when true recursion is being used. */
653 nigel 77
654 ph10 836 #ifdef SUPPORT_UTF
655     utf = md->utf; /* Local copy of the flag */
656 ph10 164 #else
657 ph10 836 utf = FALSE;
658 ph10 164 #endif
659    
660 nigel 87 /* First check that we haven't called match() too many times, or that we
661     haven't exceeded the recursive call limit. */
662    
663 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665 nigel 77
666 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
667 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668     done this way to save having to use another function argument, which would take
669 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
670 nigel 77
671 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672     such remembered pointers, to be checked when we hit the closing ket, in order
673     to break infinite loops that match no characters. When match() is called in
674     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675     NOT be used with tail recursion, because the memory block that is used is on
676     the stack, so a new one may be required for each match(). */
677    
678     if (md->match_function_type == MATCH_CBEGROUP)
679 nigel 77 {
680 ph10 197 newptrb.epb_saved_eptr = eptr;
681     newptrb.epb_prev = eptrb;
682     eptrb = &newptrb;
683 ph10 604 md->match_function_type = 0;
684 nigel 77 }
685    
686 nigel 93 /* Now start processing the opcodes. */
687 nigel 77
688     for (;;)
689     {
690 nigel 93 minimize = possessive = FALSE;
691 nigel 77 op = *ecode;
692 ph10 625
693 nigel 93 switch(op)
694     {
695 ph10 510 case OP_MARK:
696 ph10 836 md->nomatch_mark = ecode + 2;
697     md->mark = NULL; /* In case previously set by assertion */
698     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 ph10 604 eptrb, RM55);
700 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701     md->mark == NULL) md->mark = ecode + 2;
702 ph10 512
703     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704     argument, and we must check whether that argument matches this MARK's
705     argument. It is passed back in md->start_match_ptr (an overloading of that
706     variable). If it does match, we reset that variable to the current subject
707     position and return MATCH_SKIP. Otherwise, pass back the return code
708 ph10 510 unaltered. */
709 ph10 512
710 ph10 836 else if (rrc == MATCH_SKIP_ARG &&
711     STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 ph10 510 {
713     md->start_match_ptr = eptr;
714     RRETURN(MATCH_SKIP);
715     }
716     RRETURN(rrc);
717    
718 ph10 210 case OP_FAIL:
719 ph10 836 RRETURN(MATCH_NOMATCH);
720 ph10 211
721 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
722 ph10 553
723 ph10 510 case OP_COMMIT:
724 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 ph10 604 eptrb, RM52);
726 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728     rrc != MATCH_THEN)
729 ph10 551 RRETURN(rrc);
730 ph10 836 RRETURN(MATCH_COMMIT);
731 ph10 510
732 ph10 551 /* PRUNE overrides THEN */
733 ph10 553
734 ph10 210 case OP_PRUNE:
735 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 ph10 604 eptrb, RM51);
737 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 ph10 836 RRETURN(MATCH_PRUNE);
739 ph10 211
740 ph10 510 case OP_PRUNE_ARG:
741 ph10 836 md->nomatch_mark = ecode + 2;
742     md->mark = NULL; /* In case previously set by assertion */
743     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 ph10 604 eptrb, RM56);
745 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746     md->mark == NULL) md->mark = ecode + 2;
747 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 ph10 510 RRETURN(MATCH_PRUNE);
749 ph10 211
750 ph10 551 /* SKIP overrides PRUNE and THEN */
751 ph10 553
752 ph10 210 case OP_SKIP:
753 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 ph10 604 eptrb, RM53);
755 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 ph10 551 RRETURN(rrc);
757 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
758 ph10 836 RRETURN(MATCH_SKIP);
759 ph10 211
760 ph10 836 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761     nomatch_mark. There is a flag that disables this opcode when re-matching a
762     pattern that ended with a SKIP for which there was not a matching MARK. */
763    
764 ph10 510 case OP_SKIP_ARG:
765 ph10 836 if (md->ignore_skip_arg)
766     {
767     ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768     break;
769     }
770     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 ph10 604 eptrb, RM57);
772 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 ph10 551 RRETURN(rrc);
774 ph10 512
775     /* Pass back the current skip name by overloading md->start_match_ptr and
776     returning the special MATCH_SKIP_ARG return code. This will either be
777 ph10 836 caught by a matching MARK, or get to the top, where it causes a rematch
778     with the md->ignore_skip_arg flag set. */
779 ph10 512
780 ph10 510 md->start_match_ptr = ecode + 2;
781 ph10 512 RRETURN(MATCH_SKIP_ARG);
782 ph10 553
783 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784     the branch in which it occurs can be determined. Overload the start of
785     match pointer to do this. */
786 ph10 512
787 ph10 210 case OP_THEN:
788 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 ph10 604 eptrb, RM54);
790 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 ph10 716 md->start_match_ptr = ecode;
792 ph10 836 RRETURN(MATCH_THEN);
793 ph10 510
794     case OP_THEN_ARG:
795 ph10 836 md->nomatch_mark = ecode + 2;
796     md->mark = NULL; /* In case previously set by assertion */
797     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 ph10 716 md, eptrb, RM58);
799 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800     md->mark == NULL) md->mark = ecode + 2;
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 733 md->start_match_ptr = ecode;
803 ph10 212 RRETURN(MATCH_THEN);
804 ph10 733
805 ph10 723 /* Handle an atomic group that does not contain any capturing parentheses.
806 ph10 733 This can be handled like an assertion. Prior to 8.13, all atomic groups
807     were handled this way. In 8.13, the code was changed as below for ONCE, so
808     that backups pass through the group and thereby reset captured values.
809     However, this uses a lot more stack, so in 8.20, atomic groups that do not
810     contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 ph10 723 less stack intensive way.
812 ph10 211
813 ph10 723 Check the alternative branches in turn - the matching won't pass the KET
814     for this kind of subpattern. If any one branch matches, we carry on as at
815     the end of a normal bracket, leaving the subject pointer, but resetting
816     the start-of-match value in case it was changed by \K. */
817    
818     case OP_ONCE_NC:
819     prev = ecode;
820     saved_eptr = eptr;
821     do
822     {
823     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824     if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825     {
826     mstart = md->start_match_ptr;
827     break;
828     }
829     if (rrc == MATCH_THEN)
830     {
831     next = ecode + GET(ecode,1);
832 ph10 733 if (md->start_match_ptr < next &&
833 ph10 723 (*ecode == OP_ALT || *next == OP_ALT))
834     rrc = MATCH_NOMATCH;
835 ph10 733 }
836    
837 ph10 723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838     ecode += GET(ecode,1);
839     }
840     while (*ecode == OP_ALT);
841    
842     /* If we hit the end of the group (which could be repeated), fail */
843    
844     if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845    
846     /* Continue as from after the group, updating the offsets high water
847     mark, since extracts may have been taken. */
848    
849     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850    
851     offset_top = md->end_offset_top;
852     eptr = md->end_match_ptr;
853    
854     /* For a non-repeating ket, just continue at this level. This also
855     happens for a repeating ket if no characters were matched in the group.
856     This is the forcible breaking of infinite loops as implemented in Perl
857     5.005. */
858    
859     if (*ecode == OP_KET || eptr == saved_eptr)
860     {
861     ecode += 1+LINK_SIZE;
862     break;
863     }
864    
865     /* The repeating kets try the rest of the pattern or restart from the
866     preceding bracket, in the appropriate order. The second "call" of match()
867     uses tail recursion, to avoid using another stack frame. */
868    
869     if (*ecode == OP_KETRMIN)
870     {
871     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873     ecode = prev;
874     goto TAIL_RECURSE;
875     }
876     else /* OP_KETRMAX */
877     {
878 ph10 733 md->match_function_type = MATCH_CBEGROUP;
879 ph10 723 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881     ecode += 1 + LINK_SIZE;
882     goto TAIL_RECURSE;
883     }
884     /* Control never gets here */
885    
886 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
887     unlimited repeat. If there is space in the offset vector, save the current
888     subject position in the working slot at the top of the vector. We mustn't
889     change the current values of the data slot, because they may be set from a
890     previous iteration of this group, and be referred to by a reference inside
891 ph10 625 the group. A failure to match might occur after the group has succeeded,
892 ph10 617 if something later on doesn't match. For this reason, we need to restore
893     the working value and also the values of the final offsets, in case they
894     were set by a previous iteration of the same bracket.
895 nigel 77
896 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
897     a non-capturing bracket. Don't worry about setting the flag for the error
898     case here; that is handled in the code for KET. */
899 nigel 77
900 nigel 93 case OP_CBRA:
901     case OP_SCBRA:
902     number = GET2(ecode, 1+LINK_SIZE);
903 nigel 77 offset = number << 1;
904 ph10 625
905 ph10 475 #ifdef PCRE_DEBUG
906 nigel 93 printf("start bracket %d\n", number);
907     printf("subject=");
908 nigel 77 pchars(eptr, 16, TRUE, md);
909     printf("\n");
910     #endif
911    
912     if (offset < md->offset_max)
913     {
914     save_offset1 = md->offset_vector[offset];
915     save_offset2 = md->offset_vector[offset+1];
916     save_offset3 = md->offset_vector[md->offset_end - number];
917     save_capture_last = md->capture_last;
918    
919     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 ph10 531 md->offset_vector[md->offset_end - number] =
921 ph10 530 (int)(eptr - md->start_subject);
922 nigel 77
923 ph10 604 for (;;)
924 nigel 77 {
925 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 ph10 604 eptrb, RM1);
928 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929 ph10 733
930     /* If we backed up to a THEN, check whether it is within the current
931     branch by comparing the address of the THEN that is passed back with
932 ph10 716 the end of the branch. If it is within the current branch, and the
933     branch is one of two or more alternatives (it either starts or ends
934 ph10 733 with OP_ALT), we have reached the limit of THEN's action, so convert
935     the return code to NOMATCH, which will cause normal backtracking to
936 ph10 716 happen from now on. Otherwise, THEN is passed back to an outer
937 ph10 733 alternative. This implements Perl's treatment of parenthesized groups,
938     where a group not containing | does not affect the current alternative,
939 ph10 716 that is, (X) is NOT the same as (X|(*F)). */
940    
941     if (rrc == MATCH_THEN)
942     {
943     next = ecode + GET(ecode,1);
944 ph10 733 if (md->start_match_ptr < next &&
945 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
946     rrc = MATCH_NOMATCH;
947 ph10 733 }
948    
949 ph10 716 /* Anything other than NOMATCH is passed back. */
950    
951     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 nigel 77 md->capture_last = save_capture_last;
953     ecode += GET(ecode, 1);
954 ph10 625 if (*ecode != OP_ALT) break;
955 nigel 77 }
956    
957     DPRINTF(("bracket %d failed\n", number));
958     md->offset_vector[offset] = save_offset1;
959     md->offset_vector[offset+1] = save_offset2;
960     md->offset_vector[md->offset_end - number] = save_offset3;
961 ph10 625
962 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963 nigel 77
964 ph10 716 RRETURN(rrc);
965 nigel 77 }
966    
967 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968     as a non-capturing bracket. */
969 nigel 77
970 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972    
973 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974 nigel 77
975 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977    
978 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
979 ph10 723 repeat and ONCE group with no captures. Loop for all the alternatives.
980 ph10 708
981 ph10 702 When we get to the final alternative within the brackets, we used to return
982     the result of a recursive call to match() whatever happened so it was
983     possible to reduce stack usage by turning this into a tail recursion,
984     except in the case of a possibly empty group. However, now that there is
985     the possibility of (*THEN) occurring in the final alternative, this
986     optimization is no longer always possible.
987 ph10 625
988 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
989     this is the best that can be done.
990    
991 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
992     reached, but subsequent matching fails. It passes back up the tree (causing
993     captured values to be reset) until the original atomic group level is
994 ph10 618 reached. This is tested by comparing md->once_target with the start of the
995     group. At this point, the return is converted into MATCH_NOMATCH so that
996     previous backup points can be taken. */
997 nigel 77
998 ph10 618 case OP_ONCE:
999 nigel 93 case OP_BRA:
1000     case OP_SBRA:
1001     DPRINTF(("start non-capturing bracket\n"));
1002 ph10 618
1003 nigel 91 for (;;)
1004 nigel 77 {
1005 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006 ph10 702
1007     /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 ph10 708 the pattern, and this is the final alternative, optimize as described
1009 ph10 702 above. */
1010    
1011     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012     {
1013 ph10 836 ecode += PRIV(OP_lengths)[*ecode];
1014 ph10 702 goto TAIL_RECURSE;
1015 ph10 708 }
1016 ph10 702
1017     /* In all other cases, we have to make another call to match(). */
1018    
1019 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 ph10 604 RM2);
1021 ph10 733
1022 ph10 716 /* See comment in the code for capturing groups above about handling
1023     THEN. */
1024    
1025     if (rrc == MATCH_THEN)
1026 ph10 625 {
1027 ph10 716 next = ecode + GET(ecode,1);
1028 ph10 733 if (md->start_match_ptr < next &&
1029 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1030     rrc = MATCH_NOMATCH;
1031 ph10 733 }
1032    
1033     if (rrc != MATCH_NOMATCH)
1034 ph10 716 {
1035 ph10 618 if (rrc == MATCH_ONCE)
1036     {
1037 ph10 836 const pcre_uchar *scode = ecode;
1038 ph10 618 if (*scode != OP_ONCE) /* If not at start, find it */
1039     {
1040     while (*scode == OP_ALT) scode += GET(scode, 1);
1041     scode -= GET(scode, 1);
1042 ph10 625 }
1043 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 ph10 625 }
1045 ph10 550 RRETURN(rrc);
1046 ph10 625 }
1047 nigel 77 ecode += GET(ecode, 1);
1048 ph10 625 if (*ecode != OP_ALT) break;
1049 nigel 77 }
1050 ph10 733
1051 ph10 609 RRETURN(MATCH_NOMATCH);
1052    
1053 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1055     handled similarly to the normal case above. However, the matching is
1056     different. The end of these brackets will always be OP_KETRPOS, which
1057     returns MATCH_KETRPOS without going further in the pattern. By this means
1058     we can handle the group by iteration rather than recursion, thereby
1059     reducing the amount of stack needed. */
1060 ph10 625
1061 ph10 604 case OP_CBRAPOS:
1062     case OP_SCBRAPOS:
1063     allow_zero = FALSE;
1064 ph10 625
1065 ph10 604 POSSESSIVE_CAPTURE:
1066     number = GET2(ecode, 1+LINK_SIZE);
1067     offset = number << 1;
1068    
1069     #ifdef PCRE_DEBUG
1070     printf("start possessive bracket %d\n", number);
1071     printf("subject=");
1072     pchars(eptr, 16, TRUE, md);
1073     printf("\n");
1074     #endif
1075    
1076     if (offset < md->offset_max)
1077     {
1078     matched_once = FALSE;
1079 ph10 836 code_offset = (int)(ecode - md->start_code);
1080 ph10 604
1081     save_offset1 = md->offset_vector[offset];
1082     save_offset2 = md->offset_vector[offset+1];
1083     save_offset3 = md->offset_vector[md->offset_end - number];
1084     save_capture_last = md->capture_last;
1085    
1086     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087 ph10 625
1088     /* Each time round the loop, save the current subject position for use
1089     when the group matches. For MATCH_MATCH, the group has matched, so we
1090     restart it with a new subject starting position, remembering that we had
1091     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092     usual. If we haven't matched any alternatives in any iteration, check to
1093     see if a previous iteration matched. If so, the group has matched;
1094     continue from afterwards. Otherwise it has failed; restore the previous
1095 ph10 604 capture values before returning NOMATCH. */
1096 ph10 625
1097 ph10 604 for (;;)
1098     {
1099     md->offset_vector[md->offset_end - number] =
1100     (int)(eptr - md->start_subject);
1101 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 ph10 604 eptrb, RM63);
1104     if (rrc == MATCH_KETRPOS)
1105     {
1106     offset_top = md->end_offset_top;
1107     eptr = md->end_match_ptr;
1108 ph10 625 ecode = md->start_code + code_offset;
1109 ph10 604 save_capture_last = md->capture_last;
1110 ph10 625 matched_once = TRUE;
1111     continue;
1112     }
1113 ph10 733
1114 ph10 716 /* See comment in the code for capturing groups above about handling
1115     THEN. */
1116    
1117     if (rrc == MATCH_THEN)
1118     {
1119     next = ecode + GET(ecode,1);
1120 ph10 733 if (md->start_match_ptr < next &&
1121 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1122     rrc = MATCH_NOMATCH;
1123 ph10 733 }
1124 ph10 716
1125     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 ph10 604 md->capture_last = save_capture_last;
1127     ecode += GET(ecode, 1);
1128 ph10 625 if (*ecode != OP_ALT) break;
1129 ph10 604 }
1130 ph10 610
1131 ph10 604 if (!matched_once)
1132 ph10 625 {
1133 ph10 604 md->offset_vector[offset] = save_offset1;
1134     md->offset_vector[offset+1] = save_offset2;
1135     md->offset_vector[md->offset_end - number] = save_offset3;
1136     }
1137 ph10 625
1138 ph10 604 if (allow_zero || matched_once)
1139 ph10 625 {
1140 ph10 604 ecode += 1 + LINK_SIZE;
1141     break;
1142 ph10 625 }
1143    
1144 ph10 604 RRETURN(MATCH_NOMATCH);
1145     }
1146 ph10 625
1147 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148     as a non-capturing bracket. */
1149    
1150     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152    
1153     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154    
1155     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157    
1158 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1160     without the capturing complication. It is written out separately for speed
1161     and cleanliness. */
1162    
1163     case OP_BRAPOS:
1164     case OP_SBRAPOS:
1165 ph10 625 allow_zero = FALSE;
1166    
1167 ph10 604 POSSESSIVE_NON_CAPTURE:
1168     matched_once = FALSE;
1169 ph10 836 code_offset = (int)(ecode - md->start_code);
1170 ph10 604
1171     for (;;)
1172     {
1173 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 ph10 609 eptrb, RM48);
1176 ph10 604 if (rrc == MATCH_KETRPOS)
1177     {
1178 ph10 610 offset_top = md->end_offset_top;
1179 ph10 604 eptr = md->end_match_ptr;
1180 ph10 625 ecode = md->start_code + code_offset;
1181     matched_once = TRUE;
1182     continue;
1183     }
1184 ph10 733
1185 ph10 716 /* See comment in the code for capturing groups above about handling
1186     THEN. */
1187    
1188     if (rrc == MATCH_THEN)
1189     {
1190     next = ecode + GET(ecode,1);
1191 ph10 733 if (md->start_match_ptr < next &&
1192 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1193     rrc = MATCH_NOMATCH;
1194 ph10 733 }
1195 ph10 716
1196     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ph10 604 ecode += GET(ecode, 1);
1198 ph10 625 if (*ecode != OP_ALT) break;
1199 ph10 604 }
1200 ph10 625
1201     if (matched_once || allow_zero)
1202 ph10 604 {
1203     ecode += 1 + LINK_SIZE;
1204     break;
1205 ph10 625 }
1206 ph10 604 RRETURN(MATCH_NOMATCH);
1207    
1208     /* Control never reaches here. */
1209    
1210 nigel 77 /* Conditional group: compilation checked that there are no more than
1211     two branches. If the condition is false, skipping the first branch takes us
1212     past the end if there is only one branch, but that's OK because that is
1213 ph10 609 exactly what going to the ket would do. */
1214 nigel 77
1215     case OP_COND:
1216 nigel 93 case OP_SCOND:
1217 ph10 604 codelink = GET(ecode, 1);
1218 ph10 406
1219 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1220     inserted between OP_COND and an assertion condition. */
1221 ph10 392
1222 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223     {
1224 ph10 836 if (PUBL(callout) != NULL)
1225 ph10 381 {
1226     pcre_callout_block cb;
1227 ph10 645 cb.version = 2; /* Version 1 of the callout block */
1228 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1229     cb.offset_vector = md->offset_vector;
1230     cb.subject = (PCRE_SPTR)md->start_subject;
1231 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1232     cb.start_match = (int)(mstart - md->start_subject);
1233     cb.current_position = (int)(eptr - md->start_subject);
1234 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1235     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1236     cb.capture_top = offset_top/2;
1237     cb.capture_last = md->capture_last;
1238     cb.callout_data = md->callout_data;
1239 ph10 836 cb.mark = md->nomatch_mark;
1240     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1241 ph10 381 if (rrc < 0) RRETURN(rrc);
1242     }
1243 ph10 836 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1244 ph10 381 }
1245 ph10 392
1246 ph10 399 condcode = ecode[LINK_SIZE+1];
1247 ph10 406
1248 ph10 381 /* Now see what the actual condition is */
1249 ph10 392
1250 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1251 nigel 77 {
1252 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1253     {
1254 ph10 461 condition = FALSE;
1255     ecode += GET(ecode, 1);
1256     }
1257 ph10 459 else
1258 ph10 461 {
1259 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1260 ph10 751 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1261 ph10 461
1262 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1263     false, but the test was set up by name, scan the table to see if the
1264     name refers to any other numbers, and test them. The condition is true
1265     if any one is set. */
1266 ph10 461
1267 ph10 751 if (!condition && condcode == OP_NRREF)
1268 ph10 459 {
1269 ph10 836 pcre_uchar *slotA = md->name_table;
1270 ph10 459 for (i = 0; i < md->name_count; i++)
1271 ph10 461 {
1272     if (GET2(slotA, 0) == recno) break;
1273 ph10 459 slotA += md->name_entry_size;
1274     }
1275 ph10 461
1276 ph10 459 /* Found a name for the number - there can be only one; duplicate
1277     names for different numbers are allowed, but not vice versa. First
1278     scan down for duplicates. */
1279 ph10 461
1280 ph10 459 if (i < md->name_count)
1281 ph10 461 {
1282 ph10 836 pcre_uchar *slotB = slotA;
1283 ph10 459 while (slotB > md->name_table)
1284     {
1285     slotB -= md->name_entry_size;
1286 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1287 ph10 459 {
1288     condition = GET2(slotB, 0) == md->recursive->group_num;
1289 ph10 461 if (condition) break;
1290     }
1291 ph10 459 else break;
1292 ph10 461 }
1293    
1294 ph10 459 /* Scan up for duplicates */
1295 ph10 461
1296 ph10 459 if (!condition)
1297 ph10 461 {
1298 ph10 459 slotB = slotA;
1299     for (i++; i < md->name_count; i++)
1300     {
1301     slotB += md->name_entry_size;
1302 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1303 ph10 459 {
1304     condition = GET2(slotB, 0) == md->recursive->group_num;
1305     if (condition) break;
1306 ph10 461 }
1307 ph10 459 else break;
1308 ph10 461 }
1309     }
1310 ph10 459 }
1311 ph10 461 }
1312    
1313 ph10 459 /* Choose branch according to the condition */
1314 ph10 461
1315 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1316 ph10 459 }
1317 ph10 461 }
1318 nigel 93
1319 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1320 nigel 93 {
1321 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1322 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1323 ph10 461
1324 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1325 ph10 461 scan the table to see if the name refers to any other numbers, and test
1326     them. The condition is true if any one is set. This is tediously similar
1327     to the code above, but not close enough to try to amalgamate. */
1328    
1329 ph10 459 if (!condition && condcode == OP_NCREF)
1330     {
1331 ph10 461 int refno = offset >> 1;
1332 ph10 836 pcre_uchar *slotA = md->name_table;
1333 ph10 461
1334 ph10 459 for (i = 0; i < md->name_count; i++)
1335 ph10 461 {
1336     if (GET2(slotA, 0) == refno) break;
1337 ph10 459 slotA += md->name_entry_size;
1338     }
1339 ph10 461
1340     /* Found a name for the number - there can be only one; duplicate names
1341     for different numbers are allowed, but not vice versa. First scan down
1342 ph10 459 for duplicates. */
1343 ph10 461
1344 ph10 459 if (i < md->name_count)
1345 ph10 461 {
1346 ph10 836 pcre_uchar *slotB = slotA;
1347 ph10 459 while (slotB > md->name_table)
1348     {
1349     slotB -= md->name_entry_size;
1350 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1351 ph10 459 {
1352     offset = GET2(slotB, 0) << 1;
1353 ph10 461 condition = offset < offset_top &&
1354 ph10 459 md->offset_vector[offset] >= 0;
1355 ph10 461 if (condition) break;
1356     }
1357 ph10 459 else break;
1358 ph10 461 }
1359    
1360 ph10 459 /* Scan up for duplicates */
1361 ph10 461
1362 ph10 459 if (!condition)
1363 ph10 461 {
1364 ph10 459 slotB = slotA;
1365     for (i++; i < md->name_count; i++)
1366     {
1367     slotB += md->name_entry_size;
1368 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1369 ph10 459 {
1370     offset = GET2(slotB, 0) << 1;
1371 ph10 461 condition = offset < offset_top &&
1372 ph10 459 md->offset_vector[offset] >= 0;
1373 ph10 461 if (condition) break;
1374     }
1375 ph10 459 else break;
1376 ph10 461 }
1377     }
1378 ph10 459 }
1379 ph10 461 }
1380    
1381 ph10 459 /* Choose branch according to the condition */
1382    
1383 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1384 nigel 77 }
1385    
1386 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1387 nigel 93 {
1388     condition = FALSE;
1389     ecode += GET(ecode, 1);
1390     }
1391    
1392 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1393 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1394     an assertion. */
1395 nigel 77
1396     else
1397     {
1398 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1399 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1400 nigel 77 if (rrc == MATCH_MATCH)
1401     {
1402 ph10 619 if (md->end_offset_top > offset_top)
1403     offset_top = md->end_offset_top; /* Captures may have happened */
1404 nigel 93 condition = TRUE;
1405     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1406 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1407     }
1408 ph10 733
1409 ph10 716 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1410 ph10 733 assertion; it is therefore treated as NOMATCH. */
1411 ph10 716
1412 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 nigel 77 {
1414     RRETURN(rrc); /* Need braces because of following else */
1415     }
1416 nigel 93 else
1417     {
1418     condition = FALSE;
1419 ph10 399 ecode += codelink;
1420 nigel 93 }
1421     }
1422 nigel 91
1423 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1424     use tail recursion to avoid using another stack frame, except when there is
1425     unlimited repeat of a possibly empty group. In the latter case, a recursive
1426     call to match() is always required, unless the second alternative doesn't
1427     exist, in which case we can just plough on. Note that, for compatibility
1428     with Perl, the | in a conditional group is NOT treated as creating two
1429     alternatives. If a THEN is encountered in the branch, it propagates out to
1430     the enclosing alternative (unless nested in a deeper set of alternatives,
1431     of course). */
1432 nigel 91
1433 nigel 93 if (condition || *ecode == OP_ALT)
1434     {
1435 ph10 716 if (op != OP_SCOND)
1436 ph10 702 {
1437     ecode += 1 + LINK_SIZE;
1438     goto TAIL_RECURSE;
1439 ph10 708 }
1440 ph10 733
1441 ph10 716 md->match_function_type = MATCH_CBEGROUP;
1442 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1443     RRETURN(rrc);
1444 nigel 77 }
1445 ph10 708
1446 ph10 702 /* Condition false & no alternative; continue after the group. */
1447 ph10 708
1448 ph10 702 else
1449 nigel 93 {
1450     ecode += 1 + LINK_SIZE;
1451     }
1452     break;
1453 nigel 77
1454 ph10 461
1455 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1456     to close any currently open capturing brackets. */
1457 ph10 461
1458 ph10 447 case OP_CLOSE:
1459 ph10 461 number = GET2(ecode, 1);
1460 ph10 447 offset = number << 1;
1461 ph10 461
1462 ph10 475 #ifdef PCRE_DEBUG
1463 ph10 447 printf("end bracket %d at *ACCEPT", number);
1464     printf("\n");
1465     #endif
1466 nigel 77
1467 ph10 447 md->capture_last = number;
1468     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1469     {
1470     md->offset_vector[offset] =
1471     md->offset_vector[md->offset_end - number];
1472 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1473 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1474     }
1475 ph10 836 ecode += 1 + IMM2_SIZE;
1476 ph10 461 break;
1477 ph10 447
1478    
1479 ph10 619 /* End of the pattern, either real or forced. */
1480 nigel 77
1481 ph10 619 case OP_END:
1482 ph10 210 case OP_ACCEPT:
1483 ph10 625 case OP_ASSERT_ACCEPT:
1484    
1485 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1486     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1487 ph10 613 is set and we have matched at the start of the subject. In both cases,
1488     backtracking will then try other alternatives, if any. */
1489 ph10 443
1490 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1491 ph10 618 md->recursive == NULL &&
1492 ph10 619 (md->notempty ||
1493     (md->notempty_atstart &&
1494     mstart == md->start_subject + md->start_offset)))
1495 ph10 836 RRETURN(MATCH_NOMATCH);
1496 ph10 443
1497 ph10 442 /* Otherwise, we have a match. */
1498 ph10 625
1499 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1500     md->end_offset_top = offset_top; /* and how many extracts were taken */
1501 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1502 nigel 77
1503 ph10 512 /* For some reason, the macros don't work properly if an expression is
1504 ph10 836 given as the argument to RRETURN when the heap is in use. */
1505 ph10 512
1506     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1507 ph10 836 RRETURN(rrc);
1508 ph10 512
1509 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1510     matching won't pass the KET for an assertion. If any one branch matches,
1511     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1512     start of each branch to move the current point backwards, so the code at
1513 ph10 625 this level is identical to the lookahead case. When the assertion is part
1514     of a condition, we want to return immediately afterwards. The caller of
1515     this incarnation of the match() function will have set MATCH_CONDASSERT in
1516     md->match_function_type, and one of these opcodes will be the first opcode
1517     that is processed. We use a local variable that is preserved over calls to
1518 ph10 604 match() to remember this case. */
1519 nigel 77
1520     case OP_ASSERT:
1521     case OP_ASSERTBACK:
1522 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1523     {
1524     condassert = TRUE;
1525     md->match_function_type = 0;
1526     }
1527 ph10 625 else condassert = FALSE;
1528    
1529 nigel 77 do
1530     {
1531 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1532 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1533 ph10 500 {
1534     mstart = md->start_match_ptr; /* In case \K reset it */
1535     break;
1536 ph10 501 }
1537 ph10 733
1538     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1539 ph10 716 as NOMATCH. */
1540 ph10 733
1541 ph10 716 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1542 nigel 77 ecode += GET(ecode, 1);
1543     }
1544     while (*ecode == OP_ALT);
1545 ph10 625
1546 ph10 836 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1547 nigel 77
1548     /* If checking an assertion for a condition, return MATCH_MATCH. */
1549    
1550 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1551 nigel 77
1552     /* Continue from after the assertion, updating the offsets high water
1553     mark, since extracts may have been taken during the assertion. */
1554    
1555     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1556     ecode += 1 + LINK_SIZE;
1557     offset_top = md->end_offset_top;
1558     continue;
1559    
1560 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1561 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1562 ph10 473 branches. */
1563 nigel 77
1564     case OP_ASSERT_NOT:
1565     case OP_ASSERTBACK_NOT:
1566 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1567     {
1568     condassert = TRUE;
1569     md->match_function_type = 0;
1570     }
1571 ph10 625 else condassert = FALSE;
1572 ph10 604
1573 nigel 77 do
1574     {
1575 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1576 ph10 836 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1577 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1578     {
1579     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1580 ph10 482 break;
1581     }
1582 ph10 716
1583 ph10 733 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1584 ph10 716 as NOMATCH. */
1585    
1586     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1587 nigel 77 ecode += GET(ecode,1);
1588     }
1589     while (*ecode == OP_ALT);
1590    
1591 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1592 ph10 625
1593 nigel 77 ecode += 1 + LINK_SIZE;
1594     continue;
1595    
1596     /* Move the subject pointer back. This occurs only at the start of
1597     each branch of a lookbehind assertion. If we are too close to the start to
1598     move back, this match function fails. When working with UTF-8 we move
1599     back a number of characters, not bytes. */
1600    
1601     case OP_REVERSE:
1602 ph10 836 #ifdef SUPPORT_UTF
1603     if (utf)
1604 nigel 77 {
1605 nigel 93 i = GET(ecode, 1);
1606     while (i-- > 0)
1607 nigel 77 {
1608     eptr--;
1609 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1610 ph10 207 BACKCHAR(eptr);
1611 nigel 77 }
1612     }
1613     else
1614     #endif
1615    
1616     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1617    
1618     {
1619 nigel 93 eptr -= GET(ecode, 1);
1620 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1621 nigel 77 }
1622    
1623 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1624 nigel 77
1625 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1626 nigel 77 ecode += 1 + LINK_SIZE;
1627     break;
1628    
1629     /* The callout item calls an external function, if one is provided, passing
1630     details of the match so far. This is mainly for debugging, though the
1631     function is able to force a failure. */
1632    
1633     case OP_CALLOUT:
1634 ph10 836 if (PUBL(callout) != NULL)
1635 nigel 77 {
1636     pcre_callout_block cb;
1637 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1638 nigel 77 cb.callout_number = ecode[1];
1639     cb.offset_vector = md->offset_vector;
1640 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1641 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1642     cb.start_match = (int)(mstart - md->start_subject);
1643     cb.current_position = (int)(eptr - md->start_subject);
1644 nigel 77 cb.pattern_position = GET(ecode, 2);
1645     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1646     cb.capture_top = offset_top/2;
1647     cb.capture_last = md->capture_last;
1648     cb.callout_data = md->callout_data;
1649 ph10 836 cb.mark = md->nomatch_mark;
1650     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1651 nigel 77 if (rrc < 0) RRETURN(rrc);
1652     }
1653     ecode += 2 + 2*LINK_SIZE;
1654     break;
1655    
1656     /* Recursion either matches the current regex, or some subexpression. The
1657     offset data is the offset to the starting bracket from the start of the
1658     whole pattern. (This is so that it works from duplicated subpatterns.)
1659 ph10 625
1660 ph10 618 The state of the capturing groups is preserved over recursion, and
1661 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1662 ph10 618 finished (offset_top records the completed total) so we just have to save
1663     all the potential data. There may be up to 65535 such values, which is too
1664     large to put on the stack, but using malloc for small numbers seems
1665     expensive. As a compromise, the stack is used when there are no more than
1666     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1667 nigel 77
1668     There are also other values that have to be saved. We use a chained
1669     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1670 ph10 625 for the original version of this logic. It has, however, been hacked around
1671 ph10 618 a lot, so he is not to blame for the current way it works. */
1672 nigel 77
1673     case OP_RECURSE:
1674     {
1675 ph10 642 recursion_info *ri;
1676     int recno;
1677 ph10 654
1678 nigel 77 callpat = md->start_code + GET(ecode, 1);
1679 ph10 642 recno = (callpat == md->start_code)? 0 :
1680 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1681    
1682     /* Check for repeating a recursion without advancing the subject pointer.
1683 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1684 ph10 654 caught at compile time.) */
1685    
1686 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1687 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1688 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1689 nigel 77
1690     /* Add to "recursing stack" */
1691    
1692 ph10 642 new_recursive.group_num = recno;
1693     new_recursive.subject_position = eptr;
1694 nigel 77 new_recursive.prevrec = md->recursive;
1695     md->recursive = &new_recursive;
1696    
1697 ph10 618 /* Where to continue from afterwards */
1698 nigel 77
1699     ecode += 1 + LINK_SIZE;
1700    
1701 ph10 618 /* Now save the offset data */
1702 nigel 77
1703     new_recursive.saved_max = md->offset_end;
1704     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1705     new_recursive.offset_save = stacksave;
1706     else
1707     {
1708     new_recursive.offset_save =
1709 ph10 836 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1710 nigel 77 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1711     }
1712     memcpy(new_recursive.offset_save, md->offset_vector,
1713     new_recursive.saved_max * sizeof(int));
1714 ph10 625
1715 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1716 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1717 ph10 618 might be changed, so reset it before looping. */
1718 nigel 77
1719     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1720 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1721 nigel 77 do
1722     {
1723 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1724 ph10 836 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1725 ph10 604 md, eptrb, RM6);
1726 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1727     new_recursive.saved_max * sizeof(int));
1728 ph10 681 md->recursive = new_recursive.prevrec;
1729 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1730 nigel 77 {
1731 nigel 87 DPRINTF(("Recursion matched\n"));
1732 nigel 77 if (new_recursive.offset_save != stacksave)
1733 ph10 836 (PUBL(free))(new_recursive.offset_save);
1734 ph10 618
1735     /* Set where we got to in the subject, and reset the start in case
1736 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1737     for Perl compatibility. */
1738    
1739 ph10 618 eptr = md->end_match_ptr;
1740     mstart = md->start_match_ptr;
1741     goto RECURSION_MATCHED; /* Exit loop; end processing */
1742 nigel 77 }
1743 ph10 716
1744     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1745     as NOMATCH. */
1746    
1747 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1748 nigel 87 {
1749     DPRINTF(("Recursion gave error %d\n", rrc));
1750 ph10 400 if (new_recursive.offset_save != stacksave)
1751 ph10 836 (PUBL(free))(new_recursive.offset_save);
1752 nigel 87 RRETURN(rrc);
1753     }
1754 nigel 77
1755     md->recursive = &new_recursive;
1756     callpat += GET(callpat, 1);
1757     }
1758     while (*callpat == OP_ALT);
1759    
1760     DPRINTF(("Recursion didn't match\n"));
1761     md->recursive = new_recursive.prevrec;
1762     if (new_recursive.offset_save != stacksave)
1763 ph10 836 (PUBL(free))(new_recursive.offset_save);
1764     RRETURN(MATCH_NOMATCH);
1765 nigel 77 }
1766 ph10 625
1767 ph10 618 RECURSION_MATCHED:
1768     break;
1769 nigel 77
1770     /* An alternation is the end of a branch; scan along to find the end of the
1771     bracketed group and go to there. */
1772    
1773     case OP_ALT:
1774     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1775     break;
1776    
1777 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1778     indicating that it may occur zero times. It may repeat infinitely, or not
1779     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1780     with fixed upper repeat limits are compiled as a number of copies, with the
1781     optional ones preceded by BRAZERO or BRAMINZERO. */
1782 ph10 625
1783 nigel 77 case OP_BRAZERO:
1784 ph10 604 next = ecode + 1;
1785     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1786     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787     do next += GET(next, 1); while (*next == OP_ALT);
1788     ecode = next + 1 + LINK_SIZE;
1789 nigel 77 break;
1790 ph10 625
1791 nigel 77 case OP_BRAMINZERO:
1792 ph10 604 next = ecode + 1;
1793     do next += GET(next, 1); while (*next == OP_ALT);
1794     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1795     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796     ecode++;
1797 nigel 77 break;
1798    
1799 ph10 335 case OP_SKIPZERO:
1800 ph10 604 next = ecode+1;
1801     do next += GET(next,1); while (*next == OP_ALT);
1802     ecode = next + 1 + LINK_SIZE;
1803 ph10 335 break;
1804 ph10 625
1805 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1806     here; just jump to the group, with allow_zero set TRUE. */
1807 ph10 625
1808 ph10 604 case OP_BRAPOSZERO:
1809 ph10 625 op = *(++ecode);
1810 ph10 604 allow_zero = TRUE;
1811     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1812     goto POSSESSIVE_NON_CAPTURE;
1813 ph10 335
1814 nigel 93 /* End of a group, repeated or non-repeating. */
1815 nigel 77
1816     case OP_KET:
1817     case OP_KETRMIN:
1818     case OP_KETRMAX:
1819 ph10 625 case OP_KETRPOS:
1820 nigel 91 prev = ecode - GET(ecode, 1);
1821 ph10 625
1822 nigel 93 /* If this was a group that remembered the subject start, in order to break
1823     infinite repeats of empty string matches, retrieve the subject start from
1824     the chain. Otherwise, set it NULL. */
1825 nigel 77
1826 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1827 nigel 93 {
1828     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1829     eptrb = eptrb->epb_prev; /* Backup to previous group */
1830     }
1831     else saved_eptr = NULL;
1832 nigel 77
1833 ph10 733 /* If we are at the end of an assertion group or a non-capturing atomic
1834 ph10 723 group, stop matching and return MATCH_MATCH, but record the current high
1835     water mark for use by positive assertions. We also need to record the match
1836     start in case it was changed by \K. */
1837 nigel 93
1838 ph10 723 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1839 ph10 733 *prev == OP_ONCE_NC)
1840 nigel 91 {
1841 ph10 723 md->end_match_ptr = eptr; /* For ONCE_NC */
1842 nigel 91 md->end_offset_top = offset_top;
1843 ph10 500 md->start_match_ptr = mstart;
1844 ph10 836 RRETURN(MATCH_MATCH); /* Sets md->mark */
1845 nigel 91 }
1846 nigel 77
1847 nigel 93 /* For capturing groups we have to check the group number back at the start
1848     and if necessary complete handling an extraction by setting the offsets and
1849 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1850     into group 0, so it won't be picked up here. Instead, we catch it when the
1851     OP_END is reached. Other recursion is handled here. We just have to record
1852     the current subject position and start match pointer and give a MATCH
1853     return. */
1854 nigel 77
1855 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1856     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1857 nigel 91 {
1858 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1859 nigel 91 offset = number << 1;
1860 ph10 461
1861 ph10 475 #ifdef PCRE_DEBUG
1862 nigel 91 printf("end bracket %d", number);
1863     printf("\n");
1864 nigel 77 #endif
1865    
1866 ph10 618 /* Handle a recursively called group. */
1867    
1868     if (md->recursive != NULL && md->recursive->group_num == number)
1869     {
1870     md->end_match_ptr = eptr;
1871     md->start_match_ptr = mstart;
1872     RRETURN(MATCH_MATCH);
1873     }
1874    
1875     /* Deal with capturing */
1876    
1877 nigel 93 md->capture_last = number;
1878     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1879 nigel 91 {
1880 ph10 625 /* If offset is greater than offset_top, it means that we are
1881     "skipping" a capturing group, and that group's offsets must be marked
1882     unset. In earlier versions of PCRE, all the offsets were unset at the
1883     start of matching, but this doesn't work because atomic groups and
1884 ph10 615 assertions can cause a value to be set that should later be unset.
1885     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1886 ph10 625 part of the atomic group, but this is not on the final matching path,
1887     so must be unset when 2 is set. (If there is no group 2, there is no
1888 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1889 ph10 625
1890 ph10 615 if (offset > offset_top)
1891     {
1892     register int *iptr = md->offset_vector + offset_top;
1893     register int *iend = md->offset_vector + offset;
1894     while (iptr < iend) *iptr++ = -1;
1895 ph10 625 }
1896    
1897 ph10 615 /* Now make the extraction */
1898    
1899 nigel 93 md->offset_vector[offset] =
1900     md->offset_vector[md->offset_end - number];
1901 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1902 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1903     }
1904 nigel 91 }
1905 nigel 77
1906 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1907     also happens for a repeating ket if no characters were matched in the
1908     group. This is the forcible breaking of infinite loops as implemented in
1909 ph10 723 Perl 5.005. For a non-repeating atomic group that includes captures,
1910     establish a backup point by processing the rest of the pattern at a lower
1911     level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1912     original OP_ONCE level, thereby bypassing intermediate backup points, but
1913     resetting any captures that happened along the way. */
1914 nigel 77
1915 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1916     {
1917 ph10 618 if (*prev == OP_ONCE)
1918     {
1919     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1920     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1922 ph10 625 RRETURN(MATCH_ONCE);
1923     }
1924 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1925 nigel 91 break;
1926     }
1927 ph10 625
1928     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1929 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1930     at a time from the outer level, thus saving stack. */
1931 ph10 625
1932 ph10 604 if (*ecode == OP_KETRPOS)
1933 ph10 625 {
1934 ph10 604 md->end_match_ptr = eptr;
1935 ph10 625 md->end_offset_top = offset_top;
1936 ph10 604 RRETURN(MATCH_KETRPOS);
1937 ph10 625 }
1938 nigel 77
1939 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1940     the preceding bracket, in the appropriate order. In the second case, we can
1941     use tail recursion to avoid using another stack frame, unless we have an
1942 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1943     string. */
1944 nigel 77
1945 nigel 91 if (*ecode == OP_KETRMIN)
1946     {
1947 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1948 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 ph10 618 if (*prev == OP_ONCE)
1950     {
1951 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1952 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1953     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1954 ph10 625 RRETURN(MATCH_ONCE);
1955     }
1956 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1957 ph10 197 {
1958 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1959 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1960 ph10 197 RRETURN(rrc);
1961     }
1962 nigel 91 ecode = prev;
1963     goto TAIL_RECURSE;
1964 nigel 77 }
1965 nigel 91 else /* OP_KETRMAX */
1966     {
1967 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1968 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1969 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1970 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 ph10 618 if (*prev == OP_ONCE)
1972     {
1973 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1974 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1975     md->once_target = prev;
1976 ph10 625 RRETURN(MATCH_ONCE);
1977     }
1978 nigel 91 ecode += 1 + LINK_SIZE;
1979     goto TAIL_RECURSE;
1980     }
1981     /* Control never gets here */
1982 nigel 77
1983 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1984 nigel 77
1985     case OP_CIRC:
1986 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1987 ph10 625
1988 nigel 77 /* Start of subject assertion */
1989    
1990     case OP_SOD:
1991 ph10 836 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1992 nigel 77 ecode++;
1993     break;
1994 ph10 625
1995 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1996 nigel 77
1997 ph10 602 case OP_CIRCM:
1998 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1999 ph10 602 if (eptr != md->start_subject &&
2000     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2001 ph10 836 RRETURN(MATCH_NOMATCH);
2002 ph10 602 ecode++;
2003     break;
2004    
2005 nigel 77 /* Start of match assertion */
2006    
2007     case OP_SOM:
2008 ph10 836 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2009 nigel 77 ecode++;
2010     break;
2011 ph10 172
2012 ph10 168 /* Reset the start of match point */
2013 ph10 172
2014 ph10 168 case OP_SET_SOM:
2015     mstart = eptr;
2016 ph10 172 ecode++;
2017     break;
2018 nigel 77
2019 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
2020     unless noteol is set. */
2021 nigel 77
2022 ph10 602 case OP_DOLLM:
2023     if (eptr < md->end_subject)
2024 ph10 836 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2025 ph10 602 else
2026 nigel 77 {
2027 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2028 ph10 602 SCHECK_PARTIAL();
2029 nigel 77 }
2030 ph10 602 ecode++;
2031     break;
2032 ph10 579
2033 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
2034 ph10 602 subject unless noteol is set. */
2035    
2036     case OP_DOLL:
2037 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2038 ph10 602 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2039    
2040 nigel 91 /* ... else fall through for endonly */
2041 nigel 77
2042     /* End of subject assertion (\z) */
2043    
2044     case OP_EOD:
2045 ph10 836 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2046 ph10 553 SCHECK_PARTIAL();
2047 nigel 77 ecode++;
2048     break;
2049    
2050     /* End of subject or ending \n assertion (\Z) */
2051    
2052     case OP_EODN:
2053 ph10 553 ASSERT_NL_OR_EOS:
2054     if (eptr < md->end_subject &&
2055 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2056 ph10 836 RRETURN(MATCH_NOMATCH);
2057 ph10 579
2058 ph10 553 /* Either at end of string or \n before end. */
2059 ph10 579
2060 ph10 553 SCHECK_PARTIAL();
2061 nigel 77 ecode++;
2062     break;
2063    
2064     /* Word boundary assertions */
2065    
2066     case OP_NOT_WORD_BOUNDARY:
2067     case OP_WORD_BOUNDARY:
2068     {
2069    
2070     /* Find out if the previous and current characters are "word" characters.
2071     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2072 ph10 443 be "non-word" characters. Remember the earliest consulted character for
2073 ph10 435 partial matching. */
2074 nigel 77
2075 ph10 836 #ifdef SUPPORT_UTF
2076     if (utf)
2077 nigel 77 {
2078 ph10 518 /* Get status of previous character */
2079 ph10 527
2080 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
2081     {
2082 ph10 836 PCRE_PUCHAR lastptr = eptr - 1;
2083     BACKCHAR(lastptr);
2084 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2085 nigel 77 GETCHAR(c, lastptr);
2086 ph10 527 #ifdef SUPPORT_UCP
2087 ph10 518 if (md->use_ucp)
2088     {
2089     if (c == '_') prev_is_word = TRUE; else
2090 ph10 527 {
2091 ph10 518 int cat = UCD_CATEGORY(c);
2092     prev_is_word = (cat == ucp_L || cat == ucp_N);
2093 ph10 527 }
2094     }
2095     else
2096     #endif
2097 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2098     }
2099 ph10 527
2100 ph10 518 /* Get status of next character */
2101 ph10 527
2102 ph10 443 if (eptr >= md->end_subject)
2103 nigel 77 {
2104 ph10 443 SCHECK_PARTIAL();
2105     cur_is_word = FALSE;
2106 ph10 428 }
2107     else
2108     {
2109 nigel 77 GETCHAR(c, eptr);
2110 ph10 527 #ifdef SUPPORT_UCP
2111 ph10 518 if (md->use_ucp)
2112     {
2113     if (c == '_') cur_is_word = TRUE; else
2114 ph10 527 {
2115 ph10 518 int cat = UCD_CATEGORY(c);
2116     cur_is_word = (cat == ucp_L || cat == ucp_N);
2117 ph10 527 }
2118     }
2119     else
2120     #endif
2121 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2122     }
2123     }
2124     else
2125     #endif
2126    
2127 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2128 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2129 nigel 77
2130     {
2131 ph10 518 /* Get status of previous character */
2132 ph10 527
2133 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2134     {
2135 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2136 ph10 527 #ifdef SUPPORT_UCP
2137 ph10 518 if (md->use_ucp)
2138     {
2139 ph10 527 c = eptr[-1];
2140 ph10 518 if (c == '_') prev_is_word = TRUE; else
2141 ph10 527 {
2142 ph10 518 int cat = UCD_CATEGORY(c);
2143     prev_is_word = (cat == ucp_L || cat == ucp_N);
2144 ph10 527 }
2145     }
2146     else
2147     #endif
2148 ph10 836 prev_is_word = MAX_255(eptr[-1])
2149     && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2150 ph10 435 }
2151 ph10 527
2152 ph10 518 /* Get status of next character */
2153 ph10 527
2154 ph10 443 if (eptr >= md->end_subject)
2155 ph10 428 {
2156 ph10 443 SCHECK_PARTIAL();
2157     cur_is_word = FALSE;
2158 ph10 428 }
2159 ph10 527 else
2160     #ifdef SUPPORT_UCP
2161 ph10 518 if (md->use_ucp)
2162     {
2163 ph10 527 c = *eptr;
2164 ph10 518 if (c == '_') cur_is_word = TRUE; else
2165 ph10 527 {
2166 ph10 518 int cat = UCD_CATEGORY(c);
2167     cur_is_word = (cat == ucp_L || cat == ucp_N);
2168 ph10 527 }
2169     }
2170     else
2171     #endif
2172 ph10 836 cur_is_word = MAX_255(*eptr)
2173     && ((md->ctypes[*eptr] & ctype_word) != 0);
2174 nigel 77 }
2175    
2176     /* Now see if the situation is what we want */
2177    
2178     if ((*ecode++ == OP_WORD_BOUNDARY)?
2179     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2180 ph10 836 RRETURN(MATCH_NOMATCH);
2181 nigel 77 }
2182     break;
2183    
2184     /* Match a single character type; inline for speed */
2185    
2186     case OP_ANY:
2187 ph10 836 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2188 ph10 345 /* Fall through */
2189    
2190 ph10 341 case OP_ALLANY:
2191 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2192     { /* not be updated before SCHECK_PARTIAL. */
2193 ph10 443 SCHECK_PARTIAL();
2194 ph10 836 RRETURN(MATCH_NOMATCH);
2195 ph10 443 }
2196 ph10 648 eptr++;
2197 ph10 836 #ifdef SUPPORT_UTF
2198     if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2199     #endif
2200 nigel 77 ecode++;
2201     break;
2202    
2203     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2204     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2205    
2206     case OP_ANYBYTE:
2207 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2208     { /* not be updated before SCHECK_PARTIAL. */
2209 ph10 443 SCHECK_PARTIAL();
2210 ph10 836 RRETURN(MATCH_NOMATCH);
2211 ph10 443 }
2212 ph10 654 eptr++;
2213 nigel 77 ecode++;
2214     break;
2215    
2216     case OP_NOT_DIGIT:
2217 ph10 443 if (eptr >= md->end_subject)
2218 ph10 428 {
2219 ph10 443 SCHECK_PARTIAL();
2220 ph10 836 RRETURN(MATCH_NOMATCH);
2221 ph10 443 }
2222 nigel 77 GETCHARINCTEST(c, eptr);
2223     if (
2224 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2225 nigel 77 c < 256 &&
2226     #endif
2227     (md->ctypes[c] & ctype_digit) != 0
2228     )
2229 ph10 836 RRETURN(MATCH_NOMATCH);
2230 nigel 77 ecode++;
2231     break;
2232    
2233     case OP_DIGIT:
2234 ph10 443 if (eptr >= md->end_subject)
2235 ph10 428 {
2236 ph10 443 SCHECK_PARTIAL();
2237 ph10 836 RRETURN(MATCH_NOMATCH);
2238 ph10 443 }
2239 nigel 77 GETCHARINCTEST(c, eptr);
2240     if (
2241 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2242     c > 255 ||
2243 nigel 77 #endif
2244     (md->ctypes[c] & ctype_digit) == 0
2245     )
2246 ph10 836 RRETURN(MATCH_NOMATCH);
2247 nigel 77 ecode++;
2248     break;
2249    
2250     case OP_NOT_WHITESPACE:
2251 ph10 443 if (eptr >= md->end_subject)
2252 ph10 428 {
2253 ph10 443 SCHECK_PARTIAL();
2254 ph10 836 RRETURN(MATCH_NOMATCH);
2255 ph10 443 }
2256 nigel 77 GETCHARINCTEST(c, eptr);
2257     if (
2258 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2259 nigel 77 c < 256 &&
2260     #endif
2261     (md->ctypes[c] & ctype_space) != 0
2262     )
2263 ph10 836 RRETURN(MATCH_NOMATCH);
2264 nigel 77 ecode++;
2265     break;
2266    
2267     case OP_WHITESPACE:
2268 ph10 443 if (eptr >= md->end_subject)
2269 ph10 428 {
2270 ph10 443 SCHECK_PARTIAL();
2271 ph10 836 RRETURN(MATCH_NOMATCH);
2272 ph10 443 }
2273 nigel 77 GETCHARINCTEST(c, eptr);
2274     if (
2275 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2276     c > 255 ||
2277 nigel 77 #endif
2278     (md->ctypes[c] & ctype_space) == 0
2279     )
2280 ph10 836 RRETURN(MATCH_NOMATCH);
2281 nigel 77 ecode++;
2282     break;
2283    
2284     case OP_NOT_WORDCHAR:
2285 ph10 443 if (eptr >= md->end_subject)
2286 ph10 428 {
2287 ph10 443 SCHECK_PARTIAL();
2288 ph10 836 RRETURN(MATCH_NOMATCH);
2289 ph10 443 }
2290 nigel 77 GETCHARINCTEST(c, eptr);
2291     if (
2292 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2293 nigel 77 c < 256 &&
2294     #endif
2295     (md->ctypes[c] & ctype_word) != 0
2296     )
2297 ph10 836 RRETURN(MATCH_NOMATCH);
2298 nigel 77 ecode++;
2299     break;
2300    
2301     case OP_WORDCHAR:
2302 ph10 443 if (eptr >= md->end_subject)
2303 ph10 428 {
2304 ph10 443 SCHECK_PARTIAL();
2305 ph10 836 RRETURN(MATCH_NOMATCH);
2306 ph10 443 }
2307 nigel 77 GETCHARINCTEST(c, eptr);
2308     if (
2309 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2310     c > 255 ||
2311 nigel 77 #endif
2312     (md->ctypes[c] & ctype_word) == 0
2313     )
2314 ph10 836 RRETURN(MATCH_NOMATCH);
2315 nigel 77 ecode++;
2316     break;
2317    
2318 nigel 93 case OP_ANYNL:
2319 ph10 443 if (eptr >= md->end_subject)
2320 ph10 428 {
2321 ph10 443 SCHECK_PARTIAL();
2322 ph10 836 RRETURN(MATCH_NOMATCH);
2323 ph10 443 }
2324 nigel 93 GETCHARINCTEST(c, eptr);
2325     switch(c)
2326     {
2327 ph10 836 default: RRETURN(MATCH_NOMATCH);
2328 ph10 625
2329 nigel 93 case 0x000d:
2330     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2331     break;
2332 ph10 231
2333 nigel 93 case 0x000a:
2334 ph10 231 break;
2335    
2336 nigel 93 case 0x000b:
2337     case 0x000c:
2338     case 0x0085:
2339     case 0x2028:
2340     case 0x2029:
2341 ph10 836 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2342 nigel 93 break;
2343     }
2344     ecode++;
2345     break;
2346    
2347 ph10 178 case OP_NOT_HSPACE:
2348 ph10 443 if (eptr >= md->end_subject)
2349 ph10 428 {
2350 ph10 443 SCHECK_PARTIAL();
2351 ph10 836 RRETURN(MATCH_NOMATCH);
2352 ph10 443 }
2353 ph10 178 GETCHARINCTEST(c, eptr);
2354     switch(c)
2355     {
2356     default: break;
2357     case 0x09: /* HT */
2358     case 0x20: /* SPACE */
2359     case 0xa0: /* NBSP */
2360     case 0x1680: /* OGHAM SPACE MARK */
2361     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2362     case 0x2000: /* EN QUAD */
2363     case 0x2001: /* EM QUAD */
2364     case 0x2002: /* EN SPACE */
2365     case 0x2003: /* EM SPACE */
2366     case 0x2004: /* THREE-PER-EM SPACE */
2367     case 0x2005: /* FOUR-PER-EM SPACE */
2368     case 0x2006: /* SIX-PER-EM SPACE */
2369     case 0x2007: /* FIGURE SPACE */
2370     case 0x2008: /* PUNCTUATION SPACE */
2371     case 0x2009: /* THIN SPACE */
2372     case 0x200A: /* HAIR SPACE */
2373     case 0x202f: /* NARROW NO-BREAK SPACE */
2374     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2375     case 0x3000: /* IDEOGRAPHIC SPACE */
2376 ph10 836 RRETURN(MATCH_NOMATCH);
2377 ph10 178 }
2378     ecode++;
2379     break;
2380    
2381     case OP_HSPACE:
2382 ph10 443 if (eptr >= md->end_subject)
2383 ph10 428 {
2384 ph10 443 SCHECK_PARTIAL();
2385 ph10 836 RRETURN(MATCH_NOMATCH);
2386 ph10 443 }
2387 ph10 178 GETCHARINCTEST(c, eptr);
2388     switch(c)
2389     {
2390 ph10 836 default: RRETURN(MATCH_NOMATCH);
2391 ph10 178 case 0x09: /* HT */
2392     case 0x20: /* SPACE */
2393     case 0xa0: /* NBSP */
2394     case 0x1680: /* OGHAM SPACE MARK */
2395     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2396     case 0x2000: /* EN QUAD */
2397     case 0x2001: /* EM QUAD */
2398     case 0x2002: /* EN SPACE */
2399     case 0x2003: /* EM SPACE */
2400     case 0x2004: /* THREE-PER-EM SPACE */
2401     case 0x2005: /* FOUR-PER-EM SPACE */
2402     case 0x2006: /* SIX-PER-EM SPACE */
2403     case 0x2007: /* FIGURE SPACE */
2404     case 0x2008: /* PUNCTUATION SPACE */
2405     case 0x2009: /* THIN SPACE */
2406     case 0x200A: /* HAIR SPACE */
2407     case 0x202f: /* NARROW NO-BREAK SPACE */
2408     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2409     case 0x3000: /* IDEOGRAPHIC SPACE */
2410     break;
2411     }
2412     ecode++;
2413     break;
2414    
2415     case OP_NOT_VSPACE:
2416 ph10 443 if (eptr >= md->end_subject)
2417 ph10 428 {
2418 ph10 443 SCHECK_PARTIAL();
2419 ph10 836 RRETURN(MATCH_NOMATCH);
2420 ph10 443 }
2421 ph10 178 GETCHARINCTEST(c, eptr);
2422     switch(c)
2423     {
2424     default: break;
2425     case 0x0a: /* LF */
2426     case 0x0b: /* VT */
2427     case 0x0c: /* FF */
2428     case 0x0d: /* CR */
2429     case 0x85: /* NEL */
2430     case 0x2028: /* LINE SEPARATOR */
2431     case 0x2029: /* PARAGRAPH SEPARATOR */
2432 ph10 836 RRETURN(MATCH_NOMATCH);
2433 ph10 178 }
2434     ecode++;
2435     break;
2436    
2437     case OP_VSPACE:
2438 ph10 443 if (eptr >= md->end_subject)
2439 ph10 428 {
2440 ph10 443 SCHECK_PARTIAL();
2441 ph10 836 RRETURN(MATCH_NOMATCH);
2442 ph10 443 }
2443 ph10 178 GETCHARINCTEST(c, eptr);
2444     switch(c)
2445     {
2446 ph10 836 default: RRETURN(MATCH_NOMATCH);
2447 ph10 178 case 0x0a: /* LF */
2448     case 0x0b: /* VT */
2449     case 0x0c: /* FF */
2450     case 0x0d: /* CR */
2451     case 0x85: /* NEL */
2452     case 0x2028: /* LINE SEPARATOR */
2453     case 0x2029: /* PARAGRAPH SEPARATOR */
2454     break;
2455     }
2456     ecode++;
2457     break;
2458    
2459 nigel 77 #ifdef SUPPORT_UCP
2460     /* Check the next character by Unicode property. We will get here only
2461     if the support is in the binary; otherwise a compile-time error occurs. */
2462    
2463     case OP_PROP:
2464     case OP_NOTPROP:
2465 ph10 443 if (eptr >= md->end_subject)
2466 ph10 428 {
2467 ph10 443 SCHECK_PARTIAL();
2468 ph10 836 RRETURN(MATCH_NOMATCH);
2469 ph10 443 }
2470 nigel 77 GETCHARINCTEST(c, eptr);
2471     {
2472 ph10 384 const ucd_record *prop = GET_UCD(c);
2473 nigel 77
2474 nigel 87 switch(ecode[1])
2475     {
2476     case PT_ANY:
2477 ph10 836 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2478 nigel 87 break;
2479 nigel 77
2480 nigel 87 case PT_LAMP:
2481 ph10 349 if ((prop->chartype == ucp_Lu ||
2482     prop->chartype == ucp_Ll ||
2483     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2484 ph10 836 RRETURN(MATCH_NOMATCH);
2485 ph10 517 break;
2486 nigel 87
2487     case PT_GC:
2488 ph10 836 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2489     RRETURN(MATCH_NOMATCH);
2490 nigel 87 break;
2491    
2492     case PT_PC:
2493 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2494 ph10 836 RRETURN(MATCH_NOMATCH);
2495 nigel 87 break;
2496    
2497     case PT_SC:
2498 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2499 ph10 836 RRETURN(MATCH_NOMATCH);
2500 nigel 87 break;
2501 ph10 527
2502 ph10 517 /* These are specials */
2503 ph10 527
2504 ph10 517 case PT_ALNUM:
2505 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2506     PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2507     RRETURN(MATCH_NOMATCH);
2508 ph10 527 break;
2509    
2510 ph10 517 case PT_SPACE: /* Perl space */
2511 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2513     == (op == OP_NOTPROP))
2514 ph10 836 RRETURN(MATCH_NOMATCH);
2515 ph10 527 break;
2516    
2517 ph10 517 case PT_PXSPACE: /* POSIX space */
2518 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2519 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2520 ph10 517 c == CHAR_FF || c == CHAR_CR)
2521     == (op == OP_NOTPROP))
2522 ph10 836 RRETURN(MATCH_NOMATCH);
2523 ph10 527 break;
2524 nigel 87
2525 ph10 527 case PT_WORD:
2526 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2527     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2528 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2529 ph10 836 RRETURN(MATCH_NOMATCH);
2530 ph10 527 break;
2531    
2532 ph10 517 /* This should never occur */
2533    
2534 nigel 87 default:
2535     RRETURN(PCRE_ERROR_INTERNAL);
2536 nigel 77 }
2537 nigel 87
2538     ecode += 3;
2539 nigel 77 }
2540     break;
2541    
2542     /* Match an extended Unicode sequence. We will get here only if the support
2543     is in the binary; otherwise a compile-time error occurs. */
2544    
2545     case OP_EXTUNI:
2546 ph10 443 if (eptr >= md->end_subject)
2547 ph10 428 {
2548 ph10 443 SCHECK_PARTIAL();
2549 ph10 836 RRETURN(MATCH_NOMATCH);
2550 ph10 443 }
2551 nigel 77 GETCHARINCTEST(c, eptr);
2552 ph10 836 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2553 ph10 623 while (eptr < md->end_subject)
2554 nigel 77 {
2555 ph10 623 int len = 1;
2556 ph10 836 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2557 ph10 623 if (UCD_CATEGORY(c) != ucp_M) break;
2558     eptr += len;
2559 nigel 77 }
2560     ecode++;
2561     break;
2562     #endif
2563    
2564    
2565     /* Match a back reference, possibly repeatedly. Look past the end of the
2566     item to see if there is repeat information following. The code is similar
2567     to that for character classes, but repeated for efficiency. Then obey
2568     similar code to character type repeats - written out again for speed.
2569     However, if the referenced string is the empty string, always treat
2570     it as matched, any number of times (otherwise there could be infinite
2571     loops). */
2572    
2573     case OP_REF:
2574 ph10 625 case OP_REFI:
2575     caseless = op == OP_REFI;
2576 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2577 ph10 836 ecode += 1 + IMM2_SIZE;
2578 ph10 345
2579 ph10 595 /* If the reference is unset, there are two possibilities:
2580 ph10 345
2581 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2582     this ensures that every attempt at a match fails. We can't just fail
2583     here, because of the possibility of quantifiers with zero minima.
2584 ph10 345
2585 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2586     so that the back reference matches an empty string.
2587 ph10 345
2588 ph10 595 Otherwise, set the length to the length of what was matched by the
2589     referenced subpattern. */
2590 ph10 345
2591 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2592     length = (md->jscript_compat)? 0 : -1;
2593     else
2594     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2595 nigel 77
2596 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2597 nigel 77
2598 ph10 595 switch (*ecode)
2599     {
2600     case OP_CRSTAR:
2601     case OP_CRMINSTAR:
2602     case OP_CRPLUS:
2603     case OP_CRMINPLUS:
2604     case OP_CRQUERY:
2605     case OP_CRMINQUERY:
2606     c = *ecode++ - OP_CRSTAR;
2607     minimize = (c & 1) != 0;
2608     min = rep_min[c]; /* Pick up values from tables; */
2609     max = rep_max[c]; /* zero for max => infinity */
2610     if (max == 0) max = INT_MAX;
2611     break;
2612 nigel 77
2613 ph10 595 case OP_CRRANGE:
2614     case OP_CRMINRANGE:
2615     minimize = (*ecode == OP_CRMINRANGE);
2616     min = GET2(ecode, 1);
2617 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2618 ph10 595 if (max == 0) max = INT_MAX;
2619 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2620 ph10 595 break;
2621 nigel 77
2622 ph10 595 default: /* No repeat follows */
2623 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2624 ph10 595 {
2625     CHECK_PARTIAL();
2626 ph10 836 RRETURN(MATCH_NOMATCH);
2627 nigel 77 }
2628 ph10 595 eptr += length;
2629     continue; /* With the main loop */
2630     }
2631 nigel 77
2632 ph10 595 /* Handle repeated back references. If the length of the reference is
2633 ph10 836 zero, just continue with the main loop. If the length is negative, it
2634     means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2635     zero, we can continue at the same level without recursion. For any other
2636     minimum, carrying on will result in NOMATCH. */
2637 ph10 443
2638 ph10 595 if (length == 0) continue;
2639 ph10 836 if (length < 0 && min == 0) continue;
2640 nigel 77
2641 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2642     the length of the reference string explicitly rather than passing the
2643     address of eptr, so that eptr can be a register variable. */
2644 nigel 77
2645 ph10 595 for (i = 1; i <= min; i++)
2646     {
2647 ph10 625 int slength;
2648 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2649 nigel 77 {
2650 ph10 595 CHECK_PARTIAL();
2651 ph10 836 RRETURN(MATCH_NOMATCH);
2652 nigel 77 }
2653 ph10 595 eptr += slength;
2654     }
2655 nigel 77
2656 ph10 595 /* If min = max, continue at the same level without recursion.
2657     They are not both allowed to be zero. */
2658 nigel 77
2659 ph10 595 if (min == max) continue;
2660 nigel 77
2661 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2662 nigel 77
2663 ph10 595 if (minimize)
2664     {
2665     for (fi = min;; fi++)
2666 nigel 77 {
2667 ph10 625 int slength;
2668 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2669 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2670 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2671 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2672 nigel 77 {
2673 ph10 595 CHECK_PARTIAL();
2674 ph10 836 RRETURN(MATCH_NOMATCH);
2675 nigel 77 }
2676 ph10 595 eptr += slength;
2677 nigel 77 }
2678 ph10 595 /* Control never gets here */
2679     }
2680 nigel 77
2681 ph10 595 /* If maximizing, find the longest string and work backwards */
2682 nigel 77
2683 ph10 595 else
2684     {
2685     pp = eptr;
2686     for (i = min; i < max; i++)
2687 nigel 77 {
2688 ph10 625 int slength;
2689 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2690 nigel 77 {
2691 ph10 595 CHECK_PARTIAL();
2692     break;
2693 nigel 77 }
2694 ph10 595 eptr += slength;
2695 nigel 77 }
2696 ph10 595 while (eptr >= pp)
2697     {
2698 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2699 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700     eptr -= length;
2701     }
2702 ph10 836 RRETURN(MATCH_NOMATCH);
2703 nigel 77 }
2704     /* Control never gets here */
2705    
2706     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2707     used when all the characters in the class have values in the range 0-255,
2708     and either the matching is caseful, or the characters are in the range
2709     0-127 when UTF-8 processing is enabled. The only difference between
2710     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2711     encountered.
2712    
2713     First, look past the end of the item to see if there is repeat information
2714     following. Then obey similar code to character type repeats - written out
2715     again for speed. */
2716    
2717     case OP_NCLASS:
2718     case OP_CLASS:
2719     {
2720 ph10 836 /* The data variable is saved across frames, so the byte map needs to
2721     be stored there. */
2722     #define BYTE_MAP ((pcre_uint8 *)data)
2723 nigel 77 data = ecode + 1; /* Save for matching */
2724 ph10 836 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2725 nigel 77
2726     switch (*ecode)
2727     {
2728     case OP_CRSTAR:
2729     case OP_CRMINSTAR:
2730     case OP_CRPLUS:
2731     case OP_CRMINPLUS:
2732     case OP_CRQUERY:
2733     case OP_CRMINQUERY:
2734     c = *ecode++ - OP_CRSTAR;
2735     minimize = (c & 1) != 0;
2736     min = rep_min[c]; /* Pick up values from tables; */
2737     max = rep_max[c]; /* zero for max => infinity */
2738     if (max == 0) max = INT_MAX;
2739     break;
2740    
2741     case OP_CRRANGE:
2742     case OP_CRMINRANGE:
2743     minimize = (*ecode == OP_CRMINRANGE);
2744     min = GET2(ecode, 1);
2745 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2746 nigel 77 if (max == 0) max = INT_MAX;
2747 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2748 nigel 77 break;
2749    
2750     default: /* No repeat follows */
2751     min = max = 1;
2752     break;
2753     }
2754    
2755     /* First, ensure the minimum number of matches are present. */
2756    
2757 ph10 836 #ifdef SUPPORT_UTF
2758     if (utf)
2759 nigel 77 {
2760     for (i = 1; i <= min; i++)
2761     {
2762 ph10 427 if (eptr >= md->end_subject)
2763 ph10 426 {
2764 ph10 428 SCHECK_PARTIAL();
2765 ph10 836 RRETURN(MATCH_NOMATCH);
2766 ph10 427 }
2767 nigel 77 GETCHARINC(c, eptr);
2768     if (c > 255)
2769     {
2770 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2771 nigel 77 }
2772     else
2773 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2774 nigel 77 }
2775     }
2776     else
2777     #endif
2778 ph10 836 /* Not UTF mode */
2779 nigel 77 {
2780     for (i = 1; i <= min; i++)
2781     {
2782 ph10 427 if (eptr >= md->end_subject)
2783 ph10 426 {
2784 ph10 428 SCHECK_PARTIAL();
2785 ph10 836 RRETURN(MATCH_NOMATCH);
2786 ph10 427 }
2787 nigel 77 c = *eptr++;
2788 ph10 836 #ifndef COMPILE_PCRE8
2789     if (c > 255)
2790     {
2791     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2792     }
2793     else
2794     #endif
2795     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2796 nigel 77 }
2797     }
2798    
2799     /* If max == min we can continue with the main loop without the
2800     need to recurse. */
2801    
2802     if (min == max) continue;
2803    
2804     /* If minimizing, keep testing the rest of the expression and advancing
2805     the pointer while it matches the class. */
2806    
2807     if (minimize)
2808     {
2809 ph10 836 #ifdef SUPPORT_UTF
2810     if (utf)
2811 nigel 77 {
2812     for (fi = min;; fi++)
2813     {
2814 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2815 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2817 ph10 427 if (eptr >= md->end_subject)
2818 ph10 426 {
2819 ph10 427 SCHECK_PARTIAL();
2820 ph10 836 RRETURN(MATCH_NOMATCH);
2821 ph10 427 }
2822 nigel 77 GETCHARINC(c, eptr);
2823     if (c > 255)
2824     {
2825 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2826 nigel 77 }
2827     else
2828 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2829 nigel 77 }
2830     }
2831     else
2832     #endif
2833 ph10 836 /* Not UTF mode */
2834 nigel 77 {
2835     for (fi = min;; fi++)
2836     {
2837 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2838 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2839 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2840 ph10 427 if (eptr >= md->end_subject)
2841 ph10 426 {
2842 ph10 427 SCHECK_PARTIAL();
2843 ph10 836 RRETURN(MATCH_NOMATCH);
2844 ph10 427 }
2845 nigel 77 c = *eptr++;
2846 ph10 836 #ifndef COMPILE_PCRE8
2847     if (c > 255)
2848     {
2849     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2850     }
2851     else
2852     #endif
2853     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2854 nigel 77 }
2855     }
2856     /* Control never gets here */
2857     }
2858    
2859     /* If maximizing, find the longest possible run, then work backwards. */
2860    
2861     else
2862     {
2863     pp = eptr;
2864    
2865 ph10 836 #ifdef SUPPORT_UTF
2866     if (utf)
2867 nigel 77 {
2868     for (i = min; i < max; i++)
2869     {
2870     int len = 1;
2871 ph10 463 if (eptr >= md->end_subject)
2872 ph10 462 {
2873 ph10 463 SCHECK_PARTIAL();
2874 ph10 462 break;
2875 ph10 463 }
2876 nigel 77 GETCHARLEN(c, eptr, len);
2877     if (c > 255)
2878     {
2879     if (op == OP_CLASS) break;
2880     }
2881     else
2882 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2883 nigel 77 eptr += len;
2884     }
2885     for (;;)
2886     {
2887 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2888 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2889     if (eptr-- == pp) break; /* Stop if tried at original pos */
2890     BACKCHAR(eptr);
2891     }
2892     }
2893     else
2894     #endif
2895 ph10 836 /* Not UTF mode */
2896 nigel 77 {
2897     for (i = min; i < max; i++)
2898     {
2899 ph10 463 if (eptr >= md->end_subject)
2900 ph10 462 {
2901 ph10 463 SCHECK_PARTIAL();
2902 ph10 462 break;
2903 ph10 463 }
2904 nigel 77 c = *eptr;
2905 ph10 836 #ifndef COMPILE_PCRE8
2906     if (c > 255)
2907     {
2908     if (op == OP_CLASS) break;
2909     }
2910     else
2911     #endif
2912     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2913 nigel 77 eptr++;
2914     }
2915     while (eptr >= pp)
2916     {
2917 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2918 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2919 nigel 77 eptr--;
2920     }
2921     }
2922    
2923 ph10 836 RRETURN(MATCH_NOMATCH);
2924 nigel 77 }
2925 ph10 836 #undef BYTE_MAP
2926 nigel 77 }
2927     /* Control never gets here */
2928    
2929    
2930     /* Match an extended character class. This opcode is encountered only
2931 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2932     mode, because Unicode properties are supported in non-UTF-8 mode. */
2933 nigel 77
2934 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2935 nigel 77 case OP_XCLASS:
2936     {
2937     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2938     ecode += GET(ecode, 1); /* Advance past the item */
2939    
2940     switch (*ecode)
2941     {
2942     case OP_CRSTAR:
2943     case OP_CRMINSTAR:
2944     case OP_CRPLUS:
2945     case OP_CRMINPLUS:
2946     case OP_CRQUERY:
2947     case OP_CRMINQUERY:
2948     c = *ecode++ - OP_CRSTAR;
2949     minimize = (c & 1) != 0;
2950     min = rep_min[c]; /* Pick up values from tables; */
2951     max = rep_max[c]; /* zero for max => infinity */
2952     if (max == 0) max = INT_MAX;
2953     break;
2954    
2955     case OP_CRRANGE:
2956     case OP_CRMINRANGE:
2957     minimize = (*ecode == OP_CRMINRANGE);
2958     min = GET2(ecode, 1);
2959 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2960 nigel 77 if (max == 0) max = INT_MAX;
2961 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2962 nigel 77 break;
2963    
2964     default: /* No repeat follows */
2965     min = max = 1;
2966     break;
2967     }
2968    
2969     /* First, ensure the minimum number of matches are present. */
2970    
2971     for (i = 1; i <= min; i++)
2972     {
2973 ph10 427 if (eptr >= md->end_subject)
2974 ph10 426 {
2975     SCHECK_PARTIAL();
2976 ph10 836 RRETURN(MATCH_NOMATCH);
2977 ph10 427 }
2978 ph10 384 GETCHARINCTEST(c, eptr);
2979 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2980 nigel 77 }
2981    
2982     /* If max == min we can continue with the main loop without the
2983     need to recurse. */
2984    
2985     if (min == max) continue;
2986    
2987     /* If minimizing, keep testing the rest of the expression and advancing
2988     the pointer while it matches the class. */
2989    
2990     if (minimize)
2991     {
2992     for (fi = min;; fi++)
2993     {
2994 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2995 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2996 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2997 ph10 427 if (eptr >= md->end_subject)
2998 ph10 426 {
2999 ph10 427 SCHECK_PARTIAL();
3000 ph10 836 RRETURN(MATCH_NOMATCH);
3001 ph10 427 }
3002 ph10 384 GETCHARINCTEST(c, eptr);
3003 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3004 nigel 77 }
3005     /* Control never gets here */
3006     }
3007    
3008     /* If maximizing, find the longest possible run, then work backwards. */
3009    
3010     else
3011     {
3012     pp = eptr;
3013     for (i = min; i < max; i++)
3014     {
3015     int len = 1;
3016 ph10 463 if (eptr >= md->end_subject)
3017 ph10 462 {
3018 ph10 463 SCHECK_PARTIAL();
3019 ph10 462 break;
3020 ph10 463 }
3021 ph10 836 #ifdef SUPPORT_UTF
3022 ph10 384 GETCHARLENTEST(c, eptr, len);
3023 ph10 836 #else
3024     c = *eptr;
3025     #endif
3026     if (!PRIV(xclass)(c, data, utf)) break;
3027 nigel 77 eptr += len;
3028     }
3029     for(;;)
3030     {
3031 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3032 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3033     if (eptr-- == pp) break; /* Stop if tried at original pos */
3034 ph10 836 #ifdef SUPPORT_UTF
3035     if (utf) BACKCHAR(eptr);
3036     #endif
3037 nigel 77 }
3038 ph10 836 RRETURN(MATCH_NOMATCH);
3039 nigel 77 }
3040    
3041     /* Control never gets here */
3042     }
3043     #endif /* End of XCLASS */
3044    
3045     /* Match a single character, casefully */
3046    
3047     case OP_CHAR:
3048 ph10 836 #ifdef SUPPORT_UTF
3049     if (utf)
3050 nigel 77 {
3051     length = 1;
3052     ecode++;
3053     GETCHARLEN(fc, ecode, length);
3054 ph10 443 if (length > md->end_subject - eptr)
3055 ph10 428 {
3056     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3057 ph10 836 RRETURN(MATCH_NOMATCH);
3058 ph10 443 }
3059 ph10 836 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3060 nigel 77 }
3061     else
3062     #endif
3063 ph10 836 /* Not UTF mode */
3064 nigel 77 {
3065 ph10 443 if (md->end_subject - eptr < 1)
3066 ph10 428 {
3067     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3068 ph10 836 RRETURN(MATCH_NOMATCH);
3069 ph10 443 }
3070 ph10 836 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3071 nigel 77 ecode += 2;
3072     }
3073     break;
3074    
3075 ph10 836 /* Match a single character, caselessly. If we are at the end of the
3076     subject, give up immediately. */
3077 nigel 77
3078 ph10 602 case OP_CHARI:
3079 ph10 836 if (eptr >= md->end_subject)
3080 nigel 77 {
3081 ph10 836 SCHECK_PARTIAL();
3082     RRETURN(MATCH_NOMATCH);
3083     }
3084    
3085     #ifdef SUPPORT_UTF
3086     if (utf)
3087     {
3088 nigel 77 length = 1;
3089     ecode++;
3090     GETCHARLEN(fc, ecode, length);
3091 ph10 788
3092 nigel 77 /* If the pattern character's value is < 128, we have only one byte, and
3093 ph10 836 we know that its other case must also be one byte long, so we can use the
3094     fast lookup table. We know that there is at least one byte left in the
3095     subject. */
3096 nigel 77
3097     if (fc < 128)
3098     {
3099 ph10 836 if (md->lcc[fc]
3100     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3101     ecode++;
3102     eptr++;
3103 nigel 77 }
3104    
3105 ph10 836 /* Otherwise we must pick up the subject character. Note that we cannot
3106     use the value of "length" to check for sufficient bytes left, because the
3107     other case of the character may have more or fewer bytes. */
3108 nigel 77
3109     else
3110     {
3111 nigel 93 unsigned int dc;
3112 nigel 77 GETCHARINC(dc, eptr);
3113     ecode += length;
3114    
3115     /* If we have Unicode property support, we can use it to test the other
3116 nigel 87 case of the character, if there is one. */
3117 nigel 77
3118     if (fc != dc)
3119     {
3120     #ifdef SUPPORT_UCP
3121 ph10 349 if (dc != UCD_OTHERCASE(fc))
3122 nigel 77 #endif
3123 ph10 836 RRETURN(MATCH_NOMATCH);
3124 nigel 77 }
3125     }
3126     }
3127     else
3128 ph10 836 #endif /* SUPPORT_UTF */
3129 nigel 77
3130 ph10 836 /* Not UTF mode */
3131 nigel 77 {
3132 ph10 836 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3133     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3134     eptr++;
3135 nigel 77 ecode += 2;
3136     }
3137     break;
3138    
3139 nigel 93 /* Match a single character repeatedly. */
3140 nigel 77
3141     case OP_EXACT:
3142 ph10 602 case OP_EXACTI:
3143 nigel 77 min = max = GET2(ecode, 1);
3144 ph10 836 ecode += 1 + IMM2_SIZE;
3145 nigel 77 goto REPEATCHAR;
3146    
3147 nigel 93 case OP_POSUPTO:
3148 ph10 602 case OP_POSUPTOI:
3149 nigel 93 possessive = TRUE;
3150     /* Fall through */
3151    
3152 nigel 77 case OP_UPTO:
3153 ph10 602 case OP_UPTOI:
3154 nigel 77 case OP_MINUPTO:
3155 ph10 602 case OP_MINUPTOI:
3156 nigel 77 min = 0;
3157     max = GET2(ecode, 1);
3158 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3159 ph10 836 ecode += 1 + IMM2_SIZE;
3160 nigel 77 goto REPEATCHAR;
3161    
3162 nigel 93 case OP_POSSTAR:
3163 ph10 602 case OP_POSSTARI:
3164 nigel 93 possessive = TRUE;
3165     min = 0;
3166     max = INT_MAX;
3167     ecode++;
3168     goto REPEATCHAR;
3169    
3170     case OP_POSPLUS:
3171 ph10 602 case OP_POSPLUSI:
3172 nigel 93 possessive = TRUE;
3173     min = 1;
3174     max = INT_MAX;
3175     ecode++;
3176     goto REPEATCHAR;
3177    
3178     case OP_POSQUERY:
3179 ph10 602 case OP_POSQUERYI:
3180 nigel 93 possessive = TRUE;
3181     min = 0;
3182     max = 1;
3183     ecode++;
3184     goto REPEATCHAR;
3185    
3186 nigel 77 case OP_STAR:
3187 ph10 602 case OP_STARI:
3188 nigel 77 case OP_MINSTAR:
3189 ph10 602 case OP_MINSTARI:
3190 nigel 77 case OP_PLUS:
3191 ph10 602 case OP_PLUSI:
3192 nigel 77 case OP_MINPLUS:
3193 ph10 602 case OP_MINPLUSI:
3194 nigel 77 case OP_QUERY:
3195 ph10 602 case OP_QUERYI:
3196 nigel 77 case OP_MINQUERY:
3197 ph10 602 case OP_MINQUERYI:
3198     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3199 nigel 77 minimize = (c & 1) != 0;
3200     min = rep_min[c]; /* Pick up values from tables; */
3201     max = rep_max[c]; /* zero for max => infinity */
3202     if (max == 0) max = INT_MAX;
3203    
3204 ph10 426 /* Common code for all repeated single-character matches. */
3205 nigel 77
3206     REPEATCHAR:
3207 ph10 836 #ifdef SUPPORT_UTF
3208     if (utf)
3209 nigel 77 {
3210     length = 1;
3211     charptr = ecode;
3212     GETCHARLEN(fc, ecode, length);
3213     ecode += length;
3214    
3215     /* Handle multibyte character matching specially here. There is
3216     support for caseless matching if UCP support is present. */
3217    
3218     if (length > 1)
3219     {
3220     #ifdef SUPPORT_UCP
3221 nigel 93 unsigned int othercase;
3222 ph10 602 if (op >= OP_STARI && /* Caseless */
3223 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3224 ph10 836 oclength = PRIV(ord2utf)(othercase, occhars);
3225 ph10 115 else oclength = 0;
3226 nigel 77 #endif /* SUPPORT_UCP */
3227    
3228     for (i = 1; i <= min; i++)
3229     {
3230 ph10 426 if (eptr <= md->end_subject - length &&
3231 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3232 ph10 123 #ifdef SUPPORT_UCP
3233 ph10 426 else if (oclength > 0 &&
3234     eptr <= md->end_subject - oclength &&
3235 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3236 ph10 426 #endif /* SUPPORT_UCP */
3237 nigel 77 else
3238     {
3239 ph10 426 CHECK_PARTIAL();
3240 ph10 836 RRETURN(MATCH_NOMATCH);
3241 nigel 77 }
3242     }
3243    
3244     if (min == max) continue;
3245    
3246     if (minimize)
3247     {
3248     for (fi = min;; fi++)
3249     {
3250 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3251 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3252 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3253 ph10 426 if (eptr <= md->end_subject - length &&
3254 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3255 ph10 123 #ifdef SUPPORT_UCP
3256 ph10 426 else if (oclength > 0 &&
3257     eptr <= md->end_subject - oclength &&
3258 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3259 ph10 426 #endif /* SUPPORT_UCP */
3260 nigel 77 else
3261     {
3262 ph10 426 CHECK_PARTIAL();
3263 ph10 836 RRETURN(MATCH_NOMATCH);
3264 nigel 77 }
3265     }
3266     /* Control never gets here */
3267     }
3268 nigel 93
3269     else /* Maximize */
3270 nigel 77 {
3271     pp = eptr;
3272     for (i = min; i < max; i++)
3273     {
3274 ph10 426 if (eptr <= md->end_subject - length &&
3275 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3276 ph10 123 #ifdef SUPPORT_UCP
3277 ph10 426 else if (oclength > 0 &&
3278     eptr <= md->end_subject - oclength &&
3279 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3280 ph10 426 #endif /* SUPPORT_UCP */
3281 ph10 463 else
3282 ph10 462 {
3283 ph10 463 CHECK_PARTIAL();
3284 ph10 462 break;
3285 ph10 463 }
3286 nigel 77 }
3287 nigel 93
3288     if (possessive) continue;
3289 ph10 427
3290 ph10 120 for(;;)
3291 ph10 426 {
3292 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3293 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3294 ph10 836 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3295 ph10 115 #ifdef SUPPORT_UCP
3296 ph10 426 eptr--;
3297     BACKCHAR(eptr);
3298 ph10 123 #else /* without SUPPORT_UCP */
3299 ph10 426 eptr -= length;
3300 ph10 123 #endif /* SUPPORT_UCP */
3301 ph10 426 }
3302 nigel 77 }
3303     /* Control never gets here */
3304     }
3305    
3306     /* If the length of a UTF-8 character is 1, we fall through here, and
3307     obey the code as for non-UTF-8 characters below, though in this case the
3308     value of fc will always be < 128. */
3309     }
3310     else
3311 ph10 836 #endif /* SUPPORT_UTF */
3312     /* When not in UTF-8 mode, load a single-byte character. */
3313     fc = *ecode++;
3314 nigel 77
3315 ph10 836 /* The value of fc at this point is always one character, though we may
3316     or may not be in UTF mode. The code is duplicated for the caseless and
3317 nigel 77 caseful cases, for speed, since matching characters is likely to be quite
3318     common. First, ensure the minimum number of matches are present. If min =
3319     max, continue at the same level without recursing. Otherwise, if
3320     minimizing, keep trying the rest of the expression and advancing one
3321     matching character if failing, up to the maximum. Alternatively, if
3322     maximizing, find the maximum number of characters and work backwards. */
3323    
3324     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3325     max, eptr));
3326    
3327 ph10 602 if (op >= OP_STARI) /* Caseless */
3328 nigel 77 {
3329 ph10 836 #ifdef COMPILE_PCRE8
3330     /* fc must be < 128 if UTF is enabled. */
3331     foc = md->fcc[fc];
3332     #else
3333     #ifdef SUPPORT_UTF
3334     #ifdef SUPPORT_UCP
3335     if (utf && fc > 127)
3336     foc = UCD_OTHERCASE(fc);
3337     #else
3338     if (utf && fc > 127)
3339     foc = fc;
3340     #endif /* SUPPORT_UCP */
3341     else
3342     #endif /* SUPPORT_UTF */
3343     foc = TABLE_GET(fc, md->fcc, fc);
3344     #endif /* COMPILE_PCRE8 */
3345    
3346 nigel 77 for (i = 1; i <= min; i++)
3347 ph10 426 {
3348     if (eptr >= md->end_subject)
3349     {
3350     SCHECK_PARTIAL();
3351 ph10 836 RRETURN(MATCH_NOMATCH);
3352 ph10 426 }
3353 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3354     eptr++;
3355 ph10 426 }
3356 nigel 77 if (min == max) continue;
3357     if (minimize)
3358     {
3359     for (fi = min;; fi++)
3360     {
3361 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3362 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3363 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3364 ph10 426 if (eptr >= md->end_subject)
3365     {
3366 ph10 427 SCHECK_PARTIAL();
3367 ph10 836 RRETURN(MATCH_NOMATCH);
3368 ph10 426 }
3369 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3370     eptr++;
3371 nigel 77 }
3372     /* Control never gets here */
3373     }
3374 nigel 93 else /* Maximize */
3375 nigel 77 {
3376     pp = eptr;
3377     for (i = min; i < max; i++)
3378     {
3379 ph10 463 if (eptr >= md->end_subject)
3380 ph10 462 {
3381     SCHECK_PARTIAL();
3382     break;
3383 ph10 463 }
3384 ph10 836 if (fc != *eptr && foc != *eptr) break;
3385 nigel 77 eptr++;
3386     }
3387 ph10 427
3388 nigel 93 if (possessive) continue;
3389 ph10 427
3390 nigel 77 while (eptr >= pp)
3391     {
3392 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3393 nigel 77 eptr--;
3394     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395     }
3396 ph10 836 RRETURN(MATCH_NOMATCH);
3397 nigel 77 }
3398     /* Control never gets here */
3399     }
3400    
3401     /* Caseful comparisons (includes all multi-byte characters) */
3402    
3403     else
3404     {
3405 ph10 427 for (i = 1; i <= min; i++)
3406 ph10 426 {
3407     if (eptr >= md->end_subject)
3408     {
3409     SCHECK_PARTIAL();
3410 ph10 836 RRETURN(MATCH_NOMATCH);
3411 ph10 426 }
3412 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3413 ph10 427 }
3414 ph10 443
3415 nigel 77 if (min == max) continue;
3416 ph10 443
3417 nigel 77 if (minimize)
3418     {
3419     for (fi = min;; fi++)
3420     {
3421 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3422 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3424 ph10 426 if (eptr >= md->end_subject)
3425 ph10 427 {
3426 ph10 426 SCHECK_PARTIAL();
3427 ph10 836 RRETURN(MATCH_NOMATCH);
3428 ph10 427 }
3429 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3430 nigel 77 }
3431     /* Control never gets here */
3432     }
3433 nigel 93 else /* Maximize */
3434 nigel 77 {
3435     pp = eptr;
3436     for (i = min; i < max; i++)
3437     {
3438 ph10 463 if (eptr >= md->end_subject)
3439 ph10 462 {
3440 ph10 463 SCHECK_PARTIAL();
3441 ph10 462 break;
3442 ph10 463 }
3443 ph10 462 if (fc != *eptr) break;
3444 nigel 77 eptr++;
3445     }
3446 nigel 93 if (possessive) continue;
3447 ph10 443
3448 nigel 77 while (eptr >= pp)
3449     {
3450 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3451 nigel 77 eptr--;
3452     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453     }
3454 ph10 836 RRETURN(MATCH_NOMATCH);
3455 nigel 77 }
3456     }
3457     /* Control never gets here */
3458    
3459     /* Match a negated single one-byte character. The pattern character is a
3460     single unit, but the subject character we are checking can be multibyte. */
3461    
3462     case OP_NOT:
3463 ph10 625 case OP_NOTI:
3464 ph10 443 if (eptr >= md->end_subject)
3465 ph10 428 {
3466 ph10 443 SCHECK_PARTIAL();
3467 ph10 836 RRETURN(MATCH_NOMATCH);
3468 ph10 443 }
3469 nigel 77 ecode++;
3470     GETCHARINCTEST(c, eptr);
3471 ph10 602 if (op == OP_NOTI) /* The caseless case */
3472 nigel 77 {
3473 ph10 836 register int ch, och;
3474     ch = *ecode++;
3475     #ifdef COMPILE_PCRE8
3476     /* ch must be < 128 if UTF is enabled. */
3477     och = md->fcc[ch];
3478     #else
3479     #ifdef SUPPORT_UTF
3480     #ifdef SUPPORT_UCP
3481     if (utf && ch > 127)
3482     och = UCD_OTHERCASE(ch);
3483     #else
3484     if (utf && ch > 127)
3485     och = ch;
3486     #endif /* SUPPORT_UCP */
3487     else
3488     #endif /* SUPPORT_UTF */
3489     och = TABLE_GET(ch, md->fcc, ch);
3490     #endif /* COMPILE_PCRE8 */
3491     if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3492 nigel 77 }
3493 ph10 602 else /* Caseful */
3494 nigel 77 {
3495 ph10 836 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3496 nigel 77 }
3497     break;
3498    
3499     /* Match a negated single one-byte character repeatedly. This is almost a
3500     repeat of the code for a repeated single character, but I haven't found a
3501     nice way of commoning these up that doesn't require a test of the
3502     positive/negative option for each character match. Maybe that wouldn't add
3503     very much to the time taken, but character matching *is* what this is all
3504     about... */
3505    
3506     case OP_NOTEXACT:
3507 ph10 602 case OP_NOTEXACTI:
3508 nigel 77 min = max = GET2(ecode, 1);
3509 ph10 836 ecode += 1 + IMM2_SIZE;
3510 nigel 77 goto REPEATNOTCHAR;
3511    
3512     case OP_NOTUPTO:
3513 ph10 602 case OP_NOTUPTOI:
3514 nigel 77 case OP_NOTMINUPTO:
3515 ph10 602 case OP_NOTMINUPTOI:
3516 nigel 77 min = 0;
3517     max = GET2(ecode, 1);
3518 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3519 ph10 836 ecode += 1 + IMM2_SIZE;
3520 nigel 77 goto REPEATNOTCHAR;
3521    
3522 nigel 93 case OP_NOTPOSSTAR:
3523 ph10 602 case OP_NOTPOSSTARI:
3524 nigel 93 possessive = TRUE;
3525     min = 0;
3526     max = INT_MAX;
3527     ecode++;
3528     goto REPEATNOTCHAR;
3529    
3530     case OP_NOTPOSPLUS:
3531 ph10 602 case OP_NOTPOSPLUSI:
3532 nigel 93 possessive = TRUE;
3533     min = 1;
3534     max = INT_MAX;
3535     ecode++;
3536     goto REPEATNOTCHAR;
3537    
3538     case OP_NOTPOSQUERY:
3539 ph10 602 case OP_NOTPOSQUERYI:
3540 nigel 93 possessive = TRUE;
3541     min = 0;
3542     max = 1;
3543     ecode++;
3544     goto REPEATNOTCHAR;
3545    
3546     case OP_NOTPOSUPTO:
3547 ph10 602 case OP_NOTPOSUPTOI:
3548 nigel 93 possessive = TRUE;
3549     min = 0;
3550     max = GET2(ecode, 1);
3551 ph10 836 ecode += 1 + IMM2_SIZE;
3552 nigel 93 goto REPEATNOTCHAR;
3553    
3554 nigel 77 case OP_NOTSTAR:
3555 ph10 602 case OP_NOTSTARI:
3556 nigel 77 case OP_NOTMINSTAR:
3557 ph10 602 case OP_NOTMINSTARI:
3558 nigel 77 case OP_NOTPLUS:
3559 ph10 602 case OP_NOTPLUSI:
3560 nigel 77 case OP_NOTMINPLUS:
3561 ph10 602 case OP_NOTMINPLUSI:
3562 nigel 77 case OP_NOTQUERY:
3563 ph10 602 case OP_NOTQUERYI:
3564 nigel 77 case OP_NOTMINQUERY:
3565 ph10 602 case OP_NOTMINQUERYI:
3566     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3567 nigel 77 minimize = (c & 1) != 0;
3568     min = rep_min[c]; /* Pick up values from tables; */
3569     max = rep_max[c]; /* zero for max => infinity */
3570     if (max == 0) max = INT_MAX;
3571    
3572 ph10 426 /* Common code for all repeated negated single-byte matches. */
3573 nigel 77
3574     REPEATNOTCHAR:
3575     fc = *ecode++;
3576    
3577     /* The code is duplicated for the caseless and caseful cases, for speed,
3578     since matching characters is likely to be quite common. First, ensure the
3579     minimum number of matches are present. If min = max, continue at the same
3580     level without recursing. Otherwise, if minimizing, keep trying the rest of
3581     the expression and advancing one matching character if failing, up to the
3582     maximum. Alternatively, if maximizing, find the maximum number of
3583     characters and work backwards. */
3584    
3585     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3586     max, eptr));
3587    
3588 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3589 nigel 77 {
3590 ph10 836 #ifdef COMPILE_PCRE8
3591     /* fc must be < 128 if UTF is enabled. */
3592     foc = md->fcc[fc];
3593     #else
3594     #ifdef SUPPORT_UTF
3595     #ifdef SUPPORT_UCP
3596     if (utf && fc > 127)
3597     foc = UCD_OTHERCASE(fc);
3598     #else
3599     if (utf && fc > 127)
3600     foc = fc;
3601     #endif /* SUPPORT_UCP */
3602     else
3603     #endif /* SUPPORT_UTF */
3604     foc = TABLE_GET(fc, md->fcc, fc);
3605     #endif /* COMPILE_PCRE8 */
3606 nigel 77
3607 ph10 836 #ifdef SUPPORT_UTF
3608     if (utf)
3609 nigel 77 {
3610 nigel 93 register unsigned int d;
3611 nigel 77 for (i = 1; i <= min; i++)
3612     {
3613 ph10 426 if (eptr >= md->end_subject)
3614     {
3615     SCHECK_PARTIAL();
3616 ph10 836 RRETURN(MATCH_NOMATCH);
3617 ph10 427 }
3618 nigel 77 GETCHARINC(d, eptr);
3619 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3620 nigel 77 }
3621     }
3622     else
3623     #endif
3624 ph10 836 /* Not UTF mode */
3625 nigel 77 {
3626     for (i = 1; i <= min; i++)
3627 ph10 426 {
3628     if (eptr >= md->end_subject)
3629     {
3630     SCHECK_PARTIAL();
3631 ph10 836 RRETURN(MATCH_NOMATCH);
3632 ph10 427 }
3633 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3634     eptr++;
3635 ph10 427 }
3636 nigel 77 }
3637    
3638     if (min == max) continue;
3639    
3640     if (minimize)
3641     {
3642 ph10 836 #ifdef SUPPORT_UTF
3643     if (utf)
3644 nigel 77 {
3645 nigel 93 register unsigned int d;
3646 nigel 77 for (fi = min;; fi++)
3647     {
3648 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3649 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3650 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3651 ph10 427 if (eptr >= md->end_subject)
3652 ph10 426 {
3653 ph10 427 SCHECK_PARTIAL();
3654 ph10 836 RRETURN(MATCH_NOMATCH);
3655 ph10 427 }
3656 nigel 77 GETCHARINC(d, eptr);
3657 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3658 nigel 77 }
3659     }
3660     else
3661     #endif
3662 ph10 836 /* Not UTF mode */
3663 nigel 77 {
3664     for (fi = min;; fi++)
3665     {
3666 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3667 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3668 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3669 ph10 426 if (eptr >= md->end_subject)
3670     {
3671     SCHECK_PARTIAL();
3672 ph10 836 RRETURN(MATCH_NOMATCH);
3673 ph10 426 }
3674 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3675     eptr++;
3676 nigel 77 }
3677     }
3678     /* Control never gets here */
3679     }
3680    
3681     /* Maximize case */
3682    
3683     else
3684     {
3685     pp = eptr;
3686    
3687 ph10 836 #ifdef SUPPORT_UTF
3688     if (utf)
3689 nigel 77 {
3690 nigel 93 register unsigned int d;
3691 nigel 77 for (i = min; i < max; i++)
3692     {
3693     int len = 1;
3694 ph10 463 if (eptr >= md->end_subject)
3695 ph10 462 {
3696 ph10 463 SCHECK_PARTIAL();
3697 ph10 462 break;
3698 ph10 463 }
3699 nigel 77 GETCHARLEN(d, eptr, len);
3700 ph10 836 if (fc == d || foc == d) break;
3701 nigel 77 eptr += len;
3702     }
3703 nigel 93 if (possessive) continue;
3704     for(;;)
3705 nigel 77 {
3706 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3707 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3708     if (eptr-- == pp) break; /* Stop if tried at original pos */
3709     BACKCHAR(eptr);
3710     }
3711     }
3712     else
3713     #endif
3714 ph10 836 /* Not UTF mode */
3715 nigel 77 {
3716     for (i = min; i < max; i++)
3717     {
3718 ph10 463 if (eptr >= md->end_subject)
3719 ph10 462 {
3720     SCHECK_PARTIAL();
3721     break;
3722 ph10 463 }
3723 ph10 836 if (fc == *eptr || foc == *eptr) break;
3724 nigel 77 eptr++;
3725     }
3726 nigel 93 if (possessive) continue;
3727 nigel 77 while (eptr >= pp)
3728     {
3729 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3730 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3731     eptr--;
3732     }
3733     }
3734    
3735 ph10 836 RRETURN(MATCH_NOMATCH);
3736 nigel 77 }
3737     /* Control never gets here */
3738     }
3739    
3740     /* Caseful comparisons */
3741    
3742     else
3743     {
3744 ph10 836 #ifdef SUPPORT_UTF
3745     if (utf)
3746 nigel 77 {
3747 nigel 93 register unsigned int d;
3748 nigel 77 for (i = 1; i <= min; i++)
3749     {
3750 ph10 426 if (eptr >= md->end_subject)
3751     {
3752     SCHECK_PARTIAL();
3753 ph10 836 RRETURN(MATCH_NOMATCH);
3754 ph10 427 }
3755 nigel 77 GETCHARINC(d, eptr);
3756 ph10 836 if (fc == d) RRETURN(MATCH_NOMATCH);
3757 nigel 77 }
3758     }
3759     else
3760     #endif
3761 ph10 836 /* Not UTF mode */
3762 nigel 77 {
3763     for (i = 1; i <= min; i++)
3764 ph10 426 {
3765     if (eptr >= md->end_subject)
3766     {
3767     SCHECK_PARTIAL();
3768 ph10 836 RRETURN(MATCH_NOMATCH);
3769 ph10 427 }
3770 ph10 836 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3771 ph10 427 }
3772 nigel 77 }
3773    
3774     if (min == max) continue;
3775    
3776     if (minimize)
3777     {
3778 ph10 836 #ifdef SUPPORT_UTF
3779     if (utf)
3780 nigel 77 {
3781 nigel 93 register unsigned int d;
3782 nigel 77 for (fi = min;; fi++)
3783     {
3784 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3785 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3786 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3787 ph10 427 if (eptr >= md->end_subject)
3788 ph10 426 {
3789 ph10 427 SCHECK_PARTIAL();
3790 ph10 836 RRETURN(MATCH_NOMATCH);
3791 ph10 427 }
3792 nigel 77 GETCHARINC(d, eptr);
3793 ph10 836 if (fc == d) RRETURN(MATCH_NOMATCH);
3794 nigel 77 }
3795     }
3796     else
3797     #endif
3798 ph10 836 /* Not UTF mode */
3799 nigel 77 {
3800     for (fi = min;; fi++)
3801     {
3802 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3803 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3804 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3805 ph10 426 if (eptr >= md->end_subject)
3806     {
3807     SCHECK_PARTIAL();
3808 ph10 836 RRETURN(MATCH_NOMATCH);
3809 ph10 427 }
3810 ph10 836 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3811 nigel 77 }
3812     }
3813     /* Control never gets here */
3814     }
3815    
3816     /* Maximize case */
3817    
3818     else
3819     {
3820     pp = eptr;
3821    
3822 ph10 836 #ifdef SUPPORT_UTF
3823     if (utf)
3824 nigel 77 {
3825 nigel 93 register unsigned int d;
3826 nigel 77 for (i = min; i < max; i++)
3827     {
3828     int len = 1;
3829 ph10 463 if (eptr >= md->end_subject)
3830 ph10 462 {
3831 ph10 463 SCHECK_PARTIAL();
3832 ph10 462 break;
3833 ph10 463 }
3834 nigel 77 GETCHARLEN(d, eptr, len);
3835     if (fc == d) break;
3836     eptr += len;
3837     }
3838 nigel 93 if (possessive) continue;
3839 nigel 77 for(;;)
3840     {
3841 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3842 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3843     if (eptr-- == pp) break; /* Stop if tried at original pos */
3844     BACKCHAR(eptr);
3845     }
3846     }
3847     else
3848     #endif
3849 ph10 836 /* Not UTF mode */
3850 nigel 77 {
3851     for (i = min; i < max; i++)
3852     {
3853 ph10 463 if (eptr >= md->end_subject)
3854 ph10 462 {
3855 ph10 463 SCHECK_PARTIAL();
3856 ph10 462 break;
3857 ph10 463 }
3858 ph10 462 if (fc == *eptr) break;
3859 nigel 77 eptr++;
3860     }
3861 nigel 93 if (possessive) continue;
3862 nigel 77 while (eptr >= pp)
3863     {
3864 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3865 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3866     eptr--;
3867     }
3868     }
3869    
3870 ph10 836 RRETURN(MATCH_NOMATCH);
3871 nigel 77 }
3872     }
3873     /* Control never gets here */
3874    
3875     /* Match a single character type repeatedly; several different opcodes
3876     share code. This is very similar to the code for single characters, but we
3877     repeat it in the interests of efficiency. */
3878    
3879     case OP_TYPEEXACT:
3880     min = max = GET2(ecode, 1);
3881     minimize = TRUE;
3882 ph10 836 ecode += 1 + IMM2_SIZE;
3883 nigel 77 goto REPEATTYPE;
3884    
3885     case OP_TYPEUPTO:
3886     case OP_TYPEMINUPTO:
3887     min = 0;
3888     max = GET2(ecode, 1);
3889     minimize = *ecode == OP_TYPEMINUPTO;
3890 ph10 836 ecode += 1 + IMM2_SIZE;
3891 nigel 77 goto REPEATTYPE;
3892    
3893 nigel 93 case OP_TYPEPOSSTAR:
3894     possessive = TRUE;
3895     min = 0;
3896     max = INT_MAX;
3897     ecode++;