/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 852 - (hide annotations) (download)
Thu Jan 5 19:18:12 2012 UTC (2 years, 9 months ago) by zherczeg
File MIME type: text/plain
File size: 211468 byte(s)
Add pcre16 prefix to 16 bit structs
1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 836 Copyright (c) 1997-2012 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
/* Values stored in md->match_function_type to flag two special kinds of call
to match(). */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
84 ph10 210
85 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
86     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87     because the offset vector is always a multiple of 3 long. */
88    
/* Number of ints in the on-stack (or in-frame) save area for offsets. */
#define REC_STACK_SAVE_MAX 30
90    
91     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92    
/* Parallel six-entry tables of minimum and maximum repeat counts; a maximum
of 0 stands for "no upper limit". */
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95    
96    
97    
98 ph10 475 #ifdef PCRE_DEBUG
99 nigel 77 /*************************************************
100     * Debugging function to print chars *
101     *************************************************/
102    
103     /* Print a sequence of chars in printable format, stopping at the end of the
104     subject if requested.
105    
106     Arguments:
107     p points to characters
108     length number to print
109     is_subject TRUE if printing from within md->start_subject
110     md pointer to matching data block, if is_subject is TRUE
111    
112     Returns: nothing
113     */
114    
115     static void
116 ph10 836 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 nigel 77 {
118 nigel 93 unsigned int c;
119 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120     while (length-- > 0)
121     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122     }
123     #endif
124    
125    
126    
127     /*************************************************
128     * Match a back-reference *
129     *************************************************/
130    
131 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
132     negative, so the match always fails. However, in JavaScript compatibility mode,
133 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 ph10 595 subject bytes matched may be different to the number of reference bytes.
135 nigel 77
136     Arguments:
137     offset index into the offset vector
138 ph10 595 eptr pointer into the subject
139     length length of reference to be matched (number of bytes)
140 nigel 77 md points to match data block
141 ph10 602 caseless TRUE if caseless
142 nigel 77
143 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 nigel 77 */
145    
146 ph10 595 static int
147 ph10 836 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 ph10 602 BOOL caseless)
149 nigel 77 {
150 ph10 836 PCRE_PUCHAR eptr_start = eptr;
151     register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 nigel 77
153 ph10 475 #ifdef PCRE_DEBUG
154 nigel 77 if (eptr >= md->end_subject)
155     printf("matching subject <null>");
156     else
157     {
158     printf("matching subject ");
159     pchars(eptr, length, TRUE, md);
160     }
161     printf(" against backref ");
162     pchars(p, length, FALSE, md);
163     printf("\n");
164     #endif
165    
166 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
167 nigel 77
168 ph10 595 if (length < 0) return -1;
169 nigel 77
170 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171     properly if Unicode properties are supported. Otherwise, we can check only
172     ASCII characters. */
173 nigel 77
174 ph10 602 if (caseless)
175 nigel 77 {
176 ph10 836 #ifdef SUPPORT_UTF
177 ph10 354 #ifdef SUPPORT_UCP
178 ph10 836 if (md->utf)
179 ph10 354 {
180 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
181 ph10 595 bytes matched may differ, because there are some characters whose upper and
182     lower case versions code as different numbers of bytes. For example, U+023A
183     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 ph10 625 the latter. It is important, therefore, to check the length along the
186 ph10 595 reference, not along the subject (earlier code did this wrong). */
187 ph10 625
188 ph10 836 PCRE_PUCHAR endptr = p + length;
189 ph10 595 while (p < endptr)
190 ph10 354 {
191 ph10 358 int c, d;
192 ph10 597 if (eptr >= md->end_subject) return -1;
193 ph10 354 GETCHARINC(c, eptr);
194     GETCHARINC(d, p);
195 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 ph10 358 }
197     }
198 ph10 354 else
199     #endif
200     #endif
201    
202     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203     is no UCP support. */
204 ph10 597 {
205 ph10 625 if (eptr + length > md->end_subject) return -1;
206 ph10 597 while (length-- > 0)
207 ph10 836 {
208     if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209     p++;
210     eptr++;
211     }
212 ph10 625 }
213 nigel 77 }
214 ph10 358
215 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
216     are in UTF-8 mode. */
217 ph10 358
218 nigel 77 else
219 ph10 625 {
220     if (eptr + length > md->end_subject) return -1;
221     while (length-- > 0) if (*p++ != *eptr++) return -1;
222 ph10 597 }
223 nigel 77
224 ph10 836 return (int)(eptr - eptr_start);
225 nigel 77 }
226    
227    
228    
229     /***************************************************************************
230     ****************************************************************************
231     RECURSION IN THE match() FUNCTION
232    
233 nigel 87 The match() function is highly recursive, though not every recursive call
234     increases the recursive depth. Nevertheless, some regular expressions can cause
235     it to recurse to a great depth. I was writing for Unix, so I just let it call
236     itself recursively. This uses the stack for saving everything that has to be
237     saved for a recursive call. On Unix, the stack can be large, and this works
238     fine.
239 nigel 77
240 nigel 87 It turns out that on some non-Unix-like systems there are problems with
241     programs that use a lot of stack. (This despite the fact that every last chip
242     has oodles of memory these days, and techniques for extending the stack have
243     been known for decades.) So....
244 nigel 77
245     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246     calls by keeping local variables that need to be preserved in blocks of memory
247 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
248 nigel 77 achieve this so that the actual code doesn't look very different to what it
249     always used to.
250 ph10 164
251 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
252 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
253     Switzer, the use of longjmp() has been abolished, at the cost of having to
254     provide a unique number for each call to RMATCH. There is no way of generating
255     a sequence of numbers at compile time in C. I have given them names, to make
256     them stand out more clearly.
257    
258     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
261     don't have indeterminate values; this has meant that the frame size can be
262 ph10 164 reduced because the result can be "passed back" by straight setting of the
263     variable instead of being passed in the frame.
264 nigel 77 ****************************************************************************
265     ***************************************************************************/
266    
267 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268     below must be updated in sync. */
269 nigel 77
/* One distinct tag per RMATCH call site, numbered consecutively from 1. */
enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };
277 ph10 164
278 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
279 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 ph10 501 actually used in this definition. */
281 nigel 77
282     #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra, rb, rc, rd, re as the eptr,
ecode, offset_top, md and eptrb arguments and stores the result in rrc. mstart
and rdepth are taken from the invoking scope, rdepth being passed on as
rdepth+1; rw is ignored here. RRETURN(ra) is a plain return. The PCRE_DEBUG
variants additionally trace each call and return with printf(). */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
302    
303 nigel 77 #else
304    
305    
306 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
307     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308     argument of match(), which never changes. */
309 nigel 77
#define REGISTER

/* Simulated call: allocate a new frame, record in the current frame which
call site (rw) to resume at, load the new frame with the arguments, link it to
the current one and jump to HEAP_RECURSE. The L_<rw> label is where control
resumes once the simulated call has returned. rd is not used. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Simulated return: release the current frame and step back to its
predecessor. If there is one, hand the result over in rrc and jump to
HEAP_RETURN; otherwise return from match() for real. The NULL test on oldframe
covers the case where the very first frame could not be allocated, so that
RRETURN() is invoked with frame == NULL: nothing is dereferenced or freed and
the function just returns. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (PUBL(stack_free))(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343    
344    
345     /* Structure for remembering the local variables in a private frame */
346    
347     typedef struct heapframe {
348     struct heapframe *Xprevframe;
349    
350     /* Function arguments that may change */
351    
352 ph10 836 PCRE_PUCHAR Xeptr;
353     const pcre_uchar *Xecode;
354     PCRE_PUCHAR Xmstart;
355 nigel 77 int Xoffset_top;
356     eptrblock *Xeptrb;
357 nigel 91 unsigned int Xrdepth;
358 nigel 77
359     /* Function local variables */
360    
361 ph10 836 PCRE_PUCHAR Xcallpat;
362     #ifdef SUPPORT_UTF
363     PCRE_PUCHAR Xcharptr;
364 ph10 406 #endif
365 ph10 836 PCRE_PUCHAR Xdata;
366     PCRE_PUCHAR Xnext;
367     PCRE_PUCHAR Xpp;
368     PCRE_PUCHAR Xprev;
369     PCRE_PUCHAR Xsaved_eptr;
370 nigel 77
371     recursion_info Xnew_recursive;
372    
373     BOOL Xcur_is_word;
374     BOOL Xcondition;
375     BOOL Xprev_is_word;
376    
377     #ifdef SUPPORT_UCP
378     int Xprop_type;
379 nigel 87 int Xprop_value;
380 nigel 77 int Xprop_fail_result;
381 ph10 123 int Xoclength;
382 ph10 836 pcre_uchar Xocchars[6];
383 nigel 77 #endif
384    
385 ph10 403 int Xcodelink;
386 nigel 77 int Xctype;
387 nigel 93 unsigned int Xfc;
388 nigel 77 int Xfi;
389     int Xlength;
390     int Xmax;
391     int Xmin;
392     int Xnumber;
393     int Xoffset;
394     int Xop;
395     int Xsave_capture_last;
396     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397     int Xstacksave[REC_STACK_SAVE_MAX];
398    
399     eptrblock Xnewptrb;
400    
401 ph10 164 /* Where to jump back to */
402 nigel 77
403 ph10 164 int Xwhere;
404 ph10 165
405 nigel 77 } heapframe;
406    
407     #endif
408    
409    
410     /***************************************************************************
411     ***************************************************************************/
412    
413    
414    
415     /*************************************************
416     * Match from current position *
417     *************************************************/
418    
419 nigel 93 /* This function is called recursively in many circumstances. Whenever it
420 nigel 77 returns a negative (error) response, the outer incarnation must also return the
421 ph10 426 same response. */
422 nigel 77
423 ph10 426 /* These macros pack up tests that are used for partial matching, and which
424 ph10 836 appear several times in the code. We set the "hit end" flag if the pointer is
425 ph10 426 at the end of the subject and also past the start of the subject (i.e.
426 ph10 427 something has been matched). For hard partial matching, we then return
427     immediately. The second one is used when we already know we are past the end of
428     the subject. */
429 ph10 426
/* CHECK_PARTIAL(): when partial matching is enabled (md->partial != 0), eptr
has reached md->end_subject and eptr is beyond md->start_used_ptr, set
md->hitend; if md->partial > 1, also return PCRE_ERROR_PARTIAL at once.
Both macros expand to a bare "if" statement and rely on md, eptr and RRETURN
being available where they are used. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* SCHECK_PARTIAL(): as above but without the end-of-subject test, for places
where the caller has already established it. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
444 ph10 426
445 ph10 427
446 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
447 ph10 836 the md structure (e.g. utf, end_subject) into individual variables to improve
448 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449     made performance worse.
450    
451     Arguments:
452 nigel 93 eptr pointer to current character in subject
453     ecode pointer to current position in compiled code
454 ph10 168 mstart pointer to the current match start position (can be modified
455 ph10 172 by encountering \K)
456 nigel 77 offset_top current top pointer
457     md pointer to "static" info for the match
458     eptrb pointer to chain of blocks containing eptr at start of
459     brackets - for testing for empty matches
460 nigel 87 rdepth the recursion depth
461 nigel 77
462     Returns: MATCH_MATCH if matched ) these values are >= 0
463     MATCH_NOMATCH if failed to match )
464 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 nigel 87 (e.g. stopped by repeated call or recursion limit)
467 nigel 77 */
468    
469     static int
470 ph10 836 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 ph10 842 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 ph10 835 unsigned int rdepth)
473 nigel 77 {
474     /* These variables do not need to be preserved over recursion in this function,
475 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
476     "register" because they are used a lot in loops. */
477 nigel 77
478 nigel 91 register int rrc; /* Returns from recursive calls */
479     register int i; /* Used for loops not involving calls to RMATCH() */
480 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 ph10 836 register BOOL utf; /* Local copy of UTF flag for speed */
482 nigel 77
483 nigel 93 BOOL minimize, possessive; /* Quantifier options */
484 ph10 602 BOOL caseless;
485 ph10 403 int condcode;
486 nigel 93
487 nigel 77 /* When recursion is not being used, all "local" variables that have to be
488     preserved over calls to RMATCH() are part of a "frame" which is obtained from
489     heap storage. Set up the top-level frame here; others are obtained from the
490     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491    
492     #ifdef NO_RECURSE
493 ph10 836 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
496    
497     /* Copy in the original argument variables */
498    
499     frame->Xeptr = eptr;
500     frame->Xecode = ecode;
501 ph10 168 frame->Xmstart = mstart;
502 nigel 77 frame->Xoffset_top = offset_top;
503     frame->Xeptrb = eptrb;
504 nigel 87 frame->Xrdepth = rdepth;
505 nigel 77
506     /* This is where control jumps back to to effect "recursion" */
507    
508     HEAP_RECURSE:
509    
510     /* Macros make the argument variables come from the current frame */
511    
512     #define eptr frame->Xeptr
513     #define ecode frame->Xecode
514 ph10 168 #define mstart frame->Xmstart
515 nigel 77 #define offset_top frame->Xoffset_top
516     #define eptrb frame->Xeptrb
517 nigel 87 #define rdepth frame->Xrdepth
518 nigel 77
519     /* Ditto for the local variables */
520    
521 ph10 836 #ifdef SUPPORT_UTF
522 nigel 77 #define charptr frame->Xcharptr
523     #endif
524     #define callpat frame->Xcallpat
525 ph10 403 #define codelink frame->Xcodelink
526 nigel 77 #define data frame->Xdata
527     #define next frame->Xnext
528     #define pp frame->Xpp
529     #define prev frame->Xprev
530     #define saved_eptr frame->Xsaved_eptr
531    
532     #define new_recursive frame->Xnew_recursive
533    
534     #define cur_is_word frame->Xcur_is_word
535     #define condition frame->Xcondition
536     #define prev_is_word frame->Xprev_is_word
537    
538     #ifdef SUPPORT_UCP
539     #define prop_type frame->Xprop_type
540 nigel 87 #define prop_value frame->Xprop_value
541 nigel 77 #define prop_fail_result frame->Xprop_fail_result
542 ph10 115 #define oclength frame->Xoclength
543     #define occhars frame->Xocchars
544 nigel 77 #endif
545    
546     #define ctype frame->Xctype
547     #define fc frame->Xfc
548     #define fi frame->Xfi
549     #define length frame->Xlength
550     #define max frame->Xmax
551     #define min frame->Xmin
552     #define number frame->Xnumber
553     #define offset frame->Xoffset
554     #define op frame->Xop
555     #define save_capture_last frame->Xsave_capture_last
556     #define save_offset1 frame->Xsave_offset1
557     #define save_offset2 frame->Xsave_offset2
558     #define save_offset3 frame->Xsave_offset3
559     #define stacksave frame->Xstacksave
560    
561     #define newptrb frame->Xnewptrb
562    
563     /* When recursion is being used, local variables are allocated on the stack and
564     get preserved during recursion in the normal way. In this environment, fi and
565     i, and fc and c, can be the same variables. */
566    
567 nigel 93 #else /* NO_RECURSE not defined */
568 nigel 77 #define fi i
569     #define fc c
570    
571 ph10 604 /* Many of the following variables are used only in small blocks of the code.
572     My normal style of coding would have declared them within each of those blocks.
573     However, in order to accommodate the version of this code that uses an external
574     "stack" implemented on the heap, it is easier to declare them all here, so the
575     declarations can be cut out in a block. The only declarations within blocks
576     below are for variables that do not have to be preserved over a recursive call
577     to RMATCH(). */
578 nigel 77
579 ph10 836 #ifdef SUPPORT_UTF
580     const pcre_uchar *charptr;
581 ph10 625 #endif
582 ph10 836 const pcre_uchar *callpat;
583     const pcre_uchar *data;
584     const pcre_uchar *next;
585     PCRE_PUCHAR pp;
586     const pcre_uchar *prev;
587     PCRE_PUCHAR saved_eptr;
588 ph10 625
589     recursion_info new_recursive;
590    
591     BOOL cur_is_word;
592 nigel 87 BOOL condition;
593 nigel 77 BOOL prev_is_word;
594    
595     #ifdef SUPPORT_UCP
596     int prop_type;
597 nigel 87 int prop_value;
598 nigel 77 int prop_fail_result;
599 ph10 115 int oclength;
600 ph10 836 pcre_uchar occhars[6];
601 nigel 77 #endif
602    
603 ph10 399 int codelink;
604 nigel 77 int ctype;
605     int length;
606     int max;
607     int min;
608     int number;
609     int offset;
610     int op;
611     int save_capture_last;
612     int save_offset1, save_offset2, save_offset3;
613     int stacksave[REC_STACK_SAVE_MAX];
614    
615     eptrblock newptrb;
616 nigel 93 #endif /* NO_RECURSE */
617 nigel 77
618 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
619     of the local variables that are used only in localised parts of the code, but
620     still need to be preserved over recursive calls of match(). These macros define
621 ph10 604 the alternative names that are used. */
622    
623     #define allow_zero cur_is_word
624     #define cbegroup condition
625     #define code_offset codelink
626     #define condassert condition
627     #define matched_once prev_is_word
628 ph10 836 #define foc number
629 ph10 604
630 nigel 77 /* These statements are here to stop the compiler complaining about unitialized
631     variables. */
632    
633     #ifdef SUPPORT_UCP
634 nigel 87 prop_value = 0;
635 nigel 77 prop_fail_result = 0;
636     #endif
637    
638 nigel 93
639 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
640     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641     used. Thanks to Ian Taylor for noticing this possibility and sending the
642     original patch. */
643    
644     TAIL_RECURSE:
645    
646 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
647     are specified by the macro RMATCH and RRETURN is used to return. When
648     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
651     complicated macro. It has to be used in one particular way. This shouldn't,
652     however, impact performance when true recursion is being used. */
653 nigel 77
654 ph10 836 #ifdef SUPPORT_UTF
655     utf = md->utf; /* Local copy of the flag */
656 ph10 164 #else
657 ph10 836 utf = FALSE;
658 ph10 164 #endif
659    
660 nigel 87 /* First check that we haven't called match() too many times, or that we
661     haven't exceeded the recursive call limit. */
662    
663 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665 nigel 77
666 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
667 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668     done this way to save having to use another function argument, which would take
669 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
670 nigel 77
671 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672     such remembered pointers, to be checked when we hit the closing ket, in order
673     to break infinite loops that match no characters. When match() is called in
674     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675     NOT be used with tail recursion, because the memory block that is used is on
676     the stack, so a new one may be required for each match(). */
677    
678     if (md->match_function_type == MATCH_CBEGROUP)
679 nigel 77 {
680 ph10 197 newptrb.epb_saved_eptr = eptr;
681     newptrb.epb_prev = eptrb;
682     eptrb = &newptrb;
683 ph10 604 md->match_function_type = 0;
684 nigel 77 }
685    
686 nigel 93 /* Now start processing the opcodes. */
687 nigel 77
688     for (;;)
689     {
690 nigel 93 minimize = possessive = FALSE;
691 nigel 77 op = *ecode;
692 ph10 625
693 nigel 93 switch(op)
694     {
695 ph10 510 case OP_MARK:
696 ph10 836 md->nomatch_mark = ecode + 2;
697     md->mark = NULL; /* In case previously set by assertion */
698     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 ph10 604 eptrb, RM55);
700 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701     md->mark == NULL) md->mark = ecode + 2;
702 ph10 512
703     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704     argument, and we must check whether that argument matches this MARK's
705     argument. It is passed back in md->start_match_ptr (an overloading of that
706     variable). If it does match, we reset that variable to the current subject
707     position and return MATCH_SKIP. Otherwise, pass back the return code
708 ph10 510 unaltered. */
709 ph10 512
710 ph10 836 else if (rrc == MATCH_SKIP_ARG &&
711     STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 ph10 510 {
713     md->start_match_ptr = eptr;
714     RRETURN(MATCH_SKIP);
715     }
716     RRETURN(rrc);
717    
718 ph10 210 case OP_FAIL:
719 ph10 836 RRETURN(MATCH_NOMATCH);
720 ph10 211
721 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
722 ph10 553
723 ph10 510 case OP_COMMIT:
724 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 ph10 604 eptrb, RM52);
726 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728     rrc != MATCH_THEN)
729 ph10 551 RRETURN(rrc);
730 ph10 836 RRETURN(MATCH_COMMIT);
731 ph10 510
732 ph10 551 /* PRUNE overrides THEN */
733 ph10 553
734 ph10 210 case OP_PRUNE:
735 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 ph10 604 eptrb, RM51);
737 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 ph10 836 RRETURN(MATCH_PRUNE);
739 ph10 211
740 ph10 510 case OP_PRUNE_ARG:
741 ph10 836 md->nomatch_mark = ecode + 2;
742     md->mark = NULL; /* In case previously set by assertion */
743     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 ph10 604 eptrb, RM56);
745 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746     md->mark == NULL) md->mark = ecode + 2;
747 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 ph10 510 RRETURN(MATCH_PRUNE);
749 ph10 211
750 ph10 551 /* SKIP overrides PRUNE and THEN */
751 ph10 553
752 ph10 210 case OP_SKIP:
753 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 ph10 604 eptrb, RM53);
755 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 ph10 551 RRETURN(rrc);
757 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
758 ph10 836 RRETURN(MATCH_SKIP);
759 ph10 211
760 ph10 836 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761     nomatch_mark. There is a flag that disables this opcode when re-matching a
762     pattern that ended with a SKIP for which there was not a matching MARK. */
763    
764 ph10 510 case OP_SKIP_ARG:
765 ph10 836 if (md->ignore_skip_arg)
766     {
767     ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768     break;
769     }
770     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 ph10 604 eptrb, RM57);
772 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 ph10 551 RRETURN(rrc);
774 ph10 512
775     /* Pass back the current skip name by overloading md->start_match_ptr and
776     returning the special MATCH_SKIP_ARG return code. This will either be
777 ph10 836 caught by a matching MARK, or get to the top, where it causes a rematch
778     with the md->ignore_skip_arg flag set. */
779 ph10 512
780 ph10 510 md->start_match_ptr = ecode + 2;
781 ph10 512 RRETURN(MATCH_SKIP_ARG);
782 ph10 553
783 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784     the branch in which it occurs can be determined. Overload the start of
785     match pointer to do this. */
786 ph10 512
787 ph10 210 case OP_THEN:
788 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 ph10 604 eptrb, RM54);
790 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 ph10 716 md->start_match_ptr = ecode;
792 ph10 836 RRETURN(MATCH_THEN);
793 ph10 510
794     case OP_THEN_ARG:
795 ph10 836 md->nomatch_mark = ecode + 2;
796     md->mark = NULL; /* In case previously set by assertion */
797     RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 ph10 716 md, eptrb, RM58);
799 ph10 836 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800     md->mark == NULL) md->mark = ecode + 2;
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 733 md->start_match_ptr = ecode;
803 ph10 212 RRETURN(MATCH_THEN);
804 ph10 733
805 ph10 723 /* Handle an atomic group that does not contain any capturing parentheses.
806 ph10 733 This can be handled like an assertion. Prior to 8.13, all atomic groups
807     were handled this way. In 8.13, the code was changed as below for ONCE, so
808     that backups pass through the group and thereby reset captured values.
809     However, this uses a lot more stack, so in 8.20, atomic groups that do not
810     contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 ph10 723 less stack intensive way.
812 ph10 211
813 ph10 723 Check the alternative branches in turn - the matching won't pass the KET
814     for this kind of subpattern. If any one branch matches, we carry on as at
815     the end of a normal bracket, leaving the subject pointer, but resetting
816     the start-of-match value in case it was changed by \K. */
817    
818     case OP_ONCE_NC:
819     prev = ecode;
820     saved_eptr = eptr;
821     do
822     {
823     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824     if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825     {
826     mstart = md->start_match_ptr;
827     break;
828     }
829     if (rrc == MATCH_THEN)
830     {
831     next = ecode + GET(ecode,1);
832 ph10 733 if (md->start_match_ptr < next &&
833 ph10 723 (*ecode == OP_ALT || *next == OP_ALT))
834     rrc = MATCH_NOMATCH;
835 ph10 733 }
836    
837 ph10 723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838     ecode += GET(ecode,1);
839     }
840     while (*ecode == OP_ALT);
841    
842     /* If we hit the end of the group (which could be repeated), fail */
843    
844     if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845    
846     /* Continue as from after the group, updating the offsets high water
847     mark, since extracts may have been taken. */
848    
849     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850    
851     offset_top = md->end_offset_top;
852     eptr = md->end_match_ptr;
853    
854     /* For a non-repeating ket, just continue at this level. This also
855     happens for a repeating ket if no characters were matched in the group.
856     This is the forcible breaking of infinite loops as implemented in Perl
857     5.005. */
858    
859     if (*ecode == OP_KET || eptr == saved_eptr)
860     {
861     ecode += 1+LINK_SIZE;
862     break;
863     }
864    
865     /* The repeating kets try the rest of the pattern or restart from the
866     preceding bracket, in the appropriate order. The second "call" of match()
867     uses tail recursion, to avoid using another stack frame. */
868    
869     if (*ecode == OP_KETRMIN)
870     {
871     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873     ecode = prev;
874     goto TAIL_RECURSE;
875     }
876     else /* OP_KETRMAX */
877     {
878 ph10 733 md->match_function_type = MATCH_CBEGROUP;
879 ph10 723 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881     ecode += 1 + LINK_SIZE;
882     goto TAIL_RECURSE;
883     }
884     /* Control never gets here */
885    
886 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
887     unlimited repeat. If there is space in the offset vector, save the current
888     subject position in the working slot at the top of the vector. We mustn't
889     change the current values of the data slot, because they may be set from a
890     previous iteration of this group, and be referred to by a reference inside
891 ph10 625 the group. A failure to match might occur after the group has succeeded,
892 ph10 617 if something later on doesn't match. For this reason, we need to restore
893     the working value and also the values of the final offsets, in case they
894     were set by a previous iteration of the same bracket.
895 nigel 77
896 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
897     a non-capturing bracket. Don't worry about setting the flag for the error
898     case here; that is handled in the code for KET. */
899 nigel 77
900 nigel 93 case OP_CBRA:
901     case OP_SCBRA:
902     number = GET2(ecode, 1+LINK_SIZE);
903 nigel 77 offset = number << 1;
904 ph10 625
905 ph10 475 #ifdef PCRE_DEBUG
906 nigel 93 printf("start bracket %d\n", number);
907     printf("subject=");
908 nigel 77 pchars(eptr, 16, TRUE, md);
909     printf("\n");
910     #endif
911    
912     if (offset < md->offset_max)
913     {
914     save_offset1 = md->offset_vector[offset];
915     save_offset2 = md->offset_vector[offset+1];
916     save_offset3 = md->offset_vector[md->offset_end - number];
917     save_capture_last = md->capture_last;
918    
919     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 ph10 531 md->offset_vector[md->offset_end - number] =
921 ph10 530 (int)(eptr - md->start_subject);
922 nigel 77
923 ph10 604 for (;;)
924 nigel 77 {
925 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 ph10 604 eptrb, RM1);
928 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929 ph10 733
930     /* If we backed up to a THEN, check whether it is within the current
931     branch by comparing the address of the THEN that is passed back with
932 ph10 716 the end of the branch. If it is within the current branch, and the
933     branch is one of two or more alternatives (it either starts or ends
934 ph10 733 with OP_ALT), we have reached the limit of THEN's action, so convert
935     the return code to NOMATCH, which will cause normal backtracking to
936 ph10 716 happen from now on. Otherwise, THEN is passed back to an outer
937 ph10 733 alternative. This implements Perl's treatment of parenthesized groups,
938     where a group not containing | does not affect the current alternative,
939 ph10 716 that is, (X) is NOT the same as (X|(*F)). */
940    
941     if (rrc == MATCH_THEN)
942     {
943     next = ecode + GET(ecode,1);
944 ph10 733 if (md->start_match_ptr < next &&
945 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
946     rrc = MATCH_NOMATCH;
947 ph10 733 }
948    
949 ph10 716 /* Anything other than NOMATCH is passed back. */
950    
951     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 nigel 77 md->capture_last = save_capture_last;
953     ecode += GET(ecode, 1);
954 ph10 625 if (*ecode != OP_ALT) break;
955 nigel 77 }
956    
957     DPRINTF(("bracket %d failed\n", number));
958     md->offset_vector[offset] = save_offset1;
959     md->offset_vector[offset+1] = save_offset2;
960     md->offset_vector[md->offset_end - number] = save_offset3;
961 ph10 625
962 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963 nigel 77
964 ph10 716 RRETURN(rrc);
965 nigel 77 }
966    
967 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968     as a non-capturing bracket. */
969 nigel 77
970 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972    
973 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974 nigel 77
975 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977    
978 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
979 ph10 723 repeat and ONCE group with no captures. Loop for all the alternatives.
980 ph10 708
981 ph10 702 When we get to the final alternative within the brackets, we used to return
982     the result of a recursive call to match() whatever happened so it was
983     possible to reduce stack usage by turning this into a tail recursion,
984     except in the case of a possibly empty group. However, now that there is
985     the possibility of (*THEN) occurring in the final alternative, this
986     optimization is no longer always possible.
987 ph10 625
988 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
989     this is the best that can be done.
990    
991 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
992     reached, but subsequent matching fails. It passes back up the tree (causing
993     captured values to be reset) until the original atomic group level is
994 ph10 618 reached. This is tested by comparing md->once_target with the start of the
995     group. At this point, the return is converted into MATCH_NOMATCH so that
996     previous backup points can be taken. */
997 nigel 77
998 ph10 618 case OP_ONCE:
999 nigel 93 case OP_BRA:
1000     case OP_SBRA:
1001     DPRINTF(("start non-capturing bracket\n"));
1002 ph10 618
1003 nigel 91 for (;;)
1004 nigel 77 {
1005 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006 ph10 702
1007     /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 ph10 708 the pattern, and this is the final alternative, optimize as described
1009 ph10 702 above. */
1010    
1011     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012     {
1013 ph10 836 ecode += PRIV(OP_lengths)[*ecode];
1014 ph10 702 goto TAIL_RECURSE;
1015 ph10 708 }
1016 ph10 702
1017     /* In all other cases, we have to make another call to match(). */
1018    
1019 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 ph10 604 RM2);
1021 ph10 733
1022 ph10 716 /* See comment in the code for capturing groups above about handling
1023     THEN. */
1024    
1025     if (rrc == MATCH_THEN)
1026 ph10 625 {
1027 ph10 716 next = ecode + GET(ecode,1);
1028 ph10 733 if (md->start_match_ptr < next &&
1029 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1030     rrc = MATCH_NOMATCH;
1031 ph10 733 }
1032    
1033     if (rrc != MATCH_NOMATCH)
1034 ph10 716 {
1035 ph10 618 if (rrc == MATCH_ONCE)
1036     {
1037 ph10 836 const pcre_uchar *scode = ecode;
1038 ph10 618 if (*scode != OP_ONCE) /* If not at start, find it */
1039     {
1040     while (*scode == OP_ALT) scode += GET(scode, 1);
1041     scode -= GET(scode, 1);
1042 ph10 625 }
1043 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 ph10 625 }
1045 ph10 550 RRETURN(rrc);
1046 ph10 625 }
1047 nigel 77 ecode += GET(ecode, 1);
1048 ph10 625 if (*ecode != OP_ALT) break;
1049 nigel 77 }
1050 ph10 733
1051 ph10 609 RRETURN(MATCH_NOMATCH);
1052    
1053 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1055     handled similarly to the normal case above. However, the matching is
1056     different. The end of these brackets will always be OP_KETRPOS, which
1057     returns MATCH_KETRPOS without going further in the pattern. By this means
1058     we can handle the group by iteration rather than recursion, thereby
1059     reducing the amount of stack needed. */
1060 ph10 625
1061 ph10 604 case OP_CBRAPOS:
1062     case OP_SCBRAPOS:
1063     allow_zero = FALSE;
1064 ph10 625
1065 ph10 604 POSSESSIVE_CAPTURE:
1066     number = GET2(ecode, 1+LINK_SIZE);
1067     offset = number << 1;
1068    
1069     #ifdef PCRE_DEBUG
1070     printf("start possessive bracket %d\n", number);
1071     printf("subject=");
1072     pchars(eptr, 16, TRUE, md);
1073     printf("\n");
1074     #endif
1075    
1076     if (offset < md->offset_max)
1077     {
1078     matched_once = FALSE;
1079 ph10 836 code_offset = (int)(ecode - md->start_code);
1080 ph10 604
1081     save_offset1 = md->offset_vector[offset];
1082     save_offset2 = md->offset_vector[offset+1];
1083     save_offset3 = md->offset_vector[md->offset_end - number];
1084     save_capture_last = md->capture_last;
1085    
1086     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087 ph10 625
1088     /* Each time round the loop, save the current subject position for use
1089     when the group matches. For MATCH_MATCH, the group has matched, so we
1090     restart it with a new subject starting position, remembering that we had
1091     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092     usual. If we haven't matched any alternatives in any iteration, check to
1093     see if a previous iteration matched. If so, the group has matched;
1094     continue from afterwards. Otherwise it has failed; restore the previous
1095 ph10 604 capture values before returning NOMATCH. */
1096 ph10 625
1097 ph10 604 for (;;)
1098     {
1099     md->offset_vector[md->offset_end - number] =
1100     (int)(eptr - md->start_subject);
1101 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 ph10 604 eptrb, RM63);
1104     if (rrc == MATCH_KETRPOS)
1105     {
1106     offset_top = md->end_offset_top;
1107     eptr = md->end_match_ptr;
1108 ph10 625 ecode = md->start_code + code_offset;
1109 ph10 604 save_capture_last = md->capture_last;
1110 ph10 625 matched_once = TRUE;
1111     continue;
1112     }
1113 ph10 733
1114 ph10 716 /* See comment in the code for capturing groups above about handling
1115     THEN. */
1116    
1117     if (rrc == MATCH_THEN)
1118     {
1119     next = ecode + GET(ecode,1);
1120 ph10 733 if (md->start_match_ptr < next &&
1121 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1122     rrc = MATCH_NOMATCH;
1123 ph10 733 }
1124 ph10 716
1125     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 ph10 604 md->capture_last = save_capture_last;
1127     ecode += GET(ecode, 1);
1128 ph10 625 if (*ecode != OP_ALT) break;
1129 ph10 604 }
1130 ph10 610
1131 ph10 604 if (!matched_once)
1132 ph10 625 {
1133 ph10 604 md->offset_vector[offset] = save_offset1;
1134     md->offset_vector[offset+1] = save_offset2;
1135     md->offset_vector[md->offset_end - number] = save_offset3;
1136     }
1137 ph10 625
1138 ph10 604 if (allow_zero || matched_once)
1139 ph10 625 {
1140 ph10 604 ecode += 1 + LINK_SIZE;
1141     break;
1142 ph10 625 }
1143    
1144 ph10 604 RRETURN(MATCH_NOMATCH);
1145     }
1146 ph10 625
1147 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148     as a non-capturing bracket. */
1149    
1150     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152    
1153     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154    
1155     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157    
1158 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1160     without the capturing complication. It is written out separately for speed
1161     and cleanliness. */
1162    
1163     case OP_BRAPOS:
1164     case OP_SBRAPOS:
1165 ph10 625 allow_zero = FALSE;
1166    
1167 ph10 604 POSSESSIVE_NON_CAPTURE:
1168     matched_once = FALSE;
1169 ph10 836 code_offset = (int)(ecode - md->start_code);
1170 ph10 604
1171     for (;;)
1172     {
1173 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 ph10 836 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 ph10 609 eptrb, RM48);
1176 ph10 604 if (rrc == MATCH_KETRPOS)
1177     {
1178 ph10 610 offset_top = md->end_offset_top;
1179 ph10 604 eptr = md->end_match_ptr;
1180 ph10 625 ecode = md->start_code + code_offset;
1181     matched_once = TRUE;
1182     continue;
1183     }
1184 ph10 733
1185 ph10 716 /* See comment in the code for capturing groups above about handling
1186     THEN. */
1187    
1188     if (rrc == MATCH_THEN)
1189     {
1190     next = ecode + GET(ecode,1);
1191 ph10 733 if (md->start_match_ptr < next &&
1192 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1193     rrc = MATCH_NOMATCH;
1194 ph10 733 }
1195 ph10 716
1196     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ph10 604 ecode += GET(ecode, 1);
1198 ph10 625 if (*ecode != OP_ALT) break;
1199 ph10 604 }
1200 ph10 625
1201     if (matched_once || allow_zero)
1202 ph10 604 {
1203     ecode += 1 + LINK_SIZE;
1204     break;
1205 ph10 625 }
1206 ph10 604 RRETURN(MATCH_NOMATCH);
1207    
1208     /* Control never reaches here. */
1209    
1210 nigel 77 /* Conditional group: compilation checked that there are no more than
1211     two branches. If the condition is false, skipping the first branch takes us
1212     past the end if there is only one branch, but that's OK because that is
1213 ph10 609 exactly what going to the ket would do. */
1214 nigel 77
1215     case OP_COND:
1216 nigel 93 case OP_SCOND:
1217 ph10 604 codelink = GET(ecode, 1);
1218 ph10 406
1219 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1220     inserted between OP_COND and an assertion condition. */
1221 ph10 392
1222 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223     {
1224 ph10 836 if (PUBL(callout) != NULL)
1225 ph10 381 {
1226 zherczeg 850 PUBL(callout_block) cb;
1227 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1228 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1229     cb.offset_vector = md->offset_vector;
1230 zherczeg 852 #ifdef COMPILE_PCRE8
1231 ph10 381 cb.subject = (PCRE_SPTR)md->start_subject;
1232 zherczeg 852 #else
1233     cb.subject = (PCRE_SPTR16)md->start_subject;
1234     #endif
1235 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1236     cb.start_match = (int)(mstart - md->start_subject);
1237     cb.current_position = (int)(eptr - md->start_subject);
1238 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1239     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1240     cb.capture_top = offset_top/2;
1241     cb.capture_last = md->capture_last;
1242     cb.callout_data = md->callout_data;
1243 ph10 836 cb.mark = md->nomatch_mark;
1244     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1245 ph10 381 if (rrc < 0) RRETURN(rrc);
1246     }
1247 ph10 836 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1248 ph10 381 }
1249 ph10 392
1250 ph10 399 condcode = ecode[LINK_SIZE+1];
1251 ph10 406
1252 ph10 381 /* Now see what the actual condition is */
1253 ph10 392
1254 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1255 nigel 77 {
1256 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1257     {
1258 ph10 461 condition = FALSE;
1259     ecode += GET(ecode, 1);
1260     }
1261 ph10 459 else
1262 ph10 461 {
1263 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1264 ph10 751 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1265 ph10 461
1266 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1267     false, but the test was set up by name, scan the table to see if the
1268     name refers to any other numbers, and test them. The condition is true
1269     if any one is set. */
1270 ph10 461
1271 ph10 751 if (!condition && condcode == OP_NRREF)
1272 ph10 459 {
1273 ph10 836 pcre_uchar *slotA = md->name_table;
1274 ph10 459 for (i = 0; i < md->name_count; i++)
1275 ph10 461 {
1276     if (GET2(slotA, 0) == recno) break;
1277 ph10 459 slotA += md->name_entry_size;
1278     }
1279 ph10 461
1280 ph10 459 /* Found a name for the number - there can be only one; duplicate
1281     names for different numbers are allowed, but not vice versa. First
1282     scan down for duplicates. */
1283 ph10 461
1284 ph10 459 if (i < md->name_count)
1285 ph10 461 {
1286 ph10 836 pcre_uchar *slotB = slotA;
1287 ph10 459 while (slotB > md->name_table)
1288     {
1289     slotB -= md->name_entry_size;
1290 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1291 ph10 459 {
1292     condition = GET2(slotB, 0) == md->recursive->group_num;
1293 ph10 461 if (condition) break;
1294     }
1295 ph10 459 else break;
1296 ph10 461 }
1297    
1298 ph10 459 /* Scan up for duplicates */
1299 ph10 461
1300 ph10 459 if (!condition)
1301 ph10 461 {
1302 ph10 459 slotB = slotA;
1303     for (i++; i < md->name_count; i++)
1304     {
1305     slotB += md->name_entry_size;
1306 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1307 ph10 459 {
1308     condition = GET2(slotB, 0) == md->recursive->group_num;
1309     if (condition) break;
1310 ph10 461 }
1311 ph10 459 else break;
1312 ph10 461 }
1313     }
1314 ph10 459 }
1315 ph10 461 }
1316    
1317 ph10 459 /* Choose branch according to the condition */
1318 ph10 461
1319 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1320 ph10 459 }
1321 ph10 461 }
1322 nigel 93
1323 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1324 nigel 93 {
1325 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1326 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1327 ph10 461
1328 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1329 ph10 461 scan the table to see if the name refers to any other numbers, and test
1330     them. The condition is true if any one is set. This is tediously similar
1331     to the code above, but not close enough to try to amalgamate. */
1332    
1333 ph10 459 if (!condition && condcode == OP_NCREF)
1334     {
1335 ph10 461 int refno = offset >> 1;
1336 ph10 836 pcre_uchar *slotA = md->name_table;
1337 ph10 461
1338 ph10 459 for (i = 0; i < md->name_count; i++)
1339 ph10 461 {
1340     if (GET2(slotA, 0) == refno) break;
1341 ph10 459 slotA += md->name_entry_size;
1342     }
1343 ph10 461
1344     /* Found a name for the number - there can be only one; duplicate names
1345     for different numbers are allowed, but not vice versa. First scan down
1346 ph10 459 for duplicates. */
1347 ph10 461
1348 ph10 459 if (i < md->name_count)
1349 ph10 461 {
1350 ph10 836 pcre_uchar *slotB = slotA;
1351 ph10 459 while (slotB > md->name_table)
1352     {
1353     slotB -= md->name_entry_size;
1354 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1355 ph10 459 {
1356     offset = GET2(slotB, 0) << 1;
1357 ph10 461 condition = offset < offset_top &&
1358 ph10 459 md->offset_vector[offset] >= 0;
1359 ph10 461 if (condition) break;
1360     }
1361 ph10 459 else break;
1362 ph10 461 }
1363    
1364 ph10 459 /* Scan up for duplicates */
1365 ph10 461
1366 ph10 459 if (!condition)
1367 ph10 461 {
1368 ph10 459 slotB = slotA;
1369     for (i++; i < md->name_count; i++)
1370     {
1371     slotB += md->name_entry_size;
1372 ph10 836 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1373 ph10 459 {
1374     offset = GET2(slotB, 0) << 1;
1375 ph10 461 condition = offset < offset_top &&
1376 ph10 459 md->offset_vector[offset] >= 0;
1377 ph10 461 if (condition) break;
1378     }
1379 ph10 459 else break;
1380 ph10 461 }
1381     }
1382 ph10 459 }
1383 ph10 461 }
1384    
1385 ph10 459 /* Choose branch according to the condition */
1386    
1387 ph10 836 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1388 nigel 77 }
1389    
1390 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1391 nigel 93 {
1392     condition = FALSE;
1393     ecode += GET(ecode, 1);
1394     }
1395    
1396 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1397 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1398     an assertion. */
1399 nigel 77
1400     else
1401     {
1402 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1403 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1404 nigel 77 if (rrc == MATCH_MATCH)
1405     {
1406 ph10 619 if (md->end_offset_top > offset_top)
1407     offset_top = md->end_offset_top; /* Captures may have happened */
1408 nigel 93 condition = TRUE;
1409     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1410 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1411     }
1412 ph10 733
1413 ph10 716 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1414 ph10 733 assertion; it is therefore treated as NOMATCH. */
1415 ph10 716
1416 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1417 nigel 77 {
1418     RRETURN(rrc); /* Need braces because of following else */
1419     }
1420 nigel 93 else
1421     {
1422     condition = FALSE;
1423 ph10 399 ecode += codelink;
1424 nigel 93 }
1425     }
1426 nigel 91
1427 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1428     use tail recursion to avoid using another stack frame, except when there is
1429     unlimited repeat of a possibly empty group. In the latter case, a recursive
1430     call to match() is always required, unless the second alternative doesn't
1431     exist, in which case we can just plough on. Note that, for compatibility
1432     with Perl, the | in a conditional group is NOT treated as creating two
1433     alternatives. If a THEN is encountered in the branch, it propagates out to
1434     the enclosing alternative (unless nested in a deeper set of alternatives,
1435     of course). */
1436 nigel 91
1437 nigel 93 if (condition || *ecode == OP_ALT)
1438     {
1439 ph10 716 if (op != OP_SCOND)
1440 ph10 702 {
1441     ecode += 1 + LINK_SIZE;
1442     goto TAIL_RECURSE;
1443 ph10 708 }
1444 ph10 733
1445 ph10 716 md->match_function_type = MATCH_CBEGROUP;
1446 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1447     RRETURN(rrc);
1448 nigel 77 }
1449 ph10 708
1450 ph10 702 /* Condition false & no alternative; continue after the group. */
1451 ph10 708
1452 ph10 702 else
1453 nigel 93 {
1454     ecode += 1 + LINK_SIZE;
1455     }
1456     break;
1457 nigel 77
1458 ph10 461
1459 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1460     to close any currently open capturing brackets. */
1461 ph10 461
1462 ph10 447 case OP_CLOSE:
1463 ph10 461 number = GET2(ecode, 1);
1464 ph10 447 offset = number << 1;
1465 ph10 461
1466 ph10 475 #ifdef PCRE_DEBUG
1467 ph10 447 printf("end bracket %d at *ACCEPT", number);
1468     printf("\n");
1469     #endif
1470 nigel 77
1471 ph10 447 md->capture_last = number;
1472     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1473     {
1474     md->offset_vector[offset] =
1475     md->offset_vector[md->offset_end - number];
1476 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1477 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1478     }
1479 ph10 836 ecode += 1 + IMM2_SIZE;
1480 ph10 461 break;
1481 ph10 447
1482    
1483 ph10 619 /* End of the pattern, either real or forced. */
1484 nigel 77
1485 ph10 619 case OP_END:
1486 ph10 210 case OP_ACCEPT:
1487 ph10 625 case OP_ASSERT_ACCEPT:
1488    
1489 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1490     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1491 ph10 613 is set and we have matched at the start of the subject. In both cases,
1492     backtracking will then try other alternatives, if any. */
1493 ph10 443
1494 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1495 ph10 618 md->recursive == NULL &&
1496 ph10 619 (md->notempty ||
1497     (md->notempty_atstart &&
1498     mstart == md->start_subject + md->start_offset)))
1499 ph10 836 RRETURN(MATCH_NOMATCH);
1500 ph10 443
1501 ph10 442 /* Otherwise, we have a match. */
1502 ph10 625
1503 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1504     md->end_offset_top = offset_top; /* and how many extracts were taken */
1505 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1506 nigel 77
1507 ph10 512 /* For some reason, the macros don't work properly if an expression is
1508 ph10 836 given as the argument to RRETURN when the heap is in use. */
1509 ph10 512
1510     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1511 ph10 836 RRETURN(rrc);
1512 ph10 512
1513 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1514     matching won't pass the KET for an assertion. If any one branch matches,
1515     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1516     start of each branch to move the current point backwards, so the code at
1517 ph10 625 this level is identical to the lookahead case. When the assertion is part
1518     of a condition, we want to return immediately afterwards. The caller of
1519     this incarnation of the match() function will have set MATCH_CONDASSERT in
1520     md->match_function_type, and one of these opcodes will be the first opcode
1521     that is processed. We use a local variable that is preserved over calls to
1522 ph10 604 match() to remember this case. */
1523 nigel 77
1524     case OP_ASSERT:
1525     case OP_ASSERTBACK:
1526 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1527     {
1528     condassert = TRUE;
1529     md->match_function_type = 0;
1530     }
1531 ph10 625 else condassert = FALSE;
1532    
1533 nigel 77 do
1534     {
1535 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1536 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1537 ph10 500 {
1538     mstart = md->start_match_ptr; /* In case \K reset it */
1539     break;
1540 ph10 501 }
1541 ph10 733
1542     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1543 ph10 716 as NOMATCH. */
1544 ph10 733
1545 ph10 716 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1546 nigel 77 ecode += GET(ecode, 1);
1547     }
1548     while (*ecode == OP_ALT);
1549 ph10 625
1550 ph10 836 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1551 nigel 77
1552     /* If checking an assertion for a condition, return MATCH_MATCH. */
1553    
1554 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1555 nigel 77
1556     /* Continue from after the assertion, updating the offsets high water
1557     mark, since extracts may have been taken during the assertion. */
1558    
1559     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1560     ecode += 1 + LINK_SIZE;
1561     offset_top = md->end_offset_top;
1562     continue;
1563    
1564 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1565 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1566 ph10 473 branches. */
1567 nigel 77
1568     case OP_ASSERT_NOT:
1569     case OP_ASSERTBACK_NOT:
1570 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1571     {
1572     condassert = TRUE;
1573     md->match_function_type = 0;
1574     }
1575 ph10 625 else condassert = FALSE;
1576 ph10 604
1577 nigel 77 do
1578     {
1579 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1580 ph10 836 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1581 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1582     {
1583     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1584 ph10 482 break;
1585     }
1586 ph10 716
1587 ph10 733 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1588 ph10 716 as NOMATCH. */
1589    
1590     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1591 nigel 77 ecode += GET(ecode,1);
1592     }
1593     while (*ecode == OP_ALT);
1594    
1595 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1596 ph10 625
1597 nigel 77 ecode += 1 + LINK_SIZE;
1598     continue;
1599    
1600     /* Move the subject pointer back. This occurs only at the start of
1601     each branch of a lookbehind assertion. If we are too close to the start to
1602     move back, this match function fails. When working with UTF-8 we move
1603     back a number of characters, not bytes. */
1604    
1605     case OP_REVERSE:
1606 ph10 836 #ifdef SUPPORT_UTF
1607     if (utf)
1608 nigel 77 {
1609 nigel 93 i = GET(ecode, 1);
1610     while (i-- > 0)
1611 nigel 77 {
1612     eptr--;
1613 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1614 ph10 207 BACKCHAR(eptr);
1615 nigel 77 }
1616     }
1617     else
1618     #endif
1619    
1620     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1621    
1622     {
1623 nigel 93 eptr -= GET(ecode, 1);
1624 ph10 836 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1625 nigel 77 }
1626    
1627 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1628 nigel 77
1629 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1630 nigel 77 ecode += 1 + LINK_SIZE;
1631     break;
1632    
1633     /* The callout item calls an external function, if one is provided, passing
1634     details of the match so far. This is mainly for debugging, though the
1635     function is able to force a failure. */
1636    
1637     case OP_CALLOUT:
1638 ph10 836 if (PUBL(callout) != NULL)
1639 nigel 77 {
1640 zherczeg 850 PUBL(callout_block) cb;
1641 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1642 nigel 77 cb.callout_number = ecode[1];
1643     cb.offset_vector = md->offset_vector;
1644 zherczeg 852 #ifdef COMPILE_PCRE8
1645 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1646 zherczeg 852 #else
1647     cb.subject = (PCRE_SPTR16)md->start_subject;
1648     #endif
1649 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1650     cb.start_match = (int)(mstart - md->start_subject);
1651     cb.current_position = (int)(eptr - md->start_subject);
1652 nigel 77 cb.pattern_position = GET(ecode, 2);
1653     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1654     cb.capture_top = offset_top/2;
1655     cb.capture_last = md->capture_last;
1656     cb.callout_data = md->callout_data;
1657 ph10 836 cb.mark = md->nomatch_mark;
1658     if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1659 nigel 77 if (rrc < 0) RRETURN(rrc);
1660     }
1661     ecode += 2 + 2*LINK_SIZE;
1662     break;
1663    
1664     /* Recursion either matches the current regex, or some subexpression. The
1665     offset data is the offset to the starting bracket from the start of the
1666     whole pattern. (This is so that it works from duplicated subpatterns.)
1667 ph10 625
1668 ph10 618 The state of the capturing groups is preserved over recursion, and
1669 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1670 ph10 618 finished (offset_top records the completed total) so we just have to save
1671     all the potential data. There may be up to 65535 such values, which is too
1672     large to put on the stack, but using malloc for small numbers seems
1673     expensive. As a compromise, the stack is used when there are no more than
1674     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1675 nigel 77
1676     There are also other values that have to be saved. We use a chained
1677     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1678 ph10 625 for the original version of this logic. It has, however, been hacked around
1679 ph10 618 a lot, so he is not to blame for the current way it works. */
1680 nigel 77
1681     case OP_RECURSE:
1682     {
1683 ph10 642 recursion_info *ri;
1684     int recno;
1685 ph10 654
1686 nigel 77 callpat = md->start_code + GET(ecode, 1);
1687 ph10 642 recno = (callpat == md->start_code)? 0 :
1688 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1689    
1690     /* Check for repeating a recursion without advancing the subject pointer.
1691 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1692 ph10 654 caught at compile time.) */
1693    
1694 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1695 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1696 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1697 nigel 77
1698     /* Add to "recursing stack" */
1699    
1700 ph10 642 new_recursive.group_num = recno;
1701     new_recursive.subject_position = eptr;
1702 nigel 77 new_recursive.prevrec = md->recursive;
1703     md->recursive = &new_recursive;
1704    
1705 ph10 618 /* Where to continue from afterwards */
1706 nigel 77
1707     ecode += 1 + LINK_SIZE;
1708    
1709 ph10 618 /* Now save the offset data */
1710 nigel 77
1711     new_recursive.saved_max = md->offset_end;
1712     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1713     new_recursive.offset_save = stacksave;
1714     else
1715     {
1716     new_recursive.offset_save =
1717 ph10 836 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1718 nigel 77 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1719     }
1720     memcpy(new_recursive.offset_save, md->offset_vector,
1721     new_recursive.saved_max * sizeof(int));
1722 ph10 625
1723 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1724 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1725 ph10 618 might be changed, so reset it before looping. */
1726 nigel 77
1727     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1728 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1729 nigel 77 do
1730     {
1731 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1732 ph10 836 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1733 ph10 604 md, eptrb, RM6);
1734 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1735     new_recursive.saved_max * sizeof(int));
1736 ph10 681 md->recursive = new_recursive.prevrec;
1737 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1738 nigel 77 {
1739 nigel 87 DPRINTF(("Recursion matched\n"));
1740 nigel 77 if (new_recursive.offset_save != stacksave)
1741 ph10 836 (PUBL(free))(new_recursive.offset_save);
1742 ph10 618
1743     /* Set where we got to in the subject, and reset the start in case
1744 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1745     for Perl compatibility. */
1746    
1747 ph10 618 eptr = md->end_match_ptr;
1748     mstart = md->start_match_ptr;
1749     goto RECURSION_MATCHED; /* Exit loop; end processing */
1750 nigel 77 }
1751 ph10 716
1752     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1753     as NOMATCH. */
1754    
1755 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1756 nigel 87 {
1757     DPRINTF(("Recursion gave error %d\n", rrc));
1758 ph10 400 if (new_recursive.offset_save != stacksave)
1759 ph10 836 (PUBL(free))(new_recursive.offset_save);
1760 nigel 87 RRETURN(rrc);
1761     }
1762 nigel 77
1763     md->recursive = &new_recursive;
1764     callpat += GET(callpat, 1);
1765     }
1766     while (*callpat == OP_ALT);
1767    
1768     DPRINTF(("Recursion didn't match\n"));
1769     md->recursive = new_recursive.prevrec;
1770     if (new_recursive.offset_save != stacksave)
1771 ph10 836 (PUBL(free))(new_recursive.offset_save);
1772     RRETURN(MATCH_NOMATCH);
1773 nigel 77 }
1774 ph10 625
1775 ph10 618 RECURSION_MATCHED:
1776     break;
1777 nigel 77
1778     /* An alternation is the end of a branch; scan along to find the end of the
1779     bracketed group and go to there. */
1780    
1781     case OP_ALT:
1782     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1783     break;
1784    
1785 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1786     indicating that it may occur zero times. It may repeat infinitely, or not
1787     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1788     with fixed upper repeat limits are compiled as a number of copies, with the
1789     optional ones preceded by BRAZERO or BRAMINZERO. */
1790 ph10 625
1791 nigel 77 case OP_BRAZERO:
1792 ph10 604 next = ecode + 1;
1793     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1794     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795     do next += GET(next, 1); while (*next == OP_ALT);
1796     ecode = next + 1 + LINK_SIZE;
1797 nigel 77 break;
1798 ph10 625
1799 nigel 77 case OP_BRAMINZERO:
1800 ph10 604 next = ecode + 1;
1801     do next += GET(next, 1); while (*next == OP_ALT);
1802     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1803     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1804     ecode++;
1805 nigel 77 break;
1806    
1807 ph10 335 case OP_SKIPZERO:
1808 ph10 604 next = ecode+1;
1809     do next += GET(next,1); while (*next == OP_ALT);
1810     ecode = next + 1 + LINK_SIZE;
1811 ph10 335 break;
1812 ph10 625
1813 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1814     here; just jump to the group, with allow_zero set TRUE. */
1815 ph10 625
1816 ph10 604 case OP_BRAPOSZERO:
1817 ph10 625 op = *(++ecode);
1818 ph10 604 allow_zero = TRUE;
1819     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1820     goto POSSESSIVE_NON_CAPTURE;
1821 ph10 335
1822 nigel 93 /* End of a group, repeated or non-repeating. */
1823 nigel 77
1824     case OP_KET:
1825     case OP_KETRMIN:
1826     case OP_KETRMAX:
1827 ph10 625 case OP_KETRPOS:
1828 nigel 91 prev = ecode - GET(ecode, 1);
1829 ph10 625
1830 nigel 93 /* If this was a group that remembered the subject start, in order to break
1831     infinite repeats of empty string matches, retrieve the subject start from
1832     the chain. Otherwise, set it NULL. */
1833 nigel 77
1834 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1835 nigel 93 {
1836     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1837     eptrb = eptrb->epb_prev; /* Backup to previous group */
1838     }
1839     else saved_eptr = NULL;
1840 nigel 77
1841 ph10 733 /* If we are at the end of an assertion group or a non-capturing atomic
1842 ph10 723 group, stop matching and return MATCH_MATCH, but record the current high
1843     water mark for use by positive assertions. We also need to record the match
1844     start in case it was changed by \K. */
1845 nigel 93
1846 ph10 723 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1847 ph10 733 *prev == OP_ONCE_NC)
1848 nigel 91 {
1849 ph10 723 md->end_match_ptr = eptr; /* For ONCE_NC */
1850 nigel 91 md->end_offset_top = offset_top;
1851 ph10 500 md->start_match_ptr = mstart;
1852 ph10 836 RRETURN(MATCH_MATCH); /* Sets md->mark */
1853 nigel 91 }
1854 nigel 77
1855 nigel 93 /* For capturing groups we have to check the group number back at the start
1856     and if necessary complete handling an extraction by setting the offsets and
1857 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1858     into group 0, so it won't be picked up here. Instead, we catch it when the
1859     OP_END is reached. Other recursion is handled here. We just have to record
1860     the current subject position and start match pointer and give a MATCH
1861     return. */
1862 nigel 77
1863 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1864     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1865 nigel 91 {
1866 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1867 nigel 91 offset = number << 1;
1868 ph10 461
1869 ph10 475 #ifdef PCRE_DEBUG
1870 nigel 91 printf("end bracket %d", number);
1871     printf("\n");
1872 nigel 77 #endif
1873    
1874 ph10 618 /* Handle a recursively called group. */
1875    
1876     if (md->recursive != NULL && md->recursive->group_num == number)
1877     {
1878     md->end_match_ptr = eptr;
1879     md->start_match_ptr = mstart;
1880     RRETURN(MATCH_MATCH);
1881     }
1882    
1883     /* Deal with capturing */
1884    
1885 nigel 93 md->capture_last = number;
1886     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1887 nigel 91 {
1888 ph10 625 /* If offset is greater than offset_top, it means that we are
1889     "skipping" a capturing group, and that group's offsets must be marked
1890     unset. In earlier versions of PCRE, all the offsets were unset at the
1891     start of matching, but this doesn't work because atomic groups and
1892 ph10 615 assertions can cause a value to be set that should later be unset.
1893     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1894 ph10 625 part of the atomic group, but this is not on the final matching path,
1895     so must be unset when 2 is set. (If there is no group 2, there is no
1896 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1897 ph10 625
1898 ph10 615 if (offset > offset_top)
1899     {
1900     register int *iptr = md->offset_vector + offset_top;
1901     register int *iend = md->offset_vector + offset;
1902     while (iptr < iend) *iptr++ = -1;
1903 ph10 625 }
1904    
1905 ph10 615 /* Now make the extraction */
1906    
1907 nigel 93 md->offset_vector[offset] =
1908     md->offset_vector[md->offset_end - number];
1909 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1910 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1911     }
1912 nigel 91 }
1913 nigel 77
1914 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1915     also happens for a repeating ket if no characters were matched in the
1916     group. This is the forcible breaking of infinite loops as implemented in
1917 ph10 723 Perl 5.005. For a non-repeating atomic group that includes captures,
1918     establish a backup point by processing the rest of the pattern at a lower
1919     level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1920     original OP_ONCE level, thereby bypassing intermediate backup points, but
1921     resetting any captures that happened along the way. */
1922 nigel 77
1923 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1924     {
1925 ph10 618 if (*prev == OP_ONCE)
1926     {
1927     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1928     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1929     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1930 ph10 625 RRETURN(MATCH_ONCE);
1931     }
1932 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1933 nigel 91 break;
1934     }
1935 ph10 625
1936     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1937 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1938     at a time from the outer level, thus saving stack. */
1939 ph10 625
1940 ph10 604 if (*ecode == OP_KETRPOS)
1941 ph10 625 {
1942 ph10 604 md->end_match_ptr = eptr;
1943 ph10 625 md->end_offset_top = offset_top;
1944 ph10 604 RRETURN(MATCH_KETRPOS);
1945 ph10 625 }
1946 nigel 77
1947 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1948     the preceding bracket, in the appropriate order. In the second case, we can
1949     use tail recursion to avoid using another stack frame, unless we have an
1950 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1951     string. */
1952 nigel 77
1953 nigel 91 if (*ecode == OP_KETRMIN)
1954     {
1955 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1956 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1957 ph10 618 if (*prev == OP_ONCE)
1958     {
1959 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1960 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1961     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1962 ph10 625 RRETURN(MATCH_ONCE);
1963     }
1964 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1965 ph10 197 {
1966 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1967 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1968 ph10 197 RRETURN(rrc);
1969     }
1970 nigel 91 ecode = prev;
1971     goto TAIL_RECURSE;
1972 nigel 77 }
1973 nigel 91 else /* OP_KETRMAX */
1974     {
1975 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1976 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1977 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1978 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1979 ph10 618 if (*prev == OP_ONCE)
1980     {
1981 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1982 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983     md->once_target = prev;
1984 ph10 625 RRETURN(MATCH_ONCE);
1985     }
1986 nigel 91 ecode += 1 + LINK_SIZE;
1987     goto TAIL_RECURSE;
1988     }
1989     /* Control never gets here */
1990 nigel 77
1991 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1992 nigel 77
1993     case OP_CIRC:
1994 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1995 ph10 625
1996 nigel 77 /* Start of subject assertion */
1997    
1998     case OP_SOD:
1999 ph10 836 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2000 nigel 77 ecode++;
2001     break;
2002 ph10 625
2003 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
2004 nigel 77
2005 ph10 602 case OP_CIRCM:
2006 ph10 836 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2007 ph10 602 if (eptr != md->start_subject &&
2008     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2009 ph10 836 RRETURN(MATCH_NOMATCH);
2010 ph10 602 ecode++;
2011     break;
2012    
2013 nigel 77 /* Start of match assertion */
2014    
2015     case OP_SOM:
2016 ph10 836 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2017 nigel 77 ecode++;
2018     break;
2019 ph10 172
2020 ph10 168 /* Reset the start of match point */
2021 ph10 172
2022 ph10 168 case OP_SET_SOM:
2023     mstart = eptr;
2024 ph10 172 ecode++;
2025     break;
2026 nigel 77
2027 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
2028     unless noteol is set. */
2029 nigel 77
2030 ph10 602 case OP_DOLLM:
2031     if (eptr < md->end_subject)
2032 ph10 836 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2033 ph10 602 else
2034 nigel 77 {
2035 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2036 ph10 602 SCHECK_PARTIAL();
2037 nigel 77 }
2038 ph10 602 ecode++;
2039     break;
2040 ph10 579
2041 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
2042 ph10 602 subject unless noteol is set. */
2043    
2044     case OP_DOLL:
2045 ph10 836 if (md->noteol) RRETURN(MATCH_NOMATCH);
2046 ph10 602 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2047    
2048 nigel 91 /* ... else fall through for endonly */
2049 nigel 77
2050     /* End of subject assertion (\z) */
2051    
2052     case OP_EOD:
2053 ph10 836 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2054 ph10 553 SCHECK_PARTIAL();
2055 nigel 77 ecode++;
2056     break;
2057    
2058     /* End of subject or ending \n assertion (\Z) */
2059    
2060     case OP_EODN:
2061 ph10 553 ASSERT_NL_OR_EOS:
2062     if (eptr < md->end_subject &&
2063 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2064 ph10 836 RRETURN(MATCH_NOMATCH);
2065 ph10 579
2066 ph10 553 /* Either at end of string or \n before end. */
2067 ph10 579
2068 ph10 553 SCHECK_PARTIAL();
2069 nigel 77 ecode++;
2070     break;
2071    
2072     /* Word boundary assertions */
2073    
2074     case OP_NOT_WORD_BOUNDARY:
2075     case OP_WORD_BOUNDARY:
2076     {
2077    
2078     /* Find out if the previous and current characters are "word" characters.
2079     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2080 ph10 443 be "non-word" characters. Remember the earliest consulted character for
2081 ph10 435 partial matching. */
2082 nigel 77
2083 ph10 836 #ifdef SUPPORT_UTF
2084     if (utf)
2085 nigel 77 {
2086 ph10 518 /* Get status of previous character */
2087 ph10 527
2088 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
2089     {
2090 ph10 836 PCRE_PUCHAR lastptr = eptr - 1;
2091     BACKCHAR(lastptr);
2092 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2093 nigel 77 GETCHAR(c, lastptr);
2094 ph10 527 #ifdef SUPPORT_UCP
2095 ph10 518 if (md->use_ucp)
2096     {
2097     if (c == '_') prev_is_word = TRUE; else
2098 ph10 527 {
2099 ph10 518 int cat = UCD_CATEGORY(c);
2100     prev_is_word = (cat == ucp_L || cat == ucp_N);
2101 ph10 527 }
2102     }
2103     else
2104     #endif
2105 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2106     }
2107 ph10 527
2108 ph10 518 /* Get status of next character */
2109 ph10 527
2110 ph10 443 if (eptr >= md->end_subject)
2111 nigel 77 {
2112 ph10 443 SCHECK_PARTIAL();
2113     cur_is_word = FALSE;
2114 ph10 428 }
2115     else
2116     {
2117 nigel 77 GETCHAR(c, eptr);
2118 ph10 527 #ifdef SUPPORT_UCP
2119 ph10 518 if (md->use_ucp)
2120     {
2121     if (c == '_') cur_is_word = TRUE; else
2122 ph10 527 {
2123 ph10 518 int cat = UCD_CATEGORY(c);
2124     cur_is_word = (cat == ucp_L || cat == ucp_N);
2125 ph10 527 }
2126     }
2127     else
2128     #endif
2129 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2130     }
2131     }
2132     else
2133     #endif
2134    
2135 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2136 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2137 nigel 77
2138     {
2139 ph10 518 /* Get status of previous character */
2140 ph10 527
2141 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2142     {
2143 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2144 ph10 527 #ifdef SUPPORT_UCP
2145 ph10 518 if (md->use_ucp)
2146     {
2147 ph10 527 c = eptr[-1];
2148 ph10 518 if (c == '_') prev_is_word = TRUE; else
2149 ph10 527 {
2150 ph10 518 int cat = UCD_CATEGORY(c);
2151     prev_is_word = (cat == ucp_L || cat == ucp_N);
2152 ph10 527 }
2153     }
2154     else
2155     #endif
2156 ph10 836 prev_is_word = MAX_255(eptr[-1])
2157     && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2158 ph10 435 }
2159 ph10 527
2160 ph10 518 /* Get status of next character */
2161 ph10 527
2162 ph10 443 if (eptr >= md->end_subject)
2163 ph10 428 {
2164 ph10 443 SCHECK_PARTIAL();
2165     cur_is_word = FALSE;
2166 ph10 428 }
2167 ph10 527 else
2168     #ifdef SUPPORT_UCP
2169 ph10 518 if (md->use_ucp)
2170     {
2171 ph10 527 c = *eptr;
2172 ph10 518 if (c == '_') cur_is_word = TRUE; else
2173 ph10 527 {
2174 ph10 518 int cat = UCD_CATEGORY(c);
2175     cur_is_word = (cat == ucp_L || cat == ucp_N);
2176 ph10 527 }
2177     }
2178     else
2179     #endif
2180 ph10 836 cur_is_word = MAX_255(*eptr)
2181     && ((md->ctypes[*eptr] & ctype_word) != 0);
2182 nigel 77 }
2183    
2184     /* Now see if the situation is what we want */
2185    
2186     if ((*ecode++ == OP_WORD_BOUNDARY)?
2187     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2188 ph10 836 RRETURN(MATCH_NOMATCH);
2189 nigel 77 }
2190     break;
2191    
2192     /* Match a single character type; inline for speed */
2193    
2194     case OP_ANY:
2195 ph10 836 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2196 ph10 345 /* Fall through */
2197    
2198 ph10 341 case OP_ALLANY:
2199 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2200     { /* not be updated before SCHECK_PARTIAL. */
2201 ph10 443 SCHECK_PARTIAL();
2202 ph10 836 RRETURN(MATCH_NOMATCH);
2203 ph10 443 }
2204 ph10 648 eptr++;
2205 ph10 836 #ifdef SUPPORT_UTF
2206     if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2207     #endif
2208 nigel 77 ecode++;
2209     break;
2210    
2211     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2212     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2213    
2214     case OP_ANYBYTE:
2215 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2216     { /* not be updated before SCHECK_PARTIAL. */
2217 ph10 443 SCHECK_PARTIAL();
2218 ph10 836 RRETURN(MATCH_NOMATCH);
2219 ph10 443 }
2220 ph10 654 eptr++;
2221 nigel 77 ecode++;
2222     break;
2223    
2224     case OP_NOT_DIGIT:
2225 ph10 443 if (eptr >= md->end_subject)
2226 ph10 428 {
2227 ph10 443 SCHECK_PARTIAL();
2228 ph10 836 RRETURN(MATCH_NOMATCH);
2229 ph10 443 }
2230 nigel 77 GETCHARINCTEST(c, eptr);
2231     if (
2232 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2233 nigel 77 c < 256 &&
2234     #endif
2235     (md->ctypes[c] & ctype_digit) != 0
2236     )
2237 ph10 836 RRETURN(MATCH_NOMATCH);
2238 nigel 77 ecode++;
2239     break;
2240    
2241     case OP_DIGIT:
2242 ph10 443 if (eptr >= md->end_subject)
2243 ph10 428 {
2244 ph10 443 SCHECK_PARTIAL();
2245 ph10 836 RRETURN(MATCH_NOMATCH);
2246 ph10 443 }
2247 nigel 77 GETCHARINCTEST(c, eptr);
2248     if (
2249 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2250     c > 255 ||
2251 nigel 77 #endif
2252     (md->ctypes[c] & ctype_digit) == 0
2253     )
2254 ph10 836 RRETURN(MATCH_NOMATCH);
2255 nigel 77 ecode++;
2256     break;
2257    
2258     case OP_NOT_WHITESPACE:
2259 ph10 443 if (eptr >= md->end_subject)
2260 ph10 428 {
2261 ph10 443 SCHECK_PARTIAL();
2262 ph10 836 RRETURN(MATCH_NOMATCH);
2263 ph10 443 }
2264 nigel 77 GETCHARINCTEST(c, eptr);
2265     if (
2266 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2267 nigel 77 c < 256 &&
2268     #endif
2269     (md->ctypes[c] & ctype_space) != 0
2270     )
2271 ph10 836 RRETURN(MATCH_NOMATCH);
2272 nigel 77 ecode++;
2273     break;
2274    
2275     case OP_WHITESPACE:
2276 ph10 443 if (eptr >= md->end_subject)
2277 ph10 428 {
2278 ph10 443 SCHECK_PARTIAL();
2279 ph10 836 RRETURN(MATCH_NOMATCH);
2280 ph10 443 }
2281 nigel 77 GETCHARINCTEST(c, eptr);
2282     if (
2283 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2284     c > 255 ||
2285 nigel 77 #endif
2286     (md->ctypes[c] & ctype_space) == 0
2287     )
2288 ph10 836 RRETURN(MATCH_NOMATCH);
2289 nigel 77 ecode++;
2290     break;
2291    
2292     case OP_NOT_WORDCHAR:
2293 ph10 443 if (eptr >= md->end_subject)
2294 ph10 428 {
2295 ph10 443 SCHECK_PARTIAL();
2296 ph10 836 RRETURN(MATCH_NOMATCH);
2297 ph10 443 }
2298 nigel 77 GETCHARINCTEST(c, eptr);
2299     if (
2300 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2301 nigel 77 c < 256 &&
2302     #endif
2303     (md->ctypes[c] & ctype_word) != 0
2304     )
2305 ph10 836 RRETURN(MATCH_NOMATCH);
2306 nigel 77 ecode++;
2307     break;
2308    
2309     case OP_WORDCHAR:
2310 ph10 443 if (eptr >= md->end_subject)
2311 ph10 428 {
2312 ph10 443 SCHECK_PARTIAL();
2313 ph10 836 RRETURN(MATCH_NOMATCH);
2314 ph10 443 }
2315 nigel 77 GETCHARINCTEST(c, eptr);
2316     if (
2317 ph10 836 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2318     c > 255 ||
2319 nigel 77 #endif
2320     (md->ctypes[c] & ctype_word) == 0
2321     )
2322 ph10 836 RRETURN(MATCH_NOMATCH);
2323 nigel 77 ecode++;
2324     break;
2325    
2326 nigel 93 case OP_ANYNL:
2327 ph10 443 if (eptr >= md->end_subject)
2328 ph10 428 {
2329 ph10 443 SCHECK_PARTIAL();
2330 ph10 836 RRETURN(MATCH_NOMATCH);
2331 ph10 443 }
2332 nigel 93 GETCHARINCTEST(c, eptr);
2333     switch(c)
2334     {
2335 ph10 836 default: RRETURN(MATCH_NOMATCH);
2336 ph10 625
2337 nigel 93 case 0x000d:
2338     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2339     break;
2340 ph10 231
2341 nigel 93 case 0x000a:
2342 ph10 231 break;
2343    
2344 nigel 93 case 0x000b:
2345     case 0x000c:
2346     case 0x0085:
2347     case 0x2028:
2348     case 0x2029:
2349 ph10 836 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2350 nigel 93 break;
2351     }
2352     ecode++;
2353     break;
2354    
2355 ph10 178 case OP_NOT_HSPACE:
2356 ph10 443 if (eptr >= md->end_subject)
2357 ph10 428 {
2358 ph10 443 SCHECK_PARTIAL();
2359 ph10 836 RRETURN(MATCH_NOMATCH);
2360 ph10 443 }
2361 ph10 178 GETCHARINCTEST(c, eptr);
2362     switch(c)
2363     {
2364     default: break;
2365     case 0x09: /* HT */
2366     case 0x20: /* SPACE */
2367     case 0xa0: /* NBSP */
2368     case 0x1680: /* OGHAM SPACE MARK */
2369     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2370     case 0x2000: /* EN QUAD */
2371     case 0x2001: /* EM QUAD */
2372     case 0x2002: /* EN SPACE */
2373     case 0x2003: /* EM SPACE */
2374     case 0x2004: /* THREE-PER-EM SPACE */
2375     case 0x2005: /* FOUR-PER-EM SPACE */
2376     case 0x2006: /* SIX-PER-EM SPACE */
2377     case 0x2007: /* FIGURE SPACE */
2378     case 0x2008: /* PUNCTUATION SPACE */
2379     case 0x2009: /* THIN SPACE */
2380     case 0x200A: /* HAIR SPACE */
2381     case 0x202f: /* NARROW NO-BREAK SPACE */
2382     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2383     case 0x3000: /* IDEOGRAPHIC SPACE */
2384 ph10 836 RRETURN(MATCH_NOMATCH);
2385 ph10 178 }
2386     ecode++;
2387     break;
2388    
2389     case OP_HSPACE:
2390 ph10 443 if (eptr >= md->end_subject)
2391 ph10 428 {
2392 ph10 443 SCHECK_PARTIAL();
2393 ph10 836 RRETURN(MATCH_NOMATCH);
2394 ph10 443 }
2395 ph10 178 GETCHARINCTEST(c, eptr);
2396     switch(c)
2397     {
2398 ph10 836 default: RRETURN(MATCH_NOMATCH);
2399 ph10 178 case 0x09: /* HT */
2400     case 0x20: /* SPACE */
2401     case 0xa0: /* NBSP */
2402     case 0x1680: /* OGHAM SPACE MARK */
2403     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2404     case 0x2000: /* EN QUAD */
2405     case 0x2001: /* EM QUAD */
2406     case 0x2002: /* EN SPACE */
2407     case 0x2003: /* EM SPACE */
2408     case 0x2004: /* THREE-PER-EM SPACE */
2409     case 0x2005: /* FOUR-PER-EM SPACE */
2410     case 0x2006: /* SIX-PER-EM SPACE */
2411     case 0x2007: /* FIGURE SPACE */
2412     case 0x2008: /* PUNCTUATION SPACE */
2413     case 0x2009: /* THIN SPACE */
2414     case 0x200A: /* HAIR SPACE */
2415     case 0x202f: /* NARROW NO-BREAK SPACE */
2416     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2417     case 0x3000: /* IDEOGRAPHIC SPACE */
2418     break;
2419     }
2420     ecode++;
2421     break;
2422    
2423     case OP_NOT_VSPACE:
2424 ph10 443 if (eptr >= md->end_subject)
2425 ph10 428 {
2426 ph10 443 SCHECK_PARTIAL();
2427 ph10 836 RRETURN(MATCH_NOMATCH);
2428 ph10 443 }
2429 ph10 178 GETCHARINCTEST(c, eptr);
2430     switch(c)
2431     {
2432     default: break;
2433     case 0x0a: /* LF */
2434     case 0x0b: /* VT */
2435     case 0x0c: /* FF */
2436     case 0x0d: /* CR */
2437     case 0x85: /* NEL */
2438     case 0x2028: /* LINE SEPARATOR */
2439     case 0x2029: /* PARAGRAPH SEPARATOR */
2440 ph10 836 RRETURN(MATCH_NOMATCH);
2441 ph10 178 }
2442     ecode++;
2443     break;
2444    
2445     case OP_VSPACE:
2446 ph10 443 if (eptr >= md->end_subject)
2447 ph10 428 {
2448 ph10 443 SCHECK_PARTIAL();
2449 ph10 836 RRETURN(MATCH_NOMATCH);
2450 ph10 443 }
2451 ph10 178 GETCHARINCTEST(c, eptr);
2452     switch(c)
2453     {
2454 ph10 836 default: RRETURN(MATCH_NOMATCH);
2455 ph10 178 case 0x0a: /* LF */
2456     case 0x0b: /* VT */
2457     case 0x0c: /* FF */
2458     case 0x0d: /* CR */
2459     case 0x85: /* NEL */
2460     case 0x2028: /* LINE SEPARATOR */
2461     case 0x2029: /* PARAGRAPH SEPARATOR */
2462     break;
2463     }
2464     ecode++;
2465     break;
2466    
2467 nigel 77 #ifdef SUPPORT_UCP
2468     /* Check the next character by Unicode property. We will get here only
2469     if the support is in the binary; otherwise a compile-time error occurs. */
2470    
2471     case OP_PROP:
2472     case OP_NOTPROP:
2473 ph10 443 if (eptr >= md->end_subject)
2474 ph10 428 {
2475 ph10 443 SCHECK_PARTIAL();
2476 ph10 836 RRETURN(MATCH_NOMATCH);
2477 ph10 443 }
2478 nigel 77 GETCHARINCTEST(c, eptr);
2479     {
2480 ph10 384 const ucd_record *prop = GET_UCD(c);
2481 nigel 77
2482 nigel 87 switch(ecode[1])
2483     {
2484     case PT_ANY:
2485 ph10 836 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2486 nigel 87 break;
2487 nigel 77
2488 nigel 87 case PT_LAMP:
2489 ph10 349 if ((prop->chartype == ucp_Lu ||
2490     prop->chartype == ucp_Ll ||
2491     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2492 ph10 836 RRETURN(MATCH_NOMATCH);
2493 ph10 517 break;
2494 nigel 87
2495     case PT_GC:
2496 ph10 836 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2497     RRETURN(MATCH_NOMATCH);
2498 nigel 87 break;
2499    
2500     case PT_PC:
2501 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2502 ph10 836 RRETURN(MATCH_NOMATCH);
2503 nigel 87 break;
2504    
2505     case PT_SC:
2506 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2507 ph10 836 RRETURN(MATCH_NOMATCH);
2508 nigel 87 break;
2509 ph10 527
2510 ph10 517 /* These are specials */
2511 ph10 527
2512 ph10 517 case PT_ALNUM:
2513 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2514     PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2515     RRETURN(MATCH_NOMATCH);
2516 ph10 527 break;
2517    
2518 ph10 517 case PT_SPACE: /* Perl space */
2519 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2520 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2521     == (op == OP_NOTPROP))
2522 ph10 836 RRETURN(MATCH_NOMATCH);
2523 ph10 527 break;
2524    
2525 ph10 517 case PT_PXSPACE: /* POSIX space */
2526 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2527 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2528 ph10 517 c == CHAR_FF || c == CHAR_CR)
2529     == (op == OP_NOTPROP))
2530 ph10 836 RRETURN(MATCH_NOMATCH);
2531 ph10 527 break;
2532 nigel 87
2533 ph10 527 case PT_WORD:
2534 ph10 836 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2535     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2536 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2537 ph10 836 RRETURN(MATCH_NOMATCH);
2538 ph10 527 break;
2539    
2540 ph10 517 /* This should never occur */
2541    
2542 nigel 87 default:
2543     RRETURN(PCRE_ERROR_INTERNAL);
2544 nigel 77 }
2545 nigel 87
2546     ecode += 3;
2547 nigel 77 }
2548     break;
2549    
2550     /* Match an extended Unicode sequence. We will get here only if the support
2551     is in the binary; otherwise a compile-time error occurs. */
2552    
2553     case OP_EXTUNI:
2554 ph10 443 if (eptr >= md->end_subject)
2555 ph10 428 {
2556 ph10 443 SCHECK_PARTIAL();
2557 ph10 836 RRETURN(MATCH_NOMATCH);
2558 ph10 443 }
2559 nigel 77 GETCHARINCTEST(c, eptr);
2560 ph10 836 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2561 ph10 623 while (eptr < md->end_subject)
2562 nigel 77 {
2563 ph10 623 int len = 1;
2564 ph10 836 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2565 ph10 623 if (UCD_CATEGORY(c) != ucp_M) break;
2566     eptr += len;
2567 nigel 77 }
2568     ecode++;
2569     break;
2570     #endif
2571    
2572    
2573     /* Match a back reference, possibly repeatedly. Look past the end of the
2574     item to see if there is repeat information following. The code is similar
2575     to that for character classes, but repeated for efficiency. Then obey
2576     similar code to character type repeats - written out again for speed.
2577     However, if the referenced string is the empty string, always treat
2578     it as matched, any number of times (otherwise there could be infinite
2579     loops). */
2580    
2581     case OP_REF:
2582 ph10 625 case OP_REFI:
2583     caseless = op == OP_REFI;
2584 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2585 ph10 836 ecode += 1 + IMM2_SIZE;
2586 ph10 345
2587 ph10 595 /* If the reference is unset, there are two possibilities:
2588 ph10 345
2589 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2590     this ensures that every attempt at a match fails. We can't just fail
2591     here, because of the possibility of quantifiers with zero minima.
2592 ph10 345
2593 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2594     so that the back reference matches an empty string.
2595 ph10 345
2596 ph10 595 Otherwise, set the length to the length of what was matched by the
2597     referenced subpattern. */
2598 ph10 345
2599 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2600     length = (md->jscript_compat)? 0 : -1;
2601     else
2602     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2603 nigel 77
2604 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2605 nigel 77
2606 ph10 595 switch (*ecode)
2607     {
2608     case OP_CRSTAR:
2609     case OP_CRMINSTAR:
2610     case OP_CRPLUS:
2611     case OP_CRMINPLUS:
2612     case OP_CRQUERY:
2613     case OP_CRMINQUERY:
2614     c = *ecode++ - OP_CRSTAR;
2615     minimize = (c & 1) != 0;
2616     min = rep_min[c]; /* Pick up values from tables; */
2617     max = rep_max[c]; /* zero for max => infinity */
2618     if (max == 0) max = INT_MAX;
2619     break;
2620 nigel 77
2621 ph10 595 case OP_CRRANGE:
2622     case OP_CRMINRANGE:
2623     minimize = (*ecode == OP_CRMINRANGE);
2624     min = GET2(ecode, 1);
2625 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2626 ph10 595 if (max == 0) max = INT_MAX;
2627 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2628 ph10 595 break;
2629 nigel 77
2630 ph10 595 default: /* No repeat follows */
2631 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2632 ph10 595 {
2633     CHECK_PARTIAL();
2634 ph10 836 RRETURN(MATCH_NOMATCH);
2635 nigel 77 }
2636 ph10 595 eptr += length;
2637     continue; /* With the main loop */
2638     }
2639 nigel 77
2640 ph10 595 /* Handle repeated back references. If the length of the reference is
2641 ph10 836 zero, just continue with the main loop. If the length is negative, it
2642 ph10 842 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2643     zero, we can continue at the same level without recursion. For any other
2644 ph10 836 minimum, carrying on will result in NOMATCH. */
2645 ph10 443
2646 ph10 595 if (length == 0) continue;
2647 ph10 836 if (length < 0 && min == 0) continue;
2648 nigel 77
2649 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2650     the length of the reference string explicitly rather than passing the
2651     address of eptr, so that eptr can be a register variable. */
2652 nigel 77
2653 ph10 595 for (i = 1; i <= min; i++)
2654     {
2655 ph10 625 int slength;
2656 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2657 nigel 77 {
2658 ph10 595 CHECK_PARTIAL();
2659 ph10 836 RRETURN(MATCH_NOMATCH);
2660 nigel 77 }
2661 ph10 595 eptr += slength;
2662     }
2663 nigel 77
2664 ph10 595 /* If min = max, continue at the same level without recursion.
2665     They are not both allowed to be zero. */
2666 nigel 77
2667 ph10 595 if (min == max) continue;
2668 nigel 77
2669 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2670 nigel 77
2671 ph10 595 if (minimize)
2672     {
2673     for (fi = min;; fi++)
2674 nigel 77 {
2675 ph10 625 int slength;
2676 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2677 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2679 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2680 nigel 77 {
2681 ph10 595 CHECK_PARTIAL();
2682 ph10 836 RRETURN(MATCH_NOMATCH);
2683 nigel 77 }
2684 ph10 595 eptr += slength;
2685 nigel 77 }
2686 ph10 595 /* Control never gets here */
2687     }
2688 nigel 77
2689 ph10 595 /* If maximizing, find the longest string and work backwards */
2690 nigel 77
2691 ph10 595 else
2692     {
2693     pp = eptr;
2694     for (i = min; i < max; i++)
2695 nigel 77 {
2696 ph10 625 int slength;
2697 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2698 nigel 77 {
2699 ph10 595 CHECK_PARTIAL();
2700     break;
2701 nigel 77 }
2702 ph10 595 eptr += slength;
2703 nigel 77 }
2704 ph10 595 while (eptr >= pp)
2705     {
2706 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2707 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2708     eptr -= length;
2709     }
2710 ph10 836 RRETURN(MATCH_NOMATCH);
2711 nigel 77 }
2712     /* Control never gets here */
2713    
2714     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2715     used when all the characters in the class have values in the range 0-255,
2716     and either the matching is caseful, or the characters are in the range
2717     0-127 when UTF-8 processing is enabled. The only difference between
2718     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2719     encountered.
2720    
2721     First, look past the end of the item to see if there is repeat information
2722     following. Then obey similar code to character type repeats - written out
2723     again for speed. */
2724    
2725     case OP_NCLASS:
2726     case OP_CLASS:
2727     {
2728 ph10 836 /* The data variable is saved across frames, so the byte map needs to
2729     be stored there. */
2730     #define BYTE_MAP ((pcre_uint8 *)data)
2731 nigel 77 data = ecode + 1; /* Save for matching */
2732 ph10 836 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2733 nigel 77
2734     switch (*ecode)
2735     {
2736     case OP_CRSTAR:
2737     case OP_CRMINSTAR:
2738     case OP_CRPLUS:
2739     case OP_CRMINPLUS:
2740     case OP_CRQUERY:
2741     case OP_CRMINQUERY:
2742     c = *ecode++ - OP_CRSTAR;
2743     minimize = (c & 1) != 0;
2744     min = rep_min[c]; /* Pick up values from tables; */
2745     max = rep_max[c]; /* zero for max => infinity */
2746     if (max == 0) max = INT_MAX;
2747     break;
2748    
2749     case OP_CRRANGE:
2750     case OP_CRMINRANGE:
2751     minimize = (*ecode == OP_CRMINRANGE);
2752     min = GET2(ecode, 1);
2753 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2754 nigel 77 if (max == 0) max = INT_MAX;
2755 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2756 nigel 77 break;
2757    
2758     default: /* No repeat follows */
2759     min = max = 1;
2760     break;
2761     }
2762    
2763     /* First, ensure the minimum number of matches are present. */
2764    
2765 ph10 836 #ifdef SUPPORT_UTF
2766     if (utf)
2767 nigel 77 {
2768     for (i = 1; i <= min; i++)
2769     {
2770 ph10 427 if (eptr >= md->end_subject)
2771 ph10 426 {
2772 ph10 428 SCHECK_PARTIAL();
2773 ph10 836 RRETURN(MATCH_NOMATCH);
2774 ph10 427 }
2775 nigel 77 GETCHARINC(c, eptr);
2776     if (c > 255)
2777     {
2778 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2779 nigel 77 }
2780     else
2781 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2782 nigel 77 }
2783     }
2784     else
2785     #endif
2786 ph10 836 /* Not UTF mode */
2787 nigel 77 {
2788     for (i = 1; i <= min; i++)
2789     {
2790 ph10 427 if (eptr >= md->end_subject)
2791 ph10 426 {
2792 ph10 428 SCHECK_PARTIAL();
2793 ph10 836 RRETURN(MATCH_NOMATCH);
2794 ph10 427 }
2795 nigel 77 c = *eptr++;
2796 ph10 836 #ifndef COMPILE_PCRE8
2797     if (c > 255)
2798     {
2799     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2800     }
2801     else
2802     #endif
2803     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2804 nigel 77 }
2805     }
2806    
2807     /* If max == min we can continue with the main loop without the
2808     need to recurse. */
2809    
2810     if (min == max) continue;
2811    
2812     /* If minimizing, keep testing the rest of the expression and advancing
2813     the pointer while it matches the class. */
2814    
2815     if (minimize)
2816     {
2817 ph10 836 #ifdef SUPPORT_UTF
2818     if (utf)
2819 nigel 77 {
2820     for (fi = min;; fi++)
2821     {
2822 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2823 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2825 ph10 427 if (eptr >= md->end_subject)
2826 ph10 426 {
2827 ph10 427 SCHECK_PARTIAL();
2828 ph10 836 RRETURN(MATCH_NOMATCH);
2829 ph10 427 }
2830 nigel 77 GETCHARINC(c, eptr);
2831     if (c > 255)
2832     {
2833 ph10 836 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2834 nigel 77 }
2835     else
2836 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2837 nigel 77 }
2838     }
2839     else
2840     #endif
2841 ph10 836 /* Not UTF mode */
2842 nigel 77 {
2843     for (fi = min;; fi++)
2844     {
2845 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2846 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2847 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
2848 ph10 427 if (eptr >= md->end_subject)
2849 ph10 426 {
2850 ph10 427 SCHECK_PARTIAL();
2851 ph10 836 RRETURN(MATCH_NOMATCH);
2852 ph10 427 }
2853 nigel 77 c = *eptr++;
2854 ph10 836 #ifndef COMPILE_PCRE8
2855     if (c > 255)
2856     {
2857     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2858     }
2859     else
2860     #endif
2861     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2862 nigel 77 }
2863     }
2864     /* Control never gets here */
2865     }
2866    
2867     /* If maximizing, find the longest possible run, then work backwards. */
2868    
2869     else
2870     {
2871     pp = eptr;
2872    
2873 ph10 836 #ifdef SUPPORT_UTF
2874     if (utf)
2875 nigel 77 {
2876     for (i = min; i < max; i++)
2877     {
2878     int len = 1;
2879 ph10 463 if (eptr >= md->end_subject)
2880 ph10 462 {
2881 ph10 463 SCHECK_PARTIAL();
2882 ph10 462 break;
2883 ph10 463 }
2884 nigel 77 GETCHARLEN(c, eptr, len);
2885     if (c > 255)
2886     {
2887     if (op == OP_CLASS) break;
2888     }
2889     else
2890 ph10 836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2891 nigel 77 eptr += len;
2892     }
2893     for (;;)
2894     {
2895 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2896 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2897     if (eptr-- == pp) break; /* Stop if tried at original pos */
2898     BACKCHAR(eptr);
2899     }
2900     }
2901     else
2902     #endif
2903 ph10 836 /* Not UTF mode */
2904 nigel 77 {
2905     for (i = min; i < max; i++)
2906     {
2907 ph10 463 if (eptr >= md->end_subject)
2908 ph10 462 {
2909 ph10 463 SCHECK_PARTIAL();
2910 ph10 462 break;
2911 ph10 463 }
2912 nigel 77 c = *eptr;
2913 ph10 836 #ifndef COMPILE_PCRE8
2914     if (c > 255)
2915     {
2916     if (op == OP_CLASS) break;
2917     }
2918     else
2919     #endif
2920     if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2921 nigel 77 eptr++;
2922     }
2923     while (eptr >= pp)
2924     {
2925 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2926 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2927 nigel 77 eptr--;
2928     }
2929     }
2930    
2931 ph10 836 RRETURN(MATCH_NOMATCH);
2932 nigel 77 }
2933 ph10 836 #undef BYTE_MAP
2934 nigel 77 }
2935     /* Control never gets here */
2936    
2937    
2938     /* Match an extended character class. This opcode is encountered only
2939 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2940     mode, because Unicode properties are supported in non-UTF-8 mode. */
2941 nigel 77
2942 ph10 836 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2943 nigel 77 case OP_XCLASS:
2944     {
2945     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2946     ecode += GET(ecode, 1); /* Advance past the item */
2947    
2948     switch (*ecode)
2949     {
2950     case OP_CRSTAR:
2951     case OP_CRMINSTAR:
2952     case OP_CRPLUS:
2953     case OP_CRMINPLUS:
2954     case OP_CRQUERY:
2955     case OP_CRMINQUERY:
2956     c = *ecode++ - OP_CRSTAR;
2957     minimize = (c & 1) != 0;
2958     min = rep_min[c]; /* Pick up values from tables; */
2959     max = rep_max[c]; /* zero for max => infinity */
2960     if (max == 0) max = INT_MAX;
2961     break;
2962    
2963     case OP_CRRANGE:
2964     case OP_CRMINRANGE:
2965     minimize = (*ecode == OP_CRMINRANGE);
2966     min = GET2(ecode, 1);
2967 ph10 836 max = GET2(ecode, 1 + IMM2_SIZE);
2968 nigel 77 if (max == 0) max = INT_MAX;
2969 ph10 836 ecode += 1 + 2 * IMM2_SIZE;
2970 nigel 77 break;
2971    
2972     default: /* No repeat follows */
2973     min = max = 1;
2974     break;
2975     }
2976    
2977     /* First, ensure the minimum number of matches are present. */
2978    
2979     for (i = 1; i <= min; i++)
2980     {
2981 ph10 427 if (eptr >= md->end_subject)
2982 ph10 426 {
2983     SCHECK_PARTIAL();
2984 ph10 836 RRETURN(MATCH_NOMATCH);
2985 ph10 427 }
2986 ph10 384 GETCHARINCTEST(c, eptr);
2987 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2988 nigel 77 }
2989    
2990     /* If max == min we can continue with the main loop without the
2991     need to recurse. */
2992    
2993     if (min == max) continue;
2994    
2995     /* If minimizing, keep testing the rest of the expression and advancing
2996     the pointer while it matches the class. */
2997    
2998     if (minimize)
2999     {
3000     for (fi = min;; fi++)
3001     {
3002 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3003 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3004 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3005 ph10 427 if (eptr >= md->end_subject)
3006 ph10 426 {
3007 ph10 427 SCHECK_PARTIAL();
3008 ph10 836 RRETURN(MATCH_NOMATCH);
3009 ph10 427 }
3010 ph10 384 GETCHARINCTEST(c, eptr);
3011 ph10 836 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3012 nigel 77 }
3013     /* Control never gets here */
3014     }
3015    
3016     /* If maximizing, find the longest possible run, then work backwards. */
3017    
3018     else
3019     {
3020     pp = eptr;
3021     for (i = min; i < max; i++)
3022     {
3023     int len = 1;
3024 ph10 463 if (eptr >= md->end_subject)
3025 ph10 462 {
3026 ph10 463 SCHECK_PARTIAL();
3027 ph10 462 break;
3028 ph10 463 }
3029 ph10 836 #ifdef SUPPORT_UTF
3030 ph10 384 GETCHARLENTEST(c, eptr, len);
3031 ph10 836 #else
3032     c = *eptr;
3033     #endif
3034     if (!PRIV(xclass)(c, data, utf)) break;
3035 nigel 77 eptr += len;
3036     }
3037     for(;;)
3038     {
3039 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3040 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3041     if (eptr-- == pp) break; /* Stop if tried at original pos */
3042 ph10 836 #ifdef SUPPORT_UTF
3043     if (utf) BACKCHAR(eptr);
3044     #endif
3045 nigel 77 }
3046 ph10 836 RRETURN(MATCH_NOMATCH);
3047 nigel 77 }
3048    
3049     /* Control never gets here */
3050     }
3051     #endif /* End of XCLASS */
3052    
3053     /* Match a single character, casefully */
3054    
3055     case OP_CHAR:
3056 ph10 836 #ifdef SUPPORT_UTF
3057     if (utf)
3058 nigel 77 {
3059     length = 1;
3060     ecode++;
3061     GETCHARLEN(fc, ecode, length);
3062 ph10 443 if (length > md->end_subject - eptr)
3063 ph10 428 {
3064     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3065 ph10 836 RRETURN(MATCH_NOMATCH);
3066 ph10 443 }
3067 ph10 836 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3068 nigel 77 }
3069     else
3070     #endif
3071 ph10 836 /* Not UTF mode */
3072 nigel 77 {
3073 ph10 443 if (md->end_subject - eptr < 1)
3074 ph10 428 {
3075     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3076 ph10 836 RRETURN(MATCH_NOMATCH);
3077 ph10 443 }
3078 ph10 836 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3079 nigel 77 ecode += 2;
3080     }
3081     break;
3082    
3083 ph10 836 /* Match a single character, caselessly. If we are at the end of the
3084     subject, give up immediately. */
3085 nigel 77
3086 ph10 602 case OP_CHARI:
3087 ph10 836 if (eptr >= md->end_subject)
3088 nigel 77 {
3089 ph10 836 SCHECK_PARTIAL();
3090     RRETURN(MATCH_NOMATCH);
3091     }
3092    
3093     #ifdef SUPPORT_UTF
3094     if (utf)
3095     {
3096 nigel 77 length = 1;
3097     ecode++;
3098     GETCHARLEN(fc, ecode, length);
3099 ph10 788
3100 nigel 77 /* If the pattern character's value is < 128, we have only one byte, and
3101 ph10 836 we know that its other case must also be one byte long, so we can use the
3102     fast lookup table. We know that there is at least one byte left in the
3103     subject. */
3104 nigel 77
3105     if (fc < 128)
3106     {
3107 ph10 836 if (md->lcc[fc]
3108     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3109     ecode++;
3110     eptr++;
3111 nigel 77 }
3112    
3113 ph10 836 /* Otherwise we must pick up the subject character. Note that we cannot
3114     use the value of "length" to check for sufficient bytes left, because the
3115     other case of the character may have more or fewer bytes. */
3116 nigel 77
3117     else
3118     {
3119 nigel 93 unsigned int dc;
3120 nigel 77 GETCHARINC(dc, eptr);
3121     ecode += length;
3122    
3123     /* If we have Unicode property support, we can use it to test the other
3124 nigel 87 case of the character, if there is one. */
3125 nigel 77
3126     if (fc != dc)
3127     {
3128     #ifdef SUPPORT_UCP
3129 ph10 349 if (dc != UCD_OTHERCASE(fc))
3130 nigel 77 #endif
3131 ph10 836 RRETURN(MATCH_NOMATCH);
3132 nigel 77 }
3133     }
3134     }
3135     else
3136 ph10 836 #endif /* SUPPORT_UTF */
3137 nigel 77
3138 ph10 836 /* Not UTF mode */
3139 nigel 77 {
3140 ph10 836 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3141     != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3142     eptr++;
3143 nigel 77 ecode += 2;
3144     }
3145     break;
3146    
3147 nigel 93 /* Match a single character repeatedly. */
3148 nigel 77
3149     case OP_EXACT:
3150 ph10 602 case OP_EXACTI:
3151 nigel 77 min = max = GET2(ecode, 1);
3152 ph10 836 ecode += 1 + IMM2_SIZE;
3153 nigel 77 goto REPEATCHAR;
3154    
3155 nigel 93 case OP_POSUPTO:
3156 ph10 602 case OP_POSUPTOI:
3157 nigel 93 possessive = TRUE;
3158     /* Fall through */
3159    
3160 nigel 77 case OP_UPTO:
3161 ph10 602 case OP_UPTOI:
3162 nigel 77 case OP_MINUPTO:
3163 ph10 602 case OP_MINUPTOI:
3164 nigel 77 min = 0;
3165     max = GET2(ecode, 1);
3166 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3167 ph10 836 ecode += 1 + IMM2_SIZE;
3168 nigel 77 goto REPEATCHAR;
3169    
3170 nigel 93 case OP_POSSTAR:
3171 ph10 602 case OP_POSSTARI:
3172 nigel 93 possessive = TRUE;
3173     min = 0;
3174     max = INT_MAX;
3175     ecode++;
3176     goto REPEATCHAR;
3177    
3178     case OP_POSPLUS:
3179 ph10 602 case OP_POSPLUSI:
3180 nigel 93 possessive = TRUE;
3181     min = 1;
3182     max = INT_MAX;
3183     ecode++;
3184     goto REPEATCHAR;
3185    
3186     case OP_POSQUERY:
3187 ph10 602 case OP_POSQUERYI:
3188 nigel 93 possessive = TRUE;
3189     min = 0;
3190     max = 1;
3191     ecode++;
3192     goto REPEATCHAR;
3193    
3194 nigel 77 case OP_STAR:
3195 ph10 602 case OP_STARI:
3196 nigel 77 case OP_MINSTAR:
3197 ph10 602 case OP_MINSTARI:
3198 nigel 77 case OP_PLUS:
3199 ph10 602 case OP_PLUSI:
3200 nigel 77 case OP_MINPLUS:
3201 ph10 602 case OP_MINPLUSI:
3202 nigel 77 case OP_QUERY:
3203 ph10 602 case OP_QUERYI:
3204 nigel 77 case OP_MINQUERY:
3205 ph10 602 case OP_MINQUERYI:
3206     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3207 nigel 77 minimize = (c & 1) != 0;
3208     min = rep_min[c]; /* Pick up values from tables; */
3209     max = rep_max[c]; /* zero for max => infinity */
3210     if (max == 0) max = INT_MAX;
3211    
3212 ph10 426 /* Common code for all repeated single-character matches. */
3213 nigel 77
3214     REPEATCHAR:
3215 ph10 836 #ifdef SUPPORT_UTF
3216     if (utf)
3217 nigel 77 {
3218     length = 1;
3219     charptr = ecode;
3220     GETCHARLEN(fc, ecode, length);
3221     ecode += length;
3222    
3223     /* Handle multibyte character matching specially here. There is
3224     support for caseless matching if UCP support is present. */
3225    
3226     if (length > 1)
3227     {
3228     #ifdef SUPPORT_UCP
3229 nigel 93 unsigned int othercase;
3230 ph10 602 if (op >= OP_STARI && /* Caseless */
3231 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3232 ph10 836 oclength = PRIV(ord2utf)(othercase, occhars);
3233 ph10 115 else oclength = 0;
3234 nigel 77 #endif /* SUPPORT_UCP */
3235    
3236     for (i = 1; i <= min; i++)
3237     {
3238 ph10 426 if (eptr <= md->end_subject - length &&
3239 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3240 ph10 123 #ifdef SUPPORT_UCP
3241 ph10 426 else if (oclength > 0 &&
3242     eptr <= md->end_subject - oclength &&
3243 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3244 ph10 426 #endif /* SUPPORT_UCP */
3245 nigel 77 else
3246     {
3247 ph10 426 CHECK_PARTIAL();
3248 ph10 836 RRETURN(MATCH_NOMATCH);
3249 nigel 77 }
3250     }
3251    
3252     if (min == max) continue;
3253    
3254     if (minimize)
3255     {
3256     for (fi = min;; fi++)
3257     {
3258 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3259 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3261 ph10 426 if (eptr <= md->end_subject - length &&
3262 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3263 ph10 123 #ifdef SUPPORT_UCP
3264 ph10 426 else if (oclength > 0 &&
3265     eptr <= md->end_subject - oclength &&
3266 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3267 ph10 426 #endif /* SUPPORT_UCP */
3268 nigel 77 else
3269     {
3270 ph10 426 CHECK_PARTIAL();
3271 ph10 836 RRETURN(MATCH_NOMATCH);
3272 nigel 77 }
3273     }
3274     /* Control never gets here */
3275     }
3276 nigel 93
3277     else /* Maximize */
3278 nigel 77 {
3279     pp = eptr;
3280     for (i = min; i < max; i++)
3281     {
3282 ph10 426 if (eptr <= md->end_subject - length &&
3283 ph10 836 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3284 ph10 123 #ifdef SUPPORT_UCP
3285 ph10 426 else if (oclength > 0 &&
3286     eptr <= md->end_subject - oclength &&
3287 ph10 836 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3288 ph10 426 #endif /* SUPPORT_UCP */
3289 ph10 463 else
3290 ph10 462 {
3291 ph10 463 CHECK_PARTIAL();
3292 ph10 462 break;
3293 ph10 463 }
3294 nigel 77 }
3295 nigel 93
3296     if (possessive) continue;
3297 ph10 427
3298 ph10 120 for(;;)
3299 ph10 426 {
3300 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3301 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3302 ph10 836 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3303 ph10 115 #ifdef SUPPORT_UCP
3304 ph10 426 eptr--;
3305     BACKCHAR(eptr);
3306 ph10 123 #else /* without SUPPORT_UCP */
3307 ph10 426 eptr -= length;
3308 ph10 123 #endif /* SUPPORT_UCP */
3309 ph10 426 }
3310 nigel 77 }
3311     /* Control never gets here */
3312     }
3313    
3314     /* If the length of a UTF-8 character is 1, we fall through here, and
3315     obey the code as for non-UTF-8 characters below, though in this case the
3316     value of fc will always be < 128. */
3317     }
3318     else
3319 ph10 836 #endif /* SUPPORT_UTF */
3320     /* When not in UTF-8 mode, load a single-byte character. */
3321     fc = *ecode++;
3322 nigel 77
3323 ph10 836 /* The value of fc at this point is always one character, though we may
3324     or may not be in UTF mode. The code is duplicated for the caseless and
3325 nigel 77 caseful cases, for speed, since matching characters is likely to be quite
3326     common. First, ensure the minimum number of matches are present. If min =
3327     max, continue at the same level without recursing. Otherwise, if
3328     minimizing, keep trying the rest of the expression and advancing one
3329     matching character if failing, up to the maximum. Alternatively, if
3330     maximizing, find the maximum number of characters and work backwards. */
3331    
3332     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3333     max, eptr));
3334    
3335 ph10 602 if (op >= OP_STARI) /* Caseless */
3336 nigel 77 {
3337 ph10 836 #ifdef COMPILE_PCRE8
3338     /* fc must be < 128 if UTF is enabled. */
3339     foc = md->fcc[fc];
3340     #else
3341     #ifdef SUPPORT_UTF
3342     #ifdef SUPPORT_UCP
3343     if (utf && fc > 127)
3344     foc = UCD_OTHERCASE(fc);
3345     #else
3346     if (utf && fc > 127)
3347     foc = fc;
3348     #endif /* SUPPORT_UCP */
3349     else
3350     #endif /* SUPPORT_UTF */
3351     foc = TABLE_GET(fc, md->fcc, fc);
3352     #endif /* COMPILE_PCRE8 */
3353    
3354 nigel 77 for (i = 1; i <= min; i++)
3355 ph10 426 {
3356     if (eptr >= md->end_subject)
3357     {
3358     SCHECK_PARTIAL();
3359 ph10 836 RRETURN(MATCH_NOMATCH);
3360 ph10 426 }
3361 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3362     eptr++;
3363 ph10 426 }
3364 nigel 77 if (min == max) continue;
3365     if (minimize)
3366     {
3367     for (fi = min;; fi++)
3368     {
3369 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3370 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3372 ph10 426 if (eptr >= md->end_subject)
3373     {
3374 ph10 427 SCHECK_PARTIAL();
3375 ph10 836 RRETURN(MATCH_NOMATCH);
3376 ph10 426 }
3377 ph10 836 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3378     eptr++;
3379 nigel 77 }
3380     /* Control never gets here */
3381     }
3382 nigel 93 else /* Maximize */
3383 nigel 77 {
3384     pp = eptr;
3385     for (i = min; i < max; i++)
3386     {
3387 ph10 463 if (eptr >= md->end_subject)
3388 ph10 462 {
3389     SCHECK_PARTIAL();
3390     break;
3391 ph10 463 }
3392 ph10 836 if (fc != *eptr && foc != *eptr) break;
3393 nigel 77 eptr++;
3394     }
3395 ph10 427
3396 nigel 93 if (possessive) continue;
3397 ph10 427
3398 nigel 77 while (eptr >= pp)
3399     {
3400 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3401 nigel 77 eptr--;
3402     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403     }
3404 ph10 836 RRETURN(MATCH_NOMATCH);
3405 nigel 77 }
3406     /* Control never gets here */
3407     }
3408    
3409     /* Caseful comparisons (includes all multi-byte characters) */
3410    
3411     else
3412     {
3413 ph10 427 for (i = 1; i <= min; i++)
3414 ph10 426 {
3415     if (eptr >= md->end_subject)
3416     {
3417     SCHECK_PARTIAL();
3418 ph10 836 RRETURN(MATCH_NOMATCH);
3419 ph10 426 }
3420 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3421 ph10 427 }
3422 ph10 443
3423 nigel 77 if (min == max) continue;
3424 ph10 443
3425 nigel 77 if (minimize)
3426     {
3427     for (fi = min;; fi++)
3428     {
3429 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3430 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3432 ph10 426 if (eptr >= md->end_subject)
3433 ph10 427 {
3434 ph10 426 SCHECK_PARTIAL();
3435 ph10 836 RRETURN(MATCH_NOMATCH);
3436 ph10 427 }
3437 ph10 836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3438 nigel 77 }
3439     /* Control never gets here */
3440     }
3441 nigel 93 else /* Maximize */
3442 nigel 77 {
3443     pp = eptr;
3444     for (i = min; i < max; i++)
3445     {
3446 ph10 463 if (eptr >= md->end_subject)
3447 ph10 462 {
3448 ph10 463 SCHECK_PARTIAL();
3449 ph10 462 break;
3450 ph10 463 }
3451 ph10 462 if (fc != *eptr) break;
3452 nigel 77 eptr++;
3453     }
3454 nigel 93 if (possessive) continue;
3455 ph10 443
3456 nigel 77 while (eptr >= pp)
3457     {
3458 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3459 nigel 77 eptr--;
3460     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3461     }
3462 ph10 836 RRETURN(MATCH_NOMATCH);
3463 nigel 77 }
3464     }
3465     /* Control never gets here */
3466    
3467     /* Match a negated single character. The character in the pattern occupies
3468     one code unit, but the subject character we are checking can be multibyte. */
3469    
3470     case OP_NOT:
3471 ph10 625 case OP_NOTI:
3472 ph10 443 if (eptr >= md->end_subject)
3473 ph10 428 {
3474 ph10 443 SCHECK_PARTIAL();
3475 ph10 836 RRETURN(MATCH_NOMATCH);
3476 ph10 443 }
3477 nigel 77 ecode++;
3478     GETCHARINCTEST(c, eptr);
3479 ph10 602 if (op == OP_NOTI) /* The caseless case */
3480 nigel 77 {
3481 ph10 836 register int ch, och;
3482     ch = *ecode++;
3483     #ifdef COMPILE_PCRE8
3484     /* ch must be < 128 if UTF is enabled. */
3485     och = md->fcc[ch];
3486     #else
3487     #ifdef SUPPORT_UTF
3488     #ifdef SUPPORT_UCP
3489     if (utf && ch > 127)
3490     och = UCD_OTHERCASE(ch);
3491     #else
3492     if (utf && ch > 127)
3493     och = ch;
3494     #endif /* SUPPORT_UCP */
3495     else
3496     #endif /* SUPPORT_UTF */
3497     och = TABLE_GET(ch, md->fcc, ch);
3498     #endif /* COMPILE_PCRE8 */
3499     if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3500 nigel 77 }
3501 ph10 602 else /* Caseful */
3502 nigel 77 {
3503 ph10 836 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3504 nigel 77 }
3505     break;
3506    
3507     /* Match a negated single one-byte character repeatedly. This is almost a
3508     repeat of the code for a repeated single character, but I haven't found a
3509     nice way of commoning these up that doesn't require a test of the
3510     positive/negative option for each character match. Maybe that wouldn't add
3511     very much to the time taken, but character matching *is* what this is all
3512     about... */
3513    
3514     case OP_NOTEXACT:
3515 ph10 602 case OP_NOTEXACTI:
3516 nigel 77 min = max = GET2(ecode, 1);
3517 ph10 836 ecode += 1 + IMM2_SIZE;
3518 nigel 77 goto REPEATNOTCHAR;
3519    
3520     case OP_NOTUPTO:
3521 ph10 602 case OP_NOTUPTOI:
3522 nigel 77 case OP_NOTMINUPTO:
3523 ph10 602 case OP_NOTMINUPTOI:
3524 nigel 77 min = 0;
3525     max = GET2(ecode, 1);
3526 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3527 ph10 836 ecode += 1 + IMM2_SIZE;
3528 nigel 77 goto REPEATNOTCHAR;
3529    
3530 nigel 93 case OP_NOTPOSSTAR:
3531 ph10 602 case OP_NOTPOSSTARI:
3532 nigel 93 possessive = TRUE;
3533     min = 0;
3534     max = INT_MAX;
3535     ecode++;
3536     goto REPEATNOTCHAR;
3537    
3538     case OP_NOTPOSPLUS:
3539 ph10 602 case OP_NOTPOSPLUSI:
3540 nigel 93 possessive = TRUE;
3541     min = 1;
3542     max = INT_MAX;
3543     ecode++;
3544     goto REPEATNOTCHAR;
3545    
3546     case OP_NOTPOSQUERY:
3547 ph10 602 case OP_NOTPOSQUERYI:
3548 nigel 93 possessive = TRUE;
3549     min = 0;
3550     max = 1;
3551     ecode++;
3552     goto REPEATNOTCHAR;
3553    
3554     case OP_NOTPOSUPTO:
3555 ph10 602 case OP_NOTPOSUPTOI:
3556 nigel 93 possessive = TRUE;
3557     min = 0;
3558     max = GET2(ecode, 1);
3559 ph10 836 ecode += 1 + IMM2_SIZE;
3560 nigel 93 goto REPEATNOTCHAR;
3561    
3562 nigel 77 case OP_NOTSTAR:
3563 ph10 602 case OP_NOTSTARI:
3564 nigel 77 case OP_NOTMINSTAR:
3565 ph10 602 case OP_NOTMINSTARI:
3566 nigel 77 case OP_NOTPLUS:
3567 ph10 602 case OP_NOTPLUSI:
3568 nigel 77 case OP_NOTMINPLUS:
3569 ph10 602 case OP_NOTMINPLUSI:
3570 nigel 77 case OP_NOTQUERY:
3571 ph10 602 case OP_NOTQUERYI:
3572 nigel 77 case OP_NOTMINQUERY:
3573 ph10 602 case OP_NOTMINQUERYI:
3574     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3575 nigel 77 minimize = (c & 1) != 0;
3576     min = rep_min[c]; /* Pick up values from tables; */
3577     max = rep_max[c]; /* zero for max => infinity */
3578     if (max == 0) max = INT_MAX;
3579    
3580 ph10 426 /* Common code for all repeated negated single-character matches. */
3581 nigel 77
3582     REPEATNOTCHAR:
3583     fc = *ecode++;
3584    
3585     /* The code is duplicated for the caseless and caseful cases, for speed,
3586     since matching characters is likely to be quite common. First, ensure the
3587     minimum number of matches are present. If min = max, continue at the same
3588     level without recursing. Otherwise, if minimizing, keep trying the rest of
3589     the expression and advancing one matching character if failing, up to the
3590     maximum. Alternatively, if maximizing, find the maximum number of
3591     characters and work backwards. */
3592    
3593     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3594     max, eptr));
3595    
3596 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3597 nigel 77 {
3598 ph10 836 #ifdef COMPILE_PCRE8
3599     /* fc must be < 128 if UTF is enabled. */
3600     foc = md->fcc[fc];
3601     #else
3602     #ifdef SUPPORT_UTF
3603     #ifdef SUPPORT_UCP
3604     if (utf && fc > 127)
3605     foc = UCD_OTHERCASE(fc);
3606     #else
3607     if (utf && fc > 127)
3608     foc = fc;
3609     #endif /* SUPPORT_UCP */
3610     else
3611     #endif /* SUPPORT_UTF */
3612     foc = TABLE_GET(fc, md->fcc, fc);
3613     #endif /* COMPILE_PCRE8 */
3614 nigel 77
3615 ph10 836 #ifdef SUPPORT_UTF
3616     if (utf)
3617 nigel 77 {
3618 nigel 93 register unsigned int d;
3619 nigel 77 for (i = 1; i <= min; i++)
3620     {
3621 ph10 426 if (eptr >= md->end_subject)
3622     {
3623     SCHECK_PARTIAL();
3624 ph10 836 RRETURN(MATCH_NOMATCH);
3625 ph10 427 }
3626 nigel 77 GETCHARINC(d, eptr);
3627 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3628 nigel 77 }
3629     }
3630     else
3631     #endif
3632 ph10 836 /* Not UTF mode */
3633 nigel 77 {
3634     for (i = 1; i <= min; i++)
3635 ph10 426 {
3636     if (eptr >= md->end_subject)
3637     {
3638     SCHECK_PARTIAL();
3639 ph10 836 RRETURN(MATCH_NOMATCH);
3640 ph10 427 }
3641 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3642     eptr++;
3643 ph10 427 }
3644 nigel 77 }
3645    
3646     if (min == max) continue;
3647    
3648     if (minimize)
3649     {
3650 ph10 836 #ifdef SUPPORT_UTF
3651     if (utf)
3652 nigel 77 {
3653 nigel 93 register unsigned int d;
3654 nigel 77 for (fi = min;; fi++)
3655     {
3656 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3657 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3659 ph10 427 if (eptr >= md->end_subject)
3660 ph10 426 {
3661 ph10 427 SCHECK_PARTIAL();
3662 ph10 836 RRETURN(MATCH_NOMATCH);
3663 ph10 427 }
3664 nigel 77 GETCHARINC(d, eptr);
3665 ph10 836 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3666 nigel 77 }
3667     }
3668     else
3669     #endif
3670 ph10 836 /* Not UTF mode */
3671 nigel 77 {
3672     for (fi = min;; fi++)
3673     {
3674 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3675 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3676 ph10 836 if (fi >= max) RRETURN(MATCH_NOMATCH);
3677 ph10 426 if (eptr >= md->end_subject)
3678     {
3679     SCHECK_PARTIAL();
3680 ph10 836 RRETURN(MATCH_NOMATCH);
3681 ph10 426 }
3682 ph10 836 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3683     eptr++;
3684 nigel 77 }
3685     }
3686     /* Control never gets here */
3687     }
3688    
3689     /* Maximize case */
3690    
3691     else
3692     {
3693     pp = eptr;
3694    
3695 ph10 836 #ifdef SUPPORT_UTF
3696     if (utf)
3697 nigel 77 {
3698 nigel 93 register unsigned int d;
3699 nigel 77 for (i = min; i <