/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 742 - (hide annotations) (download)
Sun Nov 6 08:05:33 2011 UTC (2 years, 9 months ago) by zherczeg
File MIME type: text/plain
File size: 201315 byte(s)
Fix cache-flush issue on PowerPC, adding some comments and a check for disabled PCRE_EXTRA_TABLES.


1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when what follows it fails to match */
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995) /* Returned by OP_PRUNE and OP_PRUNE_ARG when what follows fails */
81     #define MATCH_SKIP (-994) /* Returned by OP_SKIP; md->start_match_ptr holds the subject position */
82     #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; md->start_match_ptr holds the skip name */
83     #define MATCH_THEN (-992) /* Returned by OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the opcode address */
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
87     #define MRRETURN(ra) /* RRETURN(ra), but first copy the current mark pointer into md->mark */ \
88     { \
89     md->mark = markptr; /* markptr is the argument/frame variable of match() */ \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if the requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 723 RM61, RM62, RM63, RM64, RM65, RM66 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) /* Debug build: real recursive call, traced on stdout; rw is not used */ \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); /* result is left in rrc; depth grows by one */ \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) /* Debug build: trace the value, then return it; ra is expanded twice */ \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) /* Production build: plain recursive call; rw is not used */ \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra /* Production build: an ordinary return */
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) /* NO_RECURSE build: simulate a call to match() with a new heap frame */\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; /* remember in the calling frame which RMn number this call site has */ \
321     newframe->Xeptr = ra; /* the "arguments" of the simulated call are stored in the new frame */\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame; /* link back to the calling frame */\
329     frame = newframe; /* the new frame becomes the current one */\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE; /* re-enter the body of match() using the new frame */\
332     L_##rw: /* resume point for this call site; presumably reached from HEAP_RETURN via Xwhere - confirm there */\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
/* Return from a simulated call to match() when NO_RECURSE is defined: free the
current heap frame and make the calling frame current. If there is a calling
frame, the result is passed back in rrc and control resumes at HEAP_RETURN;
otherwise this was the top-level call and match() really returns.

The frame pointer is tested before it is dereferenced because this macro is
also used to give up when the allocation of the top-level frame itself has
failed. In that case frame is NULL, so there is nothing to pop or free and the
error code is simply returned. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
352     typedef struct heapframe { /* One frame per simulated call of match() when NO_RECURSE is defined */
353     struct heapframe *Xprevframe; /* Frame of the caller; NULL marks the top level */
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr; /* Reached inside match() through the macro "eptr"; likewise for the fields below */
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value plus one */
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word; /* Doubles as "allow_zero" inside match() */
380     BOOL Xcondition; /* Doubles as "cbegroup" and "condassert" inside match() */
381     BOOL Xprev_is_word; /* Doubles as "matched_once" inside match() */
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink; /* Doubles as "code_offset" inside match() */
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc; /* "fc" in this build; with true recursion fc is just another name for c */
394 nigel 77 int Xfi; /* "fi" in this build; with true recursion fi is just another name for i */
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX]; /* Room for up to REC_STACK_SAVE_MAX saved offset-vector ints */
404    
405     eptrblock Xnewptrb; /* Block that match() chains onto eptrb for a MATCH_CBEGROUP call */
406    
407 ph10 164 /* Where to jump back to: RMATCH stores the RMn number of its call site here */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
436     #define CHECK_PARTIAL() /* Partial-match test that also checks eptr has reached the end of the subject */\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; /* record that the end of the subject was reached */ \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* hard partial matching: return immediately */ \
442 ph10 427 }
443 ph10 426
444     #define SCHECK_PARTIAL() /* As CHECK_PARTIAL, for use when eptr is already known to be at the end */\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; /* record that the end of the subject was reached */ \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); /* hard partial matching: return immediately */ \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779     the branch in which it occurs can be determined. Overload the start of
780     match pointer to do this. */
781 ph10 512
782 ph10 210 case OP_THEN:
783     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 ph10 604 eptrb, RM54);
785 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 ph10 716 md->start_match_ptr = ecode;
787 ph10 510 MRRETURN(MATCH_THEN);
788    
789     case OP_THEN_ARG:
790 ph10 733 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791 ph10 716 md, eptrb, RM58);
792 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 ph10 733 md->start_match_ptr = ecode;
794 ph10 716 md->mark = ecode + 2;
795 ph10 212 RRETURN(MATCH_THEN);
796 ph10 733
797 ph10 723 /* Handle an atomic group that does not contain any capturing parentheses.
798 ph10 733 This can be handled like an assertion. Prior to 8.13, all atomic groups
799     were handled this way. In 8.13, the code was changed as below for ONCE, so
800     that backups pass through the group and thereby reset captured values.
801     However, this uses a lot more stack, so in 8.20, atomic groups that do not
802     contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 ph10 723 less stack intensive way.
804 ph10 211
805 ph10 723 Check the alternative branches in turn - the matching won't pass the KET
806     for this kind of subpattern. If any one branch matches, we carry on as at
807     the end of a normal bracket, leaving the subject pointer, but resetting
808     the start-of-match value in case it was changed by \K. */
809    
810     case OP_ONCE_NC:
811     prev = ecode;
812     saved_eptr = eptr;
813     do
814     {
815     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816     if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817     {
818     mstart = md->start_match_ptr;
819     break;
820     }
821     if (rrc == MATCH_THEN)
822     {
823     next = ecode + GET(ecode,1);
824 ph10 733 if (md->start_match_ptr < next &&
825 ph10 723 (*ecode == OP_ALT || *next == OP_ALT))
826     rrc = MATCH_NOMATCH;
827 ph10 733 }
828    
829 ph10 723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830     ecode += GET(ecode,1);
831     }
832     while (*ecode == OP_ALT);
833    
834     /* If we hit the end of the group (which could be repeated), fail */
835    
836     if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
837    
838     /* Continue as from after the group, updating the offsets high water
839     mark, since extracts may have been taken. */
840    
841     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
842    
843     offset_top = md->end_offset_top;
844     eptr = md->end_match_ptr;
845    
846     /* For a non-repeating ket, just continue at this level. This also
847     happens for a repeating ket if no characters were matched in the group.
848     This is the forcible breaking of infinite loops as implemented in Perl
849     5.005. */
850    
851     if (*ecode == OP_KET || eptr == saved_eptr)
852     {
853     ecode += 1+LINK_SIZE;
854     break;
855     }
856    
857     /* The repeating kets try the rest of the pattern or restart from the
858     preceding bracket, in the appropriate order. The second "call" of match()
859     uses tail recursion, to avoid using another stack frame. */
860    
861     if (*ecode == OP_KETRMIN)
862     {
863     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
864     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865     ecode = prev;
866     goto TAIL_RECURSE;
867     }
868     else /* OP_KETRMAX */
869     {
870 ph10 733 md->match_function_type = MATCH_CBEGROUP;
871 ph10 723 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
872     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873     ecode += 1 + LINK_SIZE;
874     goto TAIL_RECURSE;
875     }
876     /* Control never gets here */
877    
878 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
879     unlimited repeat. If there is space in the offset vector, save the current
880     subject position in the working slot at the top of the vector. We mustn't
881     change the current values of the data slot, because they may be set from a
882     previous iteration of this group, and be referred to by a reference inside
883 ph10 625 the group. A failure to match might occur after the group has succeeded,
884 ph10 617 if something later on doesn't match. For this reason, we need to restore
885     the working value and also the values of the final offsets, in case they
886     were set by a previous iteration of the same bracket.
887 nigel 77
888 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
889     a non-capturing bracket. Don't worry about setting the flag for the error
890     case here; that is handled in the code for KET. */
891 nigel 77
892 nigel 93 case OP_CBRA:
893     case OP_SCBRA:
894     number = GET2(ecode, 1+LINK_SIZE);
895 nigel 77 offset = number << 1;
896 ph10 625
897 ph10 475 #ifdef PCRE_DEBUG
898 nigel 93 printf("start bracket %d\n", number);
899     printf("subject=");
900 nigel 77 pchars(eptr, 16, TRUE, md);
901     printf("\n");
902     #endif
903    
904     if (offset < md->offset_max)
905     {
906     save_offset1 = md->offset_vector[offset];
907     save_offset2 = md->offset_vector[offset+1];
908     save_offset3 = md->offset_vector[md->offset_end - number];
909     save_capture_last = md->capture_last;
910    
911     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
912 ph10 531 md->offset_vector[md->offset_end - number] =
913 ph10 530 (int)(eptr - md->start_subject);
914 nigel 77
915 ph10 604 for (;;)
916 nigel 77 {
917 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
918     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
919 ph10 604 eptrb, RM1);
920 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
921 ph10 733
922     /* If we backed up to a THEN, check whether it is within the current
923     branch by comparing the address of the THEN that is passed back with
924 ph10 716 the end of the branch. If it is within the current branch, and the
925     branch is one of two or more alternatives (it either starts or ends
926 ph10 733 with OP_ALT), we have reached the limit of THEN's action, so convert
927     the return code to NOMATCH, which will cause normal backtracking to
928 ph10 716 happen from now on. Otherwise, THEN is passed back to an outer
929 ph10 733 alternative. This implements Perl's treatment of parenthesized groups,
930     where a group not containing | does not affect the current alternative,
931 ph10 716 that is, (X) is NOT the same as (X|(*F)). */
932    
933     if (rrc == MATCH_THEN)
934     {
935     next = ecode + GET(ecode,1);
936 ph10 733 if (md->start_match_ptr < next &&
937 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
938     rrc = MATCH_NOMATCH;
939 ph10 733 }
940    
941 ph10 716 /* Anything other than NOMATCH is passed back. */
942    
943     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
944 nigel 77 md->capture_last = save_capture_last;
945     ecode += GET(ecode, 1);
946 ph10 625 if (*ecode != OP_ALT) break;
947 nigel 77 }
948    
949     DPRINTF(("bracket %d failed\n", number));
950     md->offset_vector[offset] = save_offset1;
951     md->offset_vector[offset+1] = save_offset2;
952     md->offset_vector[md->offset_end - number] = save_offset3;
953 ph10 625
954 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
955 nigel 77
956 ph10 716 if (md->mark == NULL) md->mark = markptr;
957     RRETURN(rrc);
958 nigel 77 }
959    
960 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
961     as a non-capturing bracket. */
962 nigel 77
963 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
964     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965    
966 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
967 nigel 77
968 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
969     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970    
971 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
972 ph10 723 repeat and ONCE group with no captures. Loop for all the alternatives.
973 ph10 708
974 ph10 702 When we get to the final alternative within the brackets, we used to return
975     the result of a recursive call to match() whatever happened so it was
976     possible to reduce stack usage by turning this into a tail recursion,
977     except in the case of a possibly empty group. However, now that there is
978     the possibility of (*THEN) occurring in the final alternative, this
979     optimization is no longer always possible.
980 ph10 625
981 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
982     this is the best that can be done.
983    
984 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
985     reached, but subsequent matching fails. It passes back up the tree (causing
986     captured values to be reset) until the original atomic group level is
987 ph10 618 reached. This is tested by comparing md->once_target with the start of the
988     group. At this point, the return is converted into MATCH_NOMATCH so that
989     previous backup points can be taken. */
990 nigel 77
991 ph10 618 case OP_ONCE:
992 nigel 93 case OP_BRA:
993     case OP_SBRA:
994     DPRINTF(("start non-capturing bracket\n"));
995 ph10 618
996 nigel 91 for (;;)
997 nigel 77 {
998 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
999 ph10 702
1000     /* If this is not a possibly empty group, and there are no (*THEN)s in
1001 ph10 708 the pattern, and this is the final alternative, optimize as described
1002 ph10 702 above. */
1003    
1004     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1005     {
1006     ecode += _pcre_OP_lengths[*ecode];
1007     goto TAIL_RECURSE;
1008 ph10 708 }
1009 ph10 702
1010     /* In all other cases, we have to make another call to match(). */
1011    
1012 ph10 708 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1013 ph10 604 RM2);
1014 ph10 733
1015 ph10 716 /* See comment in the code for capturing groups above about handling
1016     THEN. */
1017    
1018     if (rrc == MATCH_THEN)
1019 ph10 625 {
1020 ph10 716 next = ecode + GET(ecode,1);
1021 ph10 733 if (md->start_match_ptr < next &&
1022 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1023     rrc = MATCH_NOMATCH;
1024 ph10 733 }
1025    
1026     if (rrc != MATCH_NOMATCH)
1027 ph10 716 {
1028 ph10 618 if (rrc == MATCH_ONCE)
1029     {
1030     const uschar *scode = ecode;
1031     if (*scode != OP_ONCE) /* If not at start, find it */
1032     {
1033     while (*scode == OP_ALT) scode += GET(scode, 1);
1034     scode -= GET(scode, 1);
1035 ph10 625 }
1036 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1037 ph10 625 }
1038 ph10 550 RRETURN(rrc);
1039 ph10 625 }
1040 nigel 77 ecode += GET(ecode, 1);
1041 ph10 625 if (*ecode != OP_ALT) break;
1042 nigel 77 }
1043 ph10 733
1044 ph10 716 if (md->mark == NULL) md->mark = markptr;
1045 ph10 609 RRETURN(MATCH_NOMATCH);
1046    
1047 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
1048 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1049     handled similarly to the normal case above. However, the matching is
1050     different. The end of these brackets will always be OP_KETRPOS, which
1051     returns MATCH_KETRPOS without going further in the pattern. By this means
1052     we can handle the group by iteration rather than recursion, thereby
1053     reducing the amount of stack needed. */
1054 ph10 625
1055 ph10 604 case OP_CBRAPOS:
1056     case OP_SCBRAPOS:
1057     allow_zero = FALSE;
1058 ph10 625
1059 ph10 604 POSSESSIVE_CAPTURE:
1060     number = GET2(ecode, 1+LINK_SIZE);
1061     offset = number << 1;
1062    
1063     #ifdef PCRE_DEBUG
1064     printf("start possessive bracket %d\n", number);
1065     printf("subject=");
1066     pchars(eptr, 16, TRUE, md);
1067     printf("\n");
1068     #endif
1069    
1070     if (offset < md->offset_max)
1071     {
1072     matched_once = FALSE;
1073 ph10 625 code_offset = ecode - md->start_code;
1074 ph10 604
1075     save_offset1 = md->offset_vector[offset];
1076     save_offset2 = md->offset_vector[offset+1];
1077     save_offset3 = md->offset_vector[md->offset_end - number];
1078     save_capture_last = md->capture_last;
1079    
1080     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1081 ph10 625
1082     /* Each time round the loop, save the current subject position for use
1083     when the group matches. For MATCH_MATCH, the group has matched, so we
1084     restart it with a new subject starting position, remembering that we had
1085     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1086     usual. If we haven't matched any alternatives in any iteration, check to
1087     see if a previous iteration matched. If so, the group has matched;
1088     continue from afterwards. Otherwise it has failed; restore the previous
1089 ph10 604 capture values before returning NOMATCH. */
1090 ph10 625
1091 ph10 604 for (;;)
1092     {
1093     md->offset_vector[md->offset_end - number] =
1094     (int)(eptr - md->start_subject);
1095 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1096 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1097     eptrb, RM63);
1098     if (rrc == MATCH_KETRPOS)
1099     {
1100     offset_top = md->end_offset_top;
1101     eptr = md->end_match_ptr;
1102 ph10 625 ecode = md->start_code + code_offset;
1103 ph10 604 save_capture_last = md->capture_last;
1104 ph10 625 matched_once = TRUE;
1105     continue;
1106     }
1107 ph10 733
1108 ph10 716 /* See comment in the code for capturing groups above about handling
1109     THEN. */
1110    
1111     if (rrc == MATCH_THEN)
1112     {
1113     next = ecode + GET(ecode,1);
1114 ph10 733 if (md->start_match_ptr < next &&
1115 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1116     rrc = MATCH_NOMATCH;
1117 ph10 733 }
1118 ph10 716
1119     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 ph10 604 md->capture_last = save_capture_last;
1121     ecode += GET(ecode, 1);
1122 ph10 625 if (*ecode != OP_ALT) break;
1123 ph10 604 }
1124 ph10 610
1125 ph10 604 if (!matched_once)
1126 ph10 625 {
1127 ph10 604 md->offset_vector[offset] = save_offset1;
1128     md->offset_vector[offset+1] = save_offset2;
1129     md->offset_vector[md->offset_end - number] = save_offset3;
1130     }
1131 ph10 625
1132 ph10 716 if (md->mark == NULL) md->mark = markptr;
1133 ph10 604 if (allow_zero || matched_once)
1134 ph10 625 {
1135 ph10 604 ecode += 1 + LINK_SIZE;
1136     break;
1137 ph10 625 }
1138    
1139 ph10 604 RRETURN(MATCH_NOMATCH);
1140     }
1141 ph10 625
1142 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1143     as a non-capturing bracket. */
1144    
1145     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1146     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147    
1148     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1149    
1150     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152    
1153 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1154 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1155     without the capturing complication. It is written out separately for speed
1156     and cleanliness. */
1157    
1158     case OP_BRAPOS:
1159     case OP_SBRAPOS:
1160 ph10 625 allow_zero = FALSE;
1161    
1162 ph10 604 POSSESSIVE_NON_CAPTURE:
1163     matched_once = FALSE;
1164 ph10 625 code_offset = ecode - md->start_code;
1165 ph10 604
1166     for (;;)
1167     {
1168 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1170 ph10 609 eptrb, RM48);
1171 ph10 604 if (rrc == MATCH_KETRPOS)
1172     {
1173 ph10 610 offset_top = md->end_offset_top;
1174 ph10 604 eptr = md->end_match_ptr;
1175 ph10 625 ecode = md->start_code + code_offset;
1176     matched_once = TRUE;
1177     continue;
1178     }
1179 ph10 733
1180 ph10 716 /* See comment in the code for capturing groups above about handling
1181     THEN. */
1182    
1183     if (rrc == MATCH_THEN)
1184     {
1185     next = ecode + GET(ecode,1);
1186 ph10 733 if (md->start_match_ptr < next &&
1187 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1188     rrc = MATCH_NOMATCH;
1189 ph10 733 }
1190 ph10 716
1191     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ph10 604 ecode += GET(ecode, 1);
1193 ph10 625 if (*ecode != OP_ALT) break;
1194 ph10 604 }
1195 ph10 625
1196     if (matched_once || allow_zero)
1197 ph10 604 {
1198     ecode += 1 + LINK_SIZE;
1199     break;
1200 ph10 625 }
1201 ph10 604 RRETURN(MATCH_NOMATCH);
1202    
1203     /* Control never reaches here. */
1204    
1205 nigel 77 /* Conditional group: compilation checked that there are no more than
1206     two branches. If the condition is false, skipping the first branch takes us
1207     past the end if there is only one branch, but that's OK because that is
1208 ph10 609 exactly what going to the ket would do. */
1209 nigel 77
1210     case OP_COND:
1211 nigel 93 case OP_SCOND:
1212 ph10 604 codelink = GET(ecode, 1);
1213 ph10 406
1214 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1215     inserted between OP_COND and an assertion condition. */
1216 ph10 392
1217 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1218     {
1219     if (pcre_callout != NULL)
1220     {
1221     pcre_callout_block cb;
1222 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1223 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1224     cb.offset_vector = md->offset_vector;
1225     cb.subject = (PCRE_SPTR)md->start_subject;
1226 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1227     cb.start_match = (int)(mstart - md->start_subject);
1228     cb.current_position = (int)(eptr - md->start_subject);
1229 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1230     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1231     cb.capture_top = offset_top/2;
1232     cb.capture_last = md->capture_last;
1233     cb.callout_data = md->callout_data;
1234 ph10 654 cb.mark = markptr;
1235 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1236 ph10 381 if (rrc < 0) RRETURN(rrc);
1237     }
1238     ecode += _pcre_OP_lengths[OP_CALLOUT];
1239     }
1240 ph10 392
1241 ph10 399 condcode = ecode[LINK_SIZE+1];
1242 ph10 406
1243 ph10 381 /* Now see what the actual condition is */
1244 ph10 392
1245 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1246 nigel 77 {
1247 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1248     {
1249 ph10 461 condition = FALSE;
1250     ecode += GET(ecode, 1);
1251     }
1252 ph10 459 else
1253 ph10 461 {
1254 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1255     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1256 ph10 461
1257 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1258     false, but the test was set up by name, scan the table to see if the
1259     name refers to any other numbers, and test them. The condition is true
1260     if any one is set. */
1261 ph10 461
1262 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1263     {
1264     uschar *slotA = md->name_table;
1265     for (i = 0; i < md->name_count; i++)
1266 ph10 461 {
1267     if (GET2(slotA, 0) == recno) break;
1268 ph10 459 slotA += md->name_entry_size;
1269     }
1270 ph10 461
1271 ph10 459 /* Found a name for the number - there can be only one; duplicate
1272     names for different numbers are allowed, but not vice versa. First
1273     scan down for duplicates. */
1274 ph10 461
1275 ph10 459 if (i < md->name_count)
1276 ph10 461 {
1277 ph10 459 uschar *slotB = slotA;
1278     while (slotB > md->name_table)
1279     {
1280     slotB -= md->name_entry_size;
1281     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1282     {
1283     condition = GET2(slotB, 0) == md->recursive->group_num;
1284 ph10 461 if (condition) break;
1285     }
1286 ph10 459 else break;
1287 ph10 461 }
1288    
1289 ph10 459 /* Scan up for duplicates */
1290 ph10 461
1291 ph10 459 if (!condition)
1292 ph10 461 {
1293 ph10 459 slotB = slotA;
1294     for (i++; i < md->name_count; i++)
1295     {
1296     slotB += md->name_entry_size;
1297     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1298     {
1299     condition = GET2(slotB, 0) == md->recursive->group_num;
1300     if (condition) break;
1301 ph10 461 }
1302 ph10 459 else break;
1303 ph10 461 }
1304     }
1305 ph10 459 }
1306 ph10 461 }
1307    
1308 ph10 459 /* Choose branch according to the condition */
1309 ph10 461
1310 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1311     }
1312 ph10 461 }
1313 nigel 93
1314 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1315 nigel 93 {
1316 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1317 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1318 ph10 461
1319 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1320 ph10 461 scan the table to see if the name refers to any other numbers, and test
1321     them. The condition is true if any one is set. This is tediously similar
1322     to the code above, but not close enough to try to amalgamate. */
1323    
1324 ph10 459 if (!condition && condcode == OP_NCREF)
1325     {
1326 ph10 461 int refno = offset >> 1;
1327 ph10 459 uschar *slotA = md->name_table;
1328 ph10 461
1329 ph10 459 for (i = 0; i < md->name_count; i++)
1330 ph10 461 {
1331     if (GET2(slotA, 0) == refno) break;
1332 ph10 459 slotA += md->name_entry_size;
1333     }
1334 ph10 461
1335     /* Found a name for the number - there can be only one; duplicate names
1336     for different numbers are allowed, but not vice versa. First scan down
1337 ph10 459 for duplicates. */
1338 ph10 461
1339 ph10 459 if (i < md->name_count)
1340 ph10 461 {
1341 ph10 459 uschar *slotB = slotA;
1342     while (slotB > md->name_table)
1343     {
1344     slotB -= md->name_entry_size;
1345     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1346     {
1347     offset = GET2(slotB, 0) << 1;
1348 ph10 461 condition = offset < offset_top &&
1349 ph10 459 md->offset_vector[offset] >= 0;
1350 ph10 461 if (condition) break;
1351     }
1352 ph10 459 else break;
1353 ph10 461 }
1354    
1355 ph10 459 /* Scan up for duplicates */
1356 ph10 461
1357 ph10 459 if (!condition)
1358 ph10 461 {
1359 ph10 459 slotB = slotA;
1360     for (i++; i < md->name_count; i++)
1361     {
1362     slotB += md->name_entry_size;
1363     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1364     {
1365     offset = GET2(slotB, 0) << 1;
1366 ph10 461 condition = offset < offset_top &&
1367 ph10 459 md->offset_vector[offset] >= 0;
1368 ph10 461 if (condition) break;
1369     }
1370 ph10 459 else break;
1371 ph10 461 }
1372     }
1373 ph10 459 }
1374 ph10 461 }
1375    
1376 ph10 459 /* Choose branch according to the condition */
1377    
1378 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1379 nigel 77 }
1380    
1381 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1382 nigel 93 {
1383     condition = FALSE;
1384     ecode += GET(ecode, 1);
1385     }
1386    
1387 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1388 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1389     an assertion. */
1390 nigel 77
1391     else
1392     {
1393 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1394 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1395 nigel 77 if (rrc == MATCH_MATCH)
1396     {
1397 ph10 619 if (md->end_offset_top > offset_top)
1398     offset_top = md->end_offset_top; /* Captures may have happened */
1399 nigel 93 condition = TRUE;
1400     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1401 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1402     }
1403 ph10 733
1404 ph10 716 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 ph10 733 assertion; it is therefore treated as NOMATCH. */
1406 ph10 716
1407 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1408 nigel 77 {
1409     RRETURN(rrc); /* Need braces because of following else */
1410     }
1411 nigel 93 else
1412     {
1413     condition = FALSE;
1414 ph10 399 ecode += codelink;
1415 nigel 93 }
1416     }
1417 nigel 91
1418 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1419     use tail recursion to avoid using another stack frame, except when there is
1420     unlimited repeat of a possibly empty group. In the latter case, a recursive
1421     call to match() is always required, unless the second alternative doesn't
1422     exist, in which case we can just plough on. Note that, for compatibility
1423     with Perl, the | in a conditional group is NOT treated as creating two
1424     alternatives. If a THEN is encountered in the branch, it propagates out to
1425     the enclosing alternative (unless nested in a deeper set of alternatives,
1426     of course). */
1427 nigel 91
1428 nigel 93 if (condition || *ecode == OP_ALT)
1429     {
1430 ph10 716 if (op != OP_SCOND)
1431 ph10 702 {
1432     ecode += 1 + LINK_SIZE;
1433     goto TAIL_RECURSE;
1434 ph10 708 }
1435 ph10 733
1436 ph10 716 md->match_function_type = MATCH_CBEGROUP;
1437 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1438     RRETURN(rrc);
1439 nigel 77 }
1440 ph10 708
1441 ph10 702 /* Condition false & no alternative; continue after the group. */
1442 ph10 708
1443 ph10 702 else
1444 nigel 93 {
1445     ecode += 1 + LINK_SIZE;
1446     }
1447     break;
1448 nigel 77
1449 ph10 461
1450 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1451     to close any currently open capturing brackets. */
1452 ph10 461
1453 ph10 447 case OP_CLOSE:
1454 ph10 461 number = GET2(ecode, 1);
1455 ph10 447 offset = number << 1;
1456 ph10 461
1457 ph10 475 #ifdef PCRE_DEBUG
1458 ph10 447 printf("end bracket %d at *ACCEPT", number);
1459     printf("\n");
1460     #endif
1461 nigel 77
1462 ph10 447 md->capture_last = number;
1463     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1464     {
1465     md->offset_vector[offset] =
1466     md->offset_vector[md->offset_end - number];
1467 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1468 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1469     }
1470     ecode += 3;
1471 ph10 461 break;
1472 ph10 447
1473    
1474 ph10 619 /* End of the pattern, either real or forced. */
1475 nigel 77
1476 ph10 619 case OP_END:
1477 ph10 210 case OP_ACCEPT:
1478 ph10 625 case OP_ASSERT_ACCEPT:
1479    
1480 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1481     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1482 ph10 613 is set and we have matched at the start of the subject. In both cases,
1483     backtracking will then try other alternatives, if any. */
1484 ph10 443
1485 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1486 ph10 618 md->recursive == NULL &&
1487 ph10 619 (md->notempty ||
1488     (md->notempty_atstart &&
1489     mstart == md->start_subject + md->start_offset)))
1490 ph10 510 MRRETURN(MATCH_NOMATCH);
1491 ph10 443
1492 ph10 442 /* Otherwise, we have a match. */
1493 ph10 625
1494 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1495     md->end_offset_top = offset_top; /* and how many extracts were taken */
1496 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1497 nigel 77
1498 ph10 512 /* For some reason, the macros don't work properly if an expression is
1499     given as the argument to MRRETURN when the heap is in use. */
1500    
1501     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1502     MRRETURN(rrc);
1503    
1504 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1505     matching won't pass the KET for an assertion. If any one branch matches,
1506     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1507     start of each branch to move the current point backwards, so the code at
1508 ph10 625 this level is identical to the lookahead case. When the assertion is part
1509     of a condition, we want to return immediately afterwards. The caller of
1510     this incarnation of the match() function will have set MATCH_CONDASSERT in
1511     md->match_function_type, and one of these opcodes will be the first opcode
1512     that is processed. We use a local variable that is preserved over calls to
1513 ph10 604 match() to remember this case. */
1514 nigel 77
1515     case OP_ASSERT:
1516     case OP_ASSERTBACK:
1517 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1518     {
1519     condassert = TRUE;
1520     md->match_function_type = 0;
1521     }
1522 ph10 625 else condassert = FALSE;
1523    
1524 nigel 77 do
1525     {
1526 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1527 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1528 ph10 500 {
1529     mstart = md->start_match_ptr; /* In case \K reset it */
1530 ph10 630 markptr = md->mark;
1531 ph10 500 break;
1532 ph10 501 }
1533 ph10 733
1534     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1535 ph10 716 as NOMATCH. */
1536 ph10 733
1537 ph10 716 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1538 nigel 77 ecode += GET(ecode, 1);
1539     }
1540     while (*ecode == OP_ALT);
1541 ph10 625
1542 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1543 nigel 77
1544     /* If checking an assertion for a condition, return MATCH_MATCH. */
1545    
1546 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1547 nigel 77
1548     /* Continue from after the assertion, updating the offsets high water
1549     mark, since extracts may have been taken during the assertion. */
1550    
1551     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1552     ecode += 1 + LINK_SIZE;
1553     offset_top = md->end_offset_top;
1554     continue;
1555    
1556 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1557 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1558 ph10 473 branches. */
1559 nigel 77
1560     case OP_ASSERT_NOT:
1561     case OP_ASSERTBACK_NOT:
1562 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1563     {
1564     condassert = TRUE;
1565     md->match_function_type = 0;
1566     }
1567 ph10 625 else condassert = FALSE;
1568 ph10 604
1569 nigel 77 do
1570     {
1571 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1572 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1573 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1574     {
1575     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1576 ph10 482 break;
1577     }
1578 ph10 716
1579 ph10 733 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580 ph10 716 as NOMATCH. */
1581    
1582     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 nigel 77 ecode += GET(ecode,1);
1584     }
1585     while (*ecode == OP_ALT);
1586    
1587 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1588 ph10 625
1589 nigel 77 ecode += 1 + LINK_SIZE;
1590     continue;
1591    
1592     /* Move the subject pointer back. This occurs only at the start of
1593     each branch of a lookbehind assertion. If we are too close to the start to
1594     move back, this match function fails. When working with UTF-8 we move
1595     back a number of characters, not bytes. */
1596    
1597     case OP_REVERSE:
1598     #ifdef SUPPORT_UTF8
1599     if (utf8)
1600     {
1601 nigel 93 i = GET(ecode, 1);
1602     while (i-- > 0)
1603 nigel 77 {
1604     eptr--;
1605 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1606 ph10 207 BACKCHAR(eptr);
1607 nigel 77 }
1608     }
1609     else
1610     #endif
1611    
1612     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1613    
1614     {
1615 nigel 93 eptr -= GET(ecode, 1);
1616 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1617 nigel 77 }
1618    
1619 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1620 nigel 77
1621 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1622 nigel 77 ecode += 1 + LINK_SIZE;
1623     break;
1624    
1625     /* The callout item calls an external function, if one is provided, passing
1626     details of the match so far. This is mainly for debugging, though the
1627     function is able to force a failure. */
1628    
1629     case OP_CALLOUT:
1630     if (pcre_callout != NULL)
1631     {
1632     pcre_callout_block cb;
1633 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1634 nigel 77 cb.callout_number = ecode[1];
1635     cb.offset_vector = md->offset_vector;
1636 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1637 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1638     cb.start_match = (int)(mstart - md->start_subject);
1639     cb.current_position = (int)(eptr - md->start_subject);
1640 nigel 77 cb.pattern_position = GET(ecode, 2);
1641     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1642     cb.capture_top = offset_top/2;
1643     cb.capture_last = md->capture_last;
1644     cb.callout_data = md->callout_data;
1645 ph10 654 cb.mark = markptr;
1646 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1647 nigel 77 if (rrc < 0) RRETURN(rrc);
1648     }
1649     ecode += 2 + 2*LINK_SIZE;
1650     break;
1651    
1652     /* Recursion either matches the current regex, or some subexpression. The
1653     offset data is the offset to the starting bracket from the start of the
1654     whole pattern. (This is so that it works from duplicated subpatterns.)
1655 ph10 625
1656 ph10 618 The state of the capturing groups is preserved over recursion, and
1657 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1658 ph10 618 finished (offset_top records the completed total) so we just have to save
1659     all the potential data. There may be up to 65535 such values, which is too
1660     large to put on the stack, but using malloc for small numbers seems
1661     expensive. As a compromise, the stack is used when there are no more than
1662     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1663 nigel 77
1664     There are also other values that have to be saved. We use a chained
1665     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1666 ph10 625 for the original version of this logic. It has, however, been hacked around
1667 ph10 618 a lot, so he is not to blame for the current way it works. */
1668 nigel 77
1669     case OP_RECURSE:
1670     {
1671 ph10 642 recursion_info *ri;
1672     int recno;
1673 ph10 654
1674 nigel 77 callpat = md->start_code + GET(ecode, 1);
1675 ph10 642 recno = (callpat == md->start_code)? 0 :
1676 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1677    
1678     /* Check for repeating a recursion without advancing the subject pointer.
1679 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1680 ph10 654 caught at compile time.) */
1681    
1682 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1683 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1684 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1685 nigel 77
1686     /* Add to "recursing stack" */
1687    
1688 ph10 642 new_recursive.group_num = recno;
1689     new_recursive.subject_position = eptr;
1690 nigel 77 new_recursive.prevrec = md->recursive;
1691     md->recursive = &new_recursive;
1692    
1693 ph10 618 /* Where to continue from afterwards */
1694 nigel 77
1695     ecode += 1 + LINK_SIZE;
1696    
1697 ph10 618 /* Now save the offset data */
1698 nigel 77
1699     new_recursive.saved_max = md->offset_end;
1700     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1701     new_recursive.offset_save = stacksave;
1702     else
1703     {
1704     new_recursive.offset_save =
1705     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1706     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1707     }
1708     memcpy(new_recursive.offset_save, md->offset_vector,
1709     new_recursive.saved_max * sizeof(int));
1710 ph10 625
1711 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1712 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1713 ph10 618 might be changed, so reset it before looping. */
1714 nigel 77
1715     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1716 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1717 nigel 77 do
1718     {
1719 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1720 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1721 ph10 604 md, eptrb, RM6);
1722 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1723     new_recursive.saved_max * sizeof(int));
1724 ph10 681 md->recursive = new_recursive.prevrec;
1725 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1726 nigel 77 {
1727 nigel 87 DPRINTF(("Recursion matched\n"));
1728 nigel 77 if (new_recursive.offset_save != stacksave)
1729     (pcre_free)(new_recursive.offset_save);
1730 ph10 618
1731     /* Set where we got to in the subject, and reset the start in case
1732 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1733     for Perl compatibility. */
1734    
1735 ph10 618 eptr = md->end_match_ptr;
1736     mstart = md->start_match_ptr;
1737     goto RECURSION_MATCHED; /* Exit loop; end processing */
1738 nigel 77 }
1739 ph10 716
1740     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1741     as NOMATCH. */
1742    
1743 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1744 nigel 87 {
1745     DPRINTF(("Recursion gave error %d\n", rrc));
1746 ph10 400 if (new_recursive.offset_save != stacksave)
1747     (pcre_free)(new_recursive.offset_save);
1748 nigel 87 RRETURN(rrc);
1749     }
1750 nigel 77
1751     md->recursive = &new_recursive;
1752     callpat += GET(callpat, 1);
1753     }
1754     while (*callpat == OP_ALT);
1755    
1756     DPRINTF(("Recursion didn't match\n"));
1757     md->recursive = new_recursive.prevrec;
1758     if (new_recursive.offset_save != stacksave)
1759     (pcre_free)(new_recursive.offset_save);
1760 ph10 510 MRRETURN(MATCH_NOMATCH);
1761 nigel 77 }
1762 ph10 625
1763 ph10 618 RECURSION_MATCHED:
1764     break;
1765 nigel 77
1766     /* An alternation is the end of a branch; scan along to find the end of the
1767     bracketed group and go to there. */
1768    
1769     case OP_ALT:
1770     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1771     break;
1772    
1773 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1774     indicating that it may occur zero times. It may repeat infinitely, or not
1775     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1776     with fixed upper repeat limits are compiled as a number of copies, with the
1777     optional ones preceded by BRAZERO or BRAMINZERO. */
1778 ph10 625
1779 nigel 77 case OP_BRAZERO:
1780 ph10 604 next = ecode + 1;
1781     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1782     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783     do next += GET(next, 1); while (*next == OP_ALT);
1784     ecode = next + 1 + LINK_SIZE;
1785 nigel 77 break;
1786 ph10 625
1787 nigel 77 case OP_BRAMINZERO:
1788 ph10 604 next = ecode + 1;
1789     do next += GET(next, 1); while (*next == OP_ALT);
1790     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1791     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792     ecode++;
1793 nigel 77 break;
1794    
1795 ph10 335 case OP_SKIPZERO:
1796 ph10 604 next = ecode+1;
1797     do next += GET(next,1); while (*next == OP_ALT);
1798     ecode = next + 1 + LINK_SIZE;
1799 ph10 335 break;
1800 ph10 625
1801 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1802     here; just jump to the group, with allow_zero set TRUE. */
1803 ph10 625
1804 ph10 604 case OP_BRAPOSZERO:
1805 ph10 625 op = *(++ecode);
1806 ph10 604 allow_zero = TRUE;
1807     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1808     goto POSSESSIVE_NON_CAPTURE;
1809 ph10 335
1810 nigel 93 /* End of a group, repeated or non-repeating. */
1811 nigel 77
1812     case OP_KET:
1813     case OP_KETRMIN:
1814     case OP_KETRMAX:
1815 ph10 625 case OP_KETRPOS:
1816 nigel 91 prev = ecode - GET(ecode, 1);
1817 ph10 625
1818 nigel 93 /* If this was a group that remembered the subject start, in order to break
1819     infinite repeats of empty string matches, retrieve the subject start from
1820     the chain. Otherwise, set it NULL. */
1821 nigel 77
1822 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1823 nigel 93 {
1824     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1825     eptrb = eptrb->epb_prev; /* Backup to previous group */
1826     }
1827     else saved_eptr = NULL;
1828 nigel 77
1829 ph10 733 /* If we are at the end of an assertion group or a non-capturing atomic
1830 ph10 723 group, stop matching and return MATCH_MATCH, but record the current high
1831     water mark for use by positive assertions. We also need to record the match
1832     start in case it was changed by \K. */
1833 nigel 93
1834 ph10 723 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1835 ph10 733 *prev == OP_ONCE_NC)
1836 nigel 91 {
1837 ph10 723 md->end_match_ptr = eptr; /* For ONCE_NC */
1838 nigel 91 md->end_offset_top = offset_top;
1839 ph10 500 md->start_match_ptr = mstart;
1840 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1841 nigel 91 }
1842 nigel 77
1843 nigel 93 /* For capturing groups we have to check the group number back at the start
1844     and if necessary complete handling an extraction by setting the offsets and
1845 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1846     into group 0, so it won't be picked up here. Instead, we catch it when the
1847     OP_END is reached. Other recursion is handled here. We just have to record
1848     the current subject position and start match pointer and give a MATCH
1849     return. */
1850 nigel 77
1851 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1852     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1853 nigel 91 {
1854 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1855 nigel 91 offset = number << 1;
1856 ph10 461
1857 ph10 475 #ifdef PCRE_DEBUG
1858 nigel 91 printf("end bracket %d", number);
1859     printf("\n");
1860 nigel 77 #endif
1861    
1862 ph10 618 /* Handle a recursively called group. */
1863    
1864     if (md->recursive != NULL && md->recursive->group_num == number)
1865     {
1866     md->end_match_ptr = eptr;
1867     md->start_match_ptr = mstart;
1868     RRETURN(MATCH_MATCH);
1869     }
1870    
1871     /* Deal with capturing */
1872    
1873 nigel 93 md->capture_last = number;
1874     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1875 nigel 91 {
1876 ph10 625 /* If offset is greater than offset_top, it means that we are
1877     "skipping" a capturing group, and that group's offsets must be marked
1878     unset. In earlier versions of PCRE, all the offsets were unset at the
1879     start of matching, but this doesn't work because atomic groups and
1880 ph10 615 assertions can cause a value to be set that should later be unset.
1881     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1882 ph10 625 part of the atomic group, but this is not on the final matching path,
1883     so must be unset when 2 is set. (If there is no group 2, there is no
1884 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1885 ph10 625
1886 ph10 615 if (offset > offset_top)
1887     {
1888     register int *iptr = md->offset_vector + offset_top;
1889     register int *iend = md->offset_vector + offset;
1890     while (iptr < iend) *iptr++ = -1;
1891 ph10 625 }
1892    
1893 ph10 615 /* Now make the extraction */
1894    
1895 nigel 93 md->offset_vector[offset] =
1896     md->offset_vector[md->offset_end - number];
1897 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1898 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1899     }
1900 nigel 91 }
1901 nigel 77
1902 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1903     also happens for a repeating ket if no characters were matched in the
1904     group. This is the forcible breaking of infinite loops as implemented in
1905 ph10 723 Perl 5.005. For a non-repeating atomic group that includes captures,
1906     establish a backup point by processing the rest of the pattern at a lower
1907     level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1908     original OP_ONCE level, thereby bypassing intermediate backup points, but
1909     resetting any captures that happened along the way. */
1910 nigel 77
1911 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1912     {
1913 ph10 618 if (*prev == OP_ONCE)
1914     {
1915     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1916     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1918 ph10 625 RRETURN(MATCH_ONCE);
1919     }
1920 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1921 nigel 91 break;
1922     }
1923 ph10 625
1924     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1925 ph10 604 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1926     at a time from the outer level, thus saving stack. */
1927 ph10 625
1928 ph10 604 if (*ecode == OP_KETRPOS)
1929 ph10 625 {
1930 ph10 604 md->end_match_ptr = eptr;
1931 ph10 625 md->end_offset_top = offset_top;
1932 ph10 604 RRETURN(MATCH_KETRPOS);
1933 ph10 625 }
1934 nigel 77
1935 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1936     the preceding bracket, in the appropriate order. In the second case, we can
1937     use tail recursion to avoid using another stack frame, unless we have an
1938 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1939     string. */
1940 nigel 77
1941 nigel 91 if (*ecode == OP_KETRMIN)
1942     {
1943 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1944 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 ph10 618 if (*prev == OP_ONCE)
1946     {
1947 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1948 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1950 ph10 625 RRETURN(MATCH_ONCE);
1951     }
1952 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1953 ph10 197 {
1954 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1955 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1956 ph10 197 RRETURN(rrc);
1957     }
1958 nigel 91 ecode = prev;
1959     goto TAIL_RECURSE;
1960 nigel 77 }
1961 nigel 91 else /* OP_KETRMAX */
1962     {
1963 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1964 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1965 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1966 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1967 ph10 618 if (*prev == OP_ONCE)
1968     {
1969 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1970 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971     md->once_target = prev;
1972 ph10 625 RRETURN(MATCH_ONCE);
1973     }
1974 nigel 91 ecode += 1 + LINK_SIZE;
1975     goto TAIL_RECURSE;
1976     }
1977     /* Control never gets here */
1978 nigel 77
1979 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1980 nigel 77
1981     case OP_CIRC:
1982 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1983 ph10 625
1984 nigel 77 /* Start of subject assertion */
1985    
1986     case OP_SOD:
1987 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1988 nigel 77 ecode++;
1989     break;
1990 ph10 625
1991 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1992 nigel 77
1993 ph10 602 case OP_CIRCM:
1994     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1995     if (eptr != md->start_subject &&
1996     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1997     MRRETURN(MATCH_NOMATCH);
1998     ecode++;
1999     break;
2000    
2001 nigel 77 /* Start of match assertion */
2002    
2003     case OP_SOM:
2004 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2005 nigel 77 ecode++;
2006     break;
2007 ph10 172
2008 ph10 168 /* Reset the start of match point */
2009 ph10 172
2010 ph10 168 case OP_SET_SOM:
2011     mstart = eptr;
2012 ph10 172 ecode++;
2013     break;
2014 nigel 77
2015 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
2016     unless noteol is set. */
2017 nigel 77
2018 ph10 602 case OP_DOLLM:
2019     if (eptr < md->end_subject)
2020     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2021     else
2022 nigel 77 {
2023 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2024 ph10 602 SCHECK_PARTIAL();
2025 nigel 77 }
2026 ph10 602 ecode++;
2027     break;
2028 ph10 579
2029 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
2030 ph10 602 subject unless noteol is set. */
2031    
2032     case OP_DOLL:
2033     if (md->noteol) MRRETURN(MATCH_NOMATCH);
2034     if (!md->endonly) goto ASSERT_NL_OR_EOS;
2035    
2036 nigel 91 /* ... else fall through for endonly */
2037 nigel 77
2038     /* End of subject assertion (\z) */
2039    
2040     case OP_EOD:
2041 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2042 ph10 553 SCHECK_PARTIAL();
2043 nigel 77 ecode++;
2044     break;
2045    
2046     /* End of subject or ending \n assertion (\Z) */
2047    
2048     case OP_EODN:
2049 ph10 553 ASSERT_NL_OR_EOS:
2050     if (eptr < md->end_subject &&
2051 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2052 ph10 510 MRRETURN(MATCH_NOMATCH);
2053 ph10 579
2054 ph10 553 /* Either at end of string or \n before end. */
2055 ph10 579
2056 ph10 553 SCHECK_PARTIAL();
2057 nigel 77 ecode++;
2058     break;
2059    
2060     /* Word boundary assertions */
2061    
2062     case OP_NOT_WORD_BOUNDARY:
2063     case OP_WORD_BOUNDARY:
2064     {
2065    
2066     /* Find out if the previous and current characters are "word" characters.
2067     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2068 ph10 443 be "non-word" characters. Remember the earliest consulted character for
2069 ph10 435 partial matching. */
2070 nigel 77
2071     #ifdef SUPPORT_UTF8
2072     if (utf8)
2073     {
2074 ph10 518 /* Get status of previous character */
2075 ph10 527
2076 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
2077     {
2078 ph10 409 USPTR lastptr = eptr - 1;
2079 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
2080 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2081 nigel 77 GETCHAR(c, lastptr);
2082 ph10 527 #ifdef SUPPORT_UCP
2083 ph10 518 if (md->use_ucp)
2084     {
2085     if (c == '_') prev_is_word = TRUE; else
2086 ph10 527 {
2087 ph10 518 int cat = UCD_CATEGORY(c);
2088     prev_is_word = (cat == ucp_L || cat == ucp_N);
2089 ph10 527 }
2090     }
2091     else
2092     #endif
2093 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2094     }
2095 ph10 527
2096 ph10 518 /* Get status of next character */
2097 ph10 527
2098 ph10 443 if (eptr >= md->end_subject)
2099 nigel 77 {
2100 ph10 443 SCHECK_PARTIAL();
2101     cur_is_word = FALSE;
2102 ph10 428 }
2103     else
2104     {
2105 nigel 77 GETCHAR(c, eptr);
2106 ph10 527 #ifdef SUPPORT_UCP
2107 ph10 518 if (md->use_ucp)
2108     {
2109     if (c == '_') cur_is_word = TRUE; else
2110 ph10 527 {
2111 ph10 518 int cat = UCD_CATEGORY(c);
2112     cur_is_word = (cat == ucp_L || cat == ucp_N);
2113 ph10 527 }
2114     }
2115     else
2116     #endif
2117 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2118     }
2119     }
2120     else
2121     #endif
2122    
2123 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2124 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2125 nigel 77
2126     {
2127 ph10 518 /* Get status of previous character */
2128 ph10 527
2129 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2130     {
2131 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2132 ph10 527 #ifdef SUPPORT_UCP
2133 ph10 518 if (md->use_ucp)
2134     {
2135 ph10 527 c = eptr[-1];
2136 ph10 518 if (c == '_') prev_is_word = TRUE; else
2137 ph10 527 {
2138 ph10 518 int cat = UCD_CATEGORY(c);
2139     prev_is_word = (cat == ucp_L || cat == ucp_N);
2140 ph10 527 }
2141     }
2142     else
2143     #endif
2144 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2145     }
2146 ph10 527
2147 ph10 518 /* Get status of next character */
2148 ph10 527
2149 ph10 443 if (eptr >= md->end_subject)
2150 ph10 428 {
2151 ph10 443 SCHECK_PARTIAL();
2152     cur_is_word = FALSE;
2153 ph10 428 }
2154 ph10 527 else
2155     #ifdef SUPPORT_UCP
2156 ph10 518 if (md->use_ucp)
2157     {
2158 ph10 527 c = *eptr;
2159 ph10 518 if (c == '_') cur_is_word = TRUE; else
2160 ph10 527 {
2161 ph10 518 int cat = UCD_CATEGORY(c);
2162     cur_is_word = (cat == ucp_L || cat == ucp_N);
2163 ph10 527 }
2164     }
2165     else
2166     #endif
2167 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2168 nigel 77 }
2169    
2170     /* Now see if the situation is what we want */
2171    
2172     if ((*ecode++ == OP_WORD_BOUNDARY)?
2173     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2174 ph10 510 MRRETURN(MATCH_NOMATCH);
2175 nigel 77 }
2176     break;
2177    
2178     /* Match a single character type; inline for speed */
2179    
2180     case OP_ANY:
2181 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2182 ph10 345 /* Fall through */
2183    
2184 ph10 341 case OP_ALLANY:
2185 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2186     { /* not be updated before SCHECK_PARTIAL. */
2187 ph10 443 SCHECK_PARTIAL();
2188 ph10 510 MRRETURN(MATCH_NOMATCH);
2189 ph10 443 }
2190 ph10 648 eptr++;
2191 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2192 nigel 77 ecode++;
2193     break;
2194    
2195     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2196     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2197    
2198     case OP_ANYBYTE:
2199 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2200     { /* not be updated before SCHECK_PARTIAL. */
2201 ph10 443 SCHECK_PARTIAL();
2202 ph10 510 MRRETURN(MATCH_NOMATCH);
2203 ph10 443 }
2204 ph10 654 eptr++;
2205 nigel 77 ecode++;
2206     break;
2207    
2208     case OP_NOT_DIGIT:
2209 ph10 443 if (eptr >= md->end_subject)
2210 ph10 428 {
2211 ph10 443 SCHECK_PARTIAL();
2212 ph10 510 MRRETURN(MATCH_NOMATCH);
2213 ph10 443 }
2214 nigel 77 GETCHARINCTEST(c, eptr);
2215     if (
2216     #ifdef SUPPORT_UTF8
2217     c < 256 &&
2218     #endif
2219     (md->ctypes[c] & ctype_digit) != 0
2220     )
2221 ph10 510 MRRETURN(MATCH_NOMATCH);
2222 nigel 77 ecode++;
2223     break;
2224    
2225     case OP_DIGIT:
2226 ph10 443 if (eptr >= md->end_subject)
2227 ph10 428 {
2228 ph10 443 SCHECK_PARTIAL();
2229 ph10 510 MRRETURN(MATCH_NOMATCH);
2230 ph10 443 }
2231 nigel 77 GETCHARINCTEST(c, eptr);
2232     if (
2233     #ifdef SUPPORT_UTF8
2234     c >= 256 ||
2235     #endif
2236     (md->ctypes[c] & ctype_digit) == 0
2237     )
2238 ph10 510 MRRETURN(MATCH_NOMATCH);
2239 nigel 77 ecode++;
2240     break;
2241    
2242     case OP_NOT_WHITESPACE:
2243 ph10 443 if (eptr >= md->end_subject)
2244 ph10 428 {
2245 ph10 443 SCHECK_PARTIAL();
2246 ph10 510 MRRETURN(MATCH_NOMATCH);
2247 ph10 443 }
2248 nigel 77 GETCHARINCTEST(c, eptr);
2249     if (
2250     #ifdef SUPPORT_UTF8
2251     c < 256 &&
2252     #endif
2253     (md->ctypes[c] & ctype_space) != 0
2254     )
2255 ph10 510 MRRETURN(MATCH_NOMATCH);
2256 nigel 77 ecode++;
2257     break;
2258    
2259     case OP_WHITESPACE:
2260 ph10 443 if (eptr >= md->end_subject)
2261 ph10 428 {
2262 ph10 443 SCHECK_PARTIAL();
2263 ph10 510 MRRETURN(MATCH_NOMATCH);
2264 ph10 443 }
2265 nigel 77 GETCHARINCTEST(c, eptr);
2266     if (
2267     #ifdef SUPPORT_UTF8
2268     c >= 256 ||
2269     #endif
2270     (md->ctypes[c] & ctype_space) == 0
2271     )
2272 ph10 510 MRRETURN(MATCH_NOMATCH);
2273 nigel 77 ecode++;
2274     break;
2275    
2276     case OP_NOT_WORDCHAR:
2277 ph10 443 if (eptr >= md->end_subject)
2278 ph10 428 {
2279 ph10 443 SCHECK_PARTIAL();
2280 ph10 510 MRRETURN(MATCH_NOMATCH);
2281 ph10 443 }
2282 nigel 77 GETCHARINCTEST(c, eptr);
2283     if (
2284     #ifdef SUPPORT_UTF8
2285     c < 256 &&
2286     #endif
2287     (md->ctypes[c] & ctype_word) != 0
2288     )
2289 ph10 510 MRRETURN(MATCH_NOMATCH);
2290 nigel 77 ecode++;
2291     break;
2292    
2293     case OP_WORDCHAR:
2294 ph10 443 if (eptr >= md->end_subject)
2295 ph10 428 {
2296 ph10 443 SCHECK_PARTIAL();
2297 ph10 510 MRRETURN(MATCH_NOMATCH);
2298 ph10 443 }
2299 nigel 77 GETCHARINCTEST(c, eptr);
2300     if (
2301     #ifdef SUPPORT_UTF8
2302     c >= 256 ||
2303     #endif
2304     (md->ctypes[c] & ctype_word) == 0
2305     )
2306 ph10 510 MRRETURN(MATCH_NOMATCH);
2307 nigel 77 ecode++;
2308     break;
2309    
2310 nigel 93 case OP_ANYNL:
2311 ph10 443 if (eptr >= md->end_subject)
2312 ph10 428 {
2313 ph10 443 SCHECK_PARTIAL();
2314 ph10 510 MRRETURN(MATCH_NOMATCH);
2315 ph10 443 }
2316 nigel 93 GETCHARINCTEST(c, eptr);
2317     switch(c)
2318     {
2319 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2320 ph10 625
2321 nigel 93 case 0x000d:
2322     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2323     break;
2324 ph10 231
2325 nigel 93 case 0x000a:
2326 ph10 231 break;
2327    
2328 nigel 93 case 0x000b:
2329     case 0x000c:
2330     case 0x0085:
2331     case 0x2028:
2332     case 0x2029:
2333 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2334 nigel 93 break;
2335     }
2336     ecode++;
2337     break;
2338    
2339 ph10 178 case OP_NOT_HSPACE:
2340 ph10 443 if (eptr >= md->end_subject)
2341 ph10 428 {
2342 ph10 443 SCHECK_PARTIAL();
2343 ph10 510 MRRETURN(MATCH_NOMATCH);
2344 ph10 443 }
2345 ph10 178 GETCHARINCTEST(c, eptr);
2346     switch(c)
2347     {
2348     default: break;
2349     case 0x09: /* HT */
2350     case 0x20: /* SPACE */
2351     case 0xa0: /* NBSP */
2352     case 0x1680: /* OGHAM SPACE MARK */
2353     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2354     case 0x2000: /* EN QUAD */
2355     case 0x2001: /* EM QUAD */
2356     case 0x2002: /* EN SPACE */
2357     case 0x2003: /* EM SPACE */
2358     case 0x2004: /* THREE-PER-EM SPACE */
2359     case 0x2005: /* FOUR-PER-EM SPACE */
2360     case 0x2006: /* SIX-PER-EM SPACE */
2361     case 0x2007: /* FIGURE SPACE */
2362     case 0x2008: /* PUNCTUATION SPACE */
2363     case 0x2009: /* THIN SPACE */
2364     case 0x200A: /* HAIR SPACE */
2365     case 0x202f: /* NARROW NO-BREAK SPACE */
2366     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2367     case 0x3000: /* IDEOGRAPHIC SPACE */
2368 ph10 510 MRRETURN(MATCH_NOMATCH);
2369 ph10 178 }
2370     ecode++;
2371     break;
2372    
2373     case OP_HSPACE:
2374 ph10 443 if (eptr >= md->end_subject)
2375 ph10 428 {
2376 ph10 443 SCHECK_PARTIAL();
2377 ph10 510 MRRETURN(MATCH_NOMATCH);
2378 ph10 443 }
2379 ph10 178 GETCHARINCTEST(c, eptr);
2380     switch(c)
2381     {
2382 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2383 ph10 178 case 0x09: /* HT */
2384     case 0x20: /* SPACE */
2385     case 0xa0: /* NBSP */
2386     case 0x1680: /* OGHAM SPACE MARK */
2387     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2388     case 0x2000: /* EN QUAD */
2389     case 0x2001: /* EM QUAD */
2390     case 0x2002: /* EN SPACE */
2391     case 0x2003: /* EM SPACE */
2392     case 0x2004: /* THREE-PER-EM SPACE */
2393     case 0x2005: /* FOUR-PER-EM SPACE */
2394     case 0x2006: /* SIX-PER-EM SPACE */
2395     case 0x2007: /* FIGURE SPACE */
2396     case 0x2008: /* PUNCTUATION SPACE */
2397     case 0x2009: /* THIN SPACE */
2398     case 0x200A: /* HAIR SPACE */
2399     case 0x202f: /* NARROW NO-BREAK SPACE */
2400     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2401     case 0x3000: /* IDEOGRAPHIC SPACE */
2402     break;
2403     }
2404     ecode++;
2405     break;
2406    
2407     case OP_NOT_VSPACE:
2408 ph10 443 if (eptr >= md->end_subject)
2409 ph10 428 {
2410 ph10 443 SCHECK_PARTIAL();
2411 ph10 510 MRRETURN(MATCH_NOMATCH);
2412 ph10 443 }
2413 ph10 178 GETCHARINCTEST(c, eptr);
2414     switch(c)
2415     {
2416     default: break;
2417     case 0x0a: /* LF */
2418     case 0x0b: /* VT */
2419     case 0x0c: /* FF */
2420     case 0x0d: /* CR */
2421     case 0x85: /* NEL */
2422     case 0x2028: /* LINE SEPARATOR */
2423     case 0x2029: /* PARAGRAPH SEPARATOR */
2424 ph10 510 MRRETURN(MATCH_NOMATCH);
2425 ph10 178 }
2426     ecode++;
2427     break;
2428    
2429     case OP_VSPACE:
2430 ph10 443 if (eptr >= md->end_subject)
2431 ph10 428 {
2432 ph10 443 SCHECK_PARTIAL();
2433 ph10 510 MRRETURN(MATCH_NOMATCH);
2434 ph10 443 }
2435 ph10 178 GETCHARINCTEST(c, eptr);
2436     switch(c)
2437     {
2438 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2439 ph10 178 case 0x0a: /* LF */
2440     case 0x0b: /* VT */
2441     case 0x0c: /* FF */
2442     case 0x0d: /* CR */
2443     case 0x85: /* NEL */
2444     case 0x2028: /* LINE SEPARATOR */
2445     case 0x2029: /* PARAGRAPH SEPARATOR */
2446     break;
2447     }
2448     ecode++;
2449     break;
2450    
2451 nigel 77 #ifdef SUPPORT_UCP
2452     /* Check the next character by Unicode property. We will get here only
2453     if the support is in the binary; otherwise a compile-time error occurs. */
2454    
2455     case OP_PROP:
2456     case OP_NOTPROP:
2457 ph10 443 if (eptr >= md->end_subject)
2458 ph10 428 {
2459 ph10 443 SCHECK_PARTIAL();
2460 ph10 510 MRRETURN(MATCH_NOMATCH);
2461 ph10 443 }
2462 nigel 77 GETCHARINCTEST(c, eptr);
2463     {
2464 ph10 384 const ucd_record *prop = GET_UCD(c);
2465 nigel 77
2466 nigel 87 switch(ecode[1])
2467     {
2468     case PT_ANY:
2469 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2470 nigel 87 break;
2471 nigel 77
2472 nigel 87 case PT_LAMP:
2473 ph10 349 if ((prop->chartype == ucp_Lu ||
2474     prop->chartype == ucp_Ll ||
2475     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2476 ph10 510 MRRETURN(MATCH_NOMATCH);
2477 ph10 517 break;
2478 nigel 87
2479     case PT_GC:
2480 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2481 ph10 510 MRRETURN(MATCH_NOMATCH);
2482 nigel 87 break;
2483    
2484     case PT_PC:
2485 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2486 ph10 510 MRRETURN(MATCH_NOMATCH);
2487 nigel 87 break;
2488    
2489     case PT_SC:
2490 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2491 ph10 510 MRRETURN(MATCH_NOMATCH);
2492 nigel 87 break;
2493 ph10 527
2494 ph10 517 /* These are specials */
2495 ph10 527
2496 ph10 517 case PT_ALNUM:
2497     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2498     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2499     MRRETURN(MATCH_NOMATCH);
2500 ph10 527 break;
2501    
2502 ph10 517 case PT_SPACE: /* Perl space */
2503     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2504     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2505     == (op == OP_NOTPROP))
2506     MRRETURN(MATCH_NOMATCH);
2507 ph10 527 break;
2508    
2509 ph10 517 case PT_PXSPACE: /* POSIX space */
2510     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2511 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2512 ph10 517 c == CHAR_FF || c == CHAR_CR)
2513     == (op == OP_NOTPROP))
2514     MRRETURN(MATCH_NOMATCH);
2515 ph10 527 break;
2516 nigel 87
2517 ph10 527 case PT_WORD:
2518 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2519 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2520 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2521     MRRETURN(MATCH_NOMATCH);
2522 ph10 527 break;
2523    
2524 ph10 517 /* This should never occur */
2525    
2526 nigel 87 default:
2527     RRETURN(PCRE_ERROR_INTERNAL);
2528 nigel 77 }
2529 nigel 87
2530     ecode += 3;
2531 nigel 77 }
2532     break;
2533    
2534     /* Match an extended Unicode sequence. We will get here only if the support
2535     is in the binary; otherwise a compile-time error occurs. */
2536    
2537     case OP_EXTUNI:
2538 ph10 443 if (eptr >= md->end_subject)
2539 ph10 428 {
2540 ph10 443 SCHECK_PARTIAL();
2541 ph10 510 MRRETURN(MATCH_NOMATCH);
2542 ph10 443 }
2543 nigel 77 GETCHARINCTEST(c, eptr);
2544 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2545     while (eptr < md->end_subject)
2546 nigel 77 {
2547 ph10 623 int len = 1;
2548     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2549     if (UCD_CATEGORY(c) != ucp_M) break;
2550     eptr += len;
2551 nigel 77 }
2552     ecode++;
2553     break;
2554     #endif
2555    
2556    
2557     /* Match a back reference, possibly repeatedly. Look past the end of the
2558     item to see if there is repeat information following. The code is similar
2559     to that for character classes, but repeated for efficiency. Then obey
2560     similar code to character type repeats - written out again for speed.
2561     However, if the referenced string is the empty string, always treat
2562     it as matched, any number of times (otherwise there could be infinite
2563     loops). */
2564    
2565     case OP_REF:
2566 ph10 625 case OP_REFI:
2567     caseless = op == OP_REFI;
2568 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2569     ecode += 3;
2570 ph10 345
2571 ph10 595 /* If the reference is unset, there are two possibilities:
2572 ph10 345
2573 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2574     this ensures that every attempt at a match fails. We can't just fail
2575     here, because of the possibility of quantifiers with zero minima.
2576 ph10 345
2577 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2578     so that the back reference matches an empty string.
2579 ph10 345
2580 ph10 595 Otherwise, set the length to the length of what was matched by the
2581     referenced subpattern. */
2582 ph10 345
2583 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2584     length = (md->jscript_compat)? 0 : -1;
2585     else
2586     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2587 nigel 77
2588 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2589 nigel 77
2590 ph10 595 switch (*ecode)
2591     {
2592     case OP_CRSTAR:
2593     case OP_CRMINSTAR:
2594     case OP_CRPLUS:
2595     case OP_CRMINPLUS:
2596     case OP_CRQUERY:
2597     case OP_CRMINQUERY:
2598     c = *ecode++ - OP_CRSTAR;
2599     minimize = (c & 1) != 0;
2600     min = rep_min[c]; /* Pick up values from tables; */
2601     max = rep_max[c]; /* zero for max => infinity */
2602     if (max == 0) max = INT_MAX;
2603     break;
2604 nigel 77
2605 ph10 595 case OP_CRRANGE:
2606     case OP_CRMINRANGE:
2607     minimize = (*ecode == OP_CRMINRANGE);
2608     min = GET2(ecode, 1);
2609     max = GET2(ecode, 3);
2610     if (max == 0) max = INT_MAX;
2611     ecode += 5;
2612     break;
2613 nigel 77
2614 ph10 595 default: /* No repeat follows */
2615 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2616 ph10 595 {
2617     CHECK_PARTIAL();
2618     MRRETURN(MATCH_NOMATCH);
2619 nigel 77 }
2620 ph10 595 eptr += length;
2621     continue; /* With the main loop */
2622     }
2623 nigel 77
2624 ph10 595 /* Handle repeated back references. If the length of the reference is
2625     zero, just continue with the main loop. */
2626 ph10 443
2627 ph10 595 if (length == 0) continue;
2628 nigel 77
2629 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2630     the length of the reference string explicitly rather than passing the
2631     address of eptr, so that eptr can be a register variable. */
2632 nigel 77
2633 ph10 595 for (i = 1; i <= min; i++)
2634     {
2635 ph10 625 int slength;
2636 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2637 nigel 77 {
2638 ph10 595 CHECK_PARTIAL();
2639     MRRETURN(MATCH_NOMATCH);
2640 nigel 77 }
2641 ph10 595 eptr += slength;
2642     }
2643 nigel 77
2644 ph10 595 /* If min = max, continue at the same level without recursion.
2645     They are not both allowed to be zero. */
2646 nigel 77
2647 ph10 595 if (min == max) continue;
2648 nigel 77
2649 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2650 nigel 77
2651 ph10 595 if (minimize)
2652     {
2653     for (fi = min;; fi++)
2654 nigel 77 {
2655 ph10 625 int slength;
2656 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2657 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2658     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2659 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2660 nigel 77 {
2661 ph10 595 CHECK_PARTIAL();
2662     MRRETURN(MATCH_NOMATCH);
2663 nigel 77 }
2664 ph10 595 eptr += slength;
2665 nigel 77 }
2666 ph10 595 /* Control never gets here */
2667     }
2668 nigel 77
2669 ph10 595 /* If maximizing, find the longest string and work backwards */
2670 nigel 77
2671 ph10 595 else
2672     {
2673     pp = eptr;
2674     for (i = min; i < max; i++)
2675 nigel 77 {
2676 ph10 625 int slength;
2677 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2678 nigel 77 {
2679 ph10 595 CHECK_PARTIAL();
2680     break;
2681 nigel 77 }
2682 ph10 595 eptr += slength;
2683 nigel 77 }
2684 ph10 595 while (eptr >= pp)
2685     {
2686 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2687 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688     eptr -= length;
2689     }
2690     MRRETURN(MATCH_NOMATCH);
2691 nigel 77 }
2692     /* Control never gets here */
2693    
2694     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2695     used when all the characters in the class have values in the range 0-255,
2696     and either the matching is caseful, or the characters are in the range
2697     0-127 when UTF-8 processing is enabled. The only difference between
2698     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2699     encountered.
2700    
2701     First, look past the end of the item to see if there is repeat information
2702     following. Then obey similar code to character type repeats - written out
2703     again for speed. */
2704    
2705     case OP_NCLASS:
2706     case OP_CLASS:
2707     {
2708     data = ecode + 1; /* Save for matching */
2709     ecode += 33; /* Advance past the item */
2710    
2711     switch (*ecode)
2712     {
2713     case OP_CRSTAR:
2714     case OP_CRMINSTAR:
2715     case OP_CRPLUS:
2716     case OP_CRMINPLUS:
2717     case OP_CRQUERY:
2718     case OP_CRMINQUERY:
2719     c = *ecode++ - OP_CRSTAR;
2720     minimize = (c & 1) != 0;
2721     min = rep_min[c]; /* Pick up values from tables; */
2722     max = rep_max[c]; /* zero for max => infinity */
2723     if (max == 0) max = INT_MAX;
2724     break;
2725    
2726     case OP_CRRANGE:
2727     case OP_CRMINRANGE:
2728     minimize = (*ecode == OP_CRMINRANGE);
2729     min = GET2(ecode, 1);
2730     max = GET2(ecode, 3);
2731     if (max == 0) max = INT_MAX;
2732     ecode += 5;
2733     break;
2734    
2735     default: /* No repeat follows */
2736     min = max = 1;
2737     break;
2738     }
2739    
2740     /* First, ensure the minimum number of matches are present. */
2741    
2742     #ifdef SUPPORT_UTF8
2743     /* UTF-8 mode */
2744     if (utf8)
2745     {
2746     for (i = 1; i <= min; i++)
2747     {
2748 ph10 427 if (eptr >= md->end_subject)
2749 ph10 426 {
2750 ph10 428 SCHECK_PARTIAL();
2751 ph10 510 MRRETURN(MATCH_NOMATCH);
2752 ph10 427 }
2753 nigel 77 GETCHARINC(c, eptr);
2754     if (c > 255)
2755     {
2756 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2757 nigel 77 }
2758     else
2759     {
2760 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2761 nigel 77 }
2762     }
2763     }
2764     else
2765     #endif
2766     /* Not UTF-8 mode */
2767     {
2768     for (i = 1; i <= min; i++)
2769     {
2770 ph10 427 if (eptr >= md->end_subject)
2771 ph10 426 {
2772 ph10 428 SCHECK_PARTIAL();
2773 ph10 510 MRRETURN(MATCH_NOMATCH);
2774 ph10 427 }
2775 nigel 77 c = *eptr++;
2776 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2777 nigel 77 }
2778     }
2779    
2780     /* If max == min we can continue with the main loop without the
2781     need to recurse. */
2782    
2783     if (min == max) continue;
2784    
2785     /* If minimizing, keep testing the rest of the expression and advancing
2786     the pointer while it matches the class. */
2787    
2788     if (minimize)
2789     {
2790     #ifdef SUPPORT_UTF8
2791     /* UTF-8 mode */
2792     if (utf8)
2793     {
2794     for (fi = min;; fi++)
2795     {
2796 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2797 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2799 ph10 427 if (eptr >= md->end_subject)
2800 ph10 426 {
2801 ph10 427 SCHECK_PARTIAL();
2802 ph10 510 MRRETURN(MATCH_NOMATCH);
2803 ph10 427 }
2804 nigel 77 GETCHARINC(c, eptr);
2805     if (c > 255)
2806     {
2807 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2808 nigel 77 }
2809     else
2810     {
2811 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2812 nigel 77 }
2813     }
2814     }
2815     else
2816     #endif
2817     /* Not UTF-8 mode */
2818     {
2819     for (fi = min;; fi++)
2820     {
2821 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2822 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2824 ph10 427 if (eptr >= md->end_subject)
2825 ph10 426 {
2826 ph10 427 SCHECK_PARTIAL();
2827 ph10 510 MRRETURN(MATCH_NOMATCH);
2828 ph10 427 }
2829 nigel 77 c = *eptr++;
2830 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2831 nigel 77 }
2832     }
2833     /* Control never gets here */
2834     }
2835    
2836     /* If maximizing, find the longest possible run, then work backwards. */
2837    
2838     else
2839     {
2840     pp = eptr;
2841    
2842     #ifdef SUPPORT_UTF8
2843     /* UTF-8 mode */
2844     if (utf8)
2845     {
2846     for (i = min; i < max; i++)
2847     {
2848     int len = 1;
2849 ph10 463 if (eptr >= md->end_subject)
2850 ph10 462 {
2851 ph10 463 SCHECK_PARTIAL();
2852 ph10 462 break;
2853 ph10 463 }
2854 nigel 77 GETCHARLEN(c, eptr, len);
2855     if (c > 255)
2856     {
2857     if (op == OP_CLASS) break;
2858     }
2859     else
2860     {
2861     if ((data[c/8] & (1 << (c&7))) == 0) break;
2862     }
2863     eptr += len;
2864     }
2865     for (;;)
2866     {
2867 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2868 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2869     if (eptr-- == pp) break; /* Stop if tried at original pos */
2870     BACKCHAR(eptr);
2871     }
2872     }
2873     else
2874     #endif
2875     /* Not UTF-8 mode */
2876     {
2877     for (i = min; i < max; i++)
2878     {
2879 ph10 463 if (eptr >= md->end_subject)
2880 ph10 462 {
2881 ph10 463 SCHECK_PARTIAL();
2882 ph10 462 break;
2883 ph10 463 }
2884 nigel 77 c = *eptr;
2885     if ((data[c/8] & (1 << (c&7))) == 0) break;
2886     eptr++;
2887     }
2888     while (eptr >= pp)
2889     {
2890 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2891 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2892 nigel 77 eptr--;
2893     }
2894     }
2895    
2896 ph10 510 MRRETURN(MATCH_NOMATCH);
2897 nigel 77 }
2898     }
2899     /* Control never gets here */
2900    
2901    
2902     /* Match an extended character class. This opcode is encountered only
2903 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2904     mode, because Unicode properties are supported in non-UTF-8 mode. */
2905 nigel 77
2906     #ifdef SUPPORT_UTF8
2907     case OP_XCLASS:
2908     {
2909     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2910     ecode += GET(ecode, 1); /* Advance past the item */
2911    
2912     switch (*ecode)
2913     {
2914     case OP_CRSTAR:
2915     case OP_CRMINSTAR:
2916     case OP_CRPLUS:
2917     case OP_CRMINPLUS:
2918     case OP_CRQUERY:
2919     case OP_CRMINQUERY:
2920     c = *ecode++ - OP_CRSTAR;
2921     minimize = (c & 1) != 0;
2922     min = rep_min[c]; /* Pick up values from tables; */
2923     max = rep_max[c]; /* zero for max => infinity */
2924     if (max == 0) max = INT_MAX;
2925     break;
2926    
2927     case OP_CRRANGE:
2928     case OP_CRMINRANGE:
2929     minimize = (*ecode == OP_CRMINRANGE);
2930     min = GET2(ecode, 1);
2931     max = GET2(ecode, 3);
2932     if (max == 0) max = INT_MAX;
2933     ecode += 5;
2934     break;
2935    
2936     default: /* No repeat follows */
2937     min = max = 1;
2938     break;
2939     }
2940    
2941     /* First, ensure the minimum number of matches are present. */
2942    
2943     for (i = 1; i <= min; i++)
2944     {
2945 ph10 427 if (eptr >= md->end_subject)
2946 ph10 426 {
2947     SCHECK_PARTIAL();
2948 ph10 510 MRRETURN(MATCH_NOMATCH);
2949 ph10 427 }
2950 ph10 384 GETCHARINCTEST(c, eptr);
2951 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2952 nigel 77 }
2953    
2954     /* If max == min we can continue with the main loop without the
2955     need to recurse. */
2956    
2957     if (min == max) continue;
2958    
2959     /* If minimizing, keep testing the rest of the expression and advancing
2960     the pointer while it matches the class. */
2961    
2962     if (minimize)
2963     {
2964     for (fi = min;; fi++)
2965     {
2966 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2967 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2968 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2969 ph10 427 if (eptr >= md->end_subject)
2970 ph10 426 {
2971 ph10 427 SCHECK_PARTIAL();
2972 ph10 510 MRRETURN(MATCH_NOMATCH);
2973 ph10 427 }
2974 ph10 384 GETCHARINCTEST(c, eptr);
2975 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2976 nigel 77 }
2977     /* Control never gets here */
2978     }
2979    
2980     /* If maximizing, find the longest possible run, then work backwards. */
2981    
2982     else
2983     {
2984     pp = eptr;
2985     for (i = min; i < max; i++)
2986     {
2987     int len = 1;
2988 ph10 463 if (eptr >= md->end_subject)
2989 ph10 462 {
2990 ph10 463 SCHECK_PARTIAL();
2991 ph10 462 break;
2992 ph10 463 }
2993 ph10 384 GETCHARLENTEST(c, eptr, len);
2994 nigel 77 if (!_pcre_xclass(c, data)) break;
2995     eptr += len;
2996     }
2997     for(;;)
2998     {
2999 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3000 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001     if (eptr-- == pp) break; /* Stop if tried at original pos */
3002 ph10 214 if (utf8) BACKCHAR(eptr);
3003 nigel 77 }
3004 ph10 510 MRRETURN(MATCH_NOMATCH);
3005 nigel 77 }
3006    
3007     /* Control never gets here */
3008     }
3009     #endif /* End of XCLASS */
3010    
3011     /* Match a single character, casefully */
3012    
3013     case OP_CHAR:
3014     #ifdef SUPPORT_UTF8
3015     if (utf8)
3016     {
3017     length = 1;
3018     ecode++;
3019     GETCHARLEN(fc, ecode, length);
3020 ph10 443 if (length > md->end_subject - eptr)
3021 ph10 428 {
3022     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3023 ph10 510 MRRETURN(MATCH_NOMATCH);
3024 ph10 443 }
3025 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3026 nigel 77 }
3027     else
3028     #endif
3029    
3030     /* Non-UTF-8 mode */
3031     {
3032 ph10 443 if (md->end_subject - eptr < 1)
3033 ph10 428 {
3034     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3035 ph10 510 MRRETURN(MATCH_NOMATCH);
3036 ph10 443 }
3037 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3038 nigel 77 ecode += 2;
3039     }
3040     break;
3041    
3042     /* Match a single character, caselessly */
3043    
3044 ph10 602 case OP_CHARI:
3045 nigel 77 #ifdef SUPPORT_UTF8
3046     if (utf8)
3047     {
3048     length = 1;
3049     ecode++;
3050     GETCHARLEN(fc, ecode, length);
3051    
3052 ph10 443 if (length > md->end_subject - eptr)
3053 ph10 428 {
3054     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3055 ph10 510 MRRETURN(MATCH_NOMATCH);
3056 ph10 443 }
3057 nigel 77
3058     /* If the pattern character's value is < 128, we have only one byte, and
3059     can use the fast lookup table. */
3060    
3061     if (fc < 128)
3062     {
3063 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3064 nigel 77 }
3065    
3066     /* Otherwise we must pick up the subject character */
3067    
3068     else
3069     {
3070 nigel 93 unsigned int dc;
3071 nigel 77 GETCHARINC(dc, eptr);
3072     ecode += length;
3073    
3074     /* If we have Unicode property support, we can use it to test the other
3075 nigel 87 case of the character, if there is one. */
3076 nigel 77
3077     if (fc != dc)
3078     {
3079     #ifdef SUPPORT_UCP
3080 ph10 349 if (dc != UCD_OTHERCASE(fc))
3081 nigel 77 #endif
3082 ph10 510 MRRETURN(MATCH_NOMATCH);
3083 nigel 77 }
3084     }
3085     }
3086     else
3087     #endif /* SUPPORT_UTF8 */
3088    
3089     /* Non-UTF-8 mode */
3090     {
3091 ph10 443 if (md->end_subject - eptr < 1)
3092 ph10 428 {
3093 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3094 ph10 510 MRRETURN(MATCH_NOMATCH);
3095 ph10 443 }
3096 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3097 nigel 77 ecode += 2;
3098     }
3099     break;
3100    
3101 nigel 93 /* Match a single character repeatedly. */
3102 nigel 77
3103     case OP_EXACT:
3104 ph10 602 case OP_EXACTI:
3105 nigel 77 min = max = GET2(ecode, 1);
3106     ecode += 3;
3107     goto REPEATCHAR;
3108    
3109 nigel 93 case OP_POSUPTO:
3110 ph10 602 case OP_POSUPTOI:
3111 nigel 93 possessive = TRUE;
3112     /* Fall through */
3113    
3114 nigel 77 case OP_UPTO:
3115 ph10 602 case OP_UPTOI:
3116 nigel 77 case OP_MINUPTO:
3117 ph10 602 case OP_MINUPTOI:
3118 nigel 77 min = 0;
3119     max = GET2(ecode, 1);
3120 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3121 nigel 77 ecode += 3;
3122     goto REPEATCHAR;
3123    
3124 nigel 93 case OP_POSSTAR:
3125 ph10 602 case OP_POSSTARI:
3126 nigel 93 possessive = TRUE;
3127     min = 0;
3128     max = INT_MAX;
3129     ecode++;
3130     goto REPEATCHAR;
3131    
3132     case OP_POSPLUS:
3133 ph10 602 case OP_POSPLUSI:
3134 nigel 93 possessive = TRUE;
3135     min = 1;
3136     max = INT_MAX;
3137     ecode++;
3138     goto REPEATCHAR;
3139    
3140     case OP_POSQUERY:
3141 ph10 602 case OP_POSQUERYI:
3142 nigel 93 possessive = TRUE;
3143     min = 0;
3144     max = 1;
3145     ecode++;
3146     goto REPEATCHAR;
3147    
3148 nigel 77 case OP_STAR:
3149 ph10 602 case OP_STARI:
3150 nigel 77 case OP_MINSTAR:
3151 ph10 602 case OP_MINSTARI:
3152 nigel 77 case OP_PLUS:
3153 ph10 602 case OP_PLUSI:
3154 nigel 77 case OP_MINPLUS:
3155 ph10 602 case OP_MINPLUSI:
3156 nigel 77 case OP_QUERY:
3157 ph10 602 case OP_QUERYI:
3158 nigel 77 case OP_MINQUERY:
3159 ph10 602 case OP_MINQUERYI:
3160     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3161 nigel 77 minimize = (c & 1) != 0;
3162     min = rep_min[c]; /* Pick up values from tables; */
3163     max = rep_max[c]; /* zero for max => infinity */
3164     if (max == 0) max = INT_MAX;
3165    
3166 ph10 426 /* Common code for all repeated single-character matches. */
3167 nigel 77
3168     REPEATCHAR:
3169     #ifdef SUPPORT_UTF8
3170     if (utf8)
3171     {
3172     length = 1;
3173     charptr = ecode;
3174     GETCHARLEN(fc, ecode, length);
3175     ecode += length;
3176    
3177     /* Handle multibyte character matching specially here. There is
3178     support for caseless matching if UCP support is present. */
3179    
3180     if (length > 1)
3181     {
3182     #ifdef SUPPORT_UCP
3183 nigel 93 unsigned int othercase;
3184 ph10 602 if (op >= OP_STARI && /* Caseless */
3185 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3186 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3187 ph10 115 else oclength = 0;
3188 nigel 77 #endif /* SUPPORT_UCP */
3189    
3190     for (i = 1; i <= min; i++)
3191     {
3192 ph10 426 if (eptr <= md->end_subject - length &&
3193     memcmp(eptr, charptr, length) == 0) eptr += length;
3194 ph10 123 #ifdef SUPPORT_UCP
3195 ph10 426 else if (oclength > 0 &&
3196     eptr <= md->end_subject - oclength &&
3197     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3198     #endif /* SUPPORT_UCP */
3199 nigel 77 else
3200     {
3201 ph10 426 CHECK_PARTIAL();
3202 ph10 510 MRRETURN(MATCH_NOMATCH);
3203 nigel 77 }
3204     }
3205    
3206     if (min == max) continue;
3207    
3208     if (minimize)
3209     {
3210     for (fi = min;; fi++)
3211     {
3212 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3213 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3215 ph10 426 if (eptr <= md->end_subject - length &&
3216     memcmp(eptr, charptr, length) == 0) eptr += length;
3217 ph10 123 #ifdef SUPPORT_UCP
3218 ph10 426 else if (oclength > 0 &&
3219     eptr <= md->end_subject - oclength &&
3220     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3221     #endif /* SUPPORT_UCP */
3222 nigel 77 else
3223     {
3224 ph10 426 CHECK_PARTIAL();
3225 ph10 510 MRRETURN(MATCH_NOMATCH);
3226 nigel 77 }
3227     }
3228     /* Control never gets here */
3229     }
3230 nigel 93
3231     else /* Maximize */
3232 nigel 77 {
3233     pp = eptr;
3234     for (i = min; i < max; i++)
3235     {
3236 ph10 426 if (eptr <= md->end_subject - length &&
3237     memcmp(eptr, charptr, length) == 0) eptr += length;
3238 ph10 123 #ifdef SUPPORT_UCP
3239 ph10 426 else if (oclength > 0 &&
3240     eptr <= md->end_subject - oclength &&
3241     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3242     #endif /* SUPPORT_UCP */
3243 ph10 463 else
3244 ph10 462 {
3245 ph10 463 CHECK_PARTIAL();
3246 ph10 462 break;
3247 ph10 463 }
3248 nigel 77 }
3249 nigel 93
3250     if (possessive) continue;
3251 ph10 427
3252 ph10 120 for(;;)
3253 ph10 426 {
3254 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3255 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3256 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3257 ph10 115 #ifdef SUPPORT_UCP
3258 ph10 426 eptr--;
3259     BACKCHAR(eptr);
3260 ph10 123 #else /* without SUPPORT_UCP */
3261 ph10 426 eptr -= length;
3262 ph10 123 #endif /* SUPPORT_UCP */
3263 ph10 426 }
3264 nigel 77 }
3265     /* Control never gets here */
3266     }
3267    
3268     /* If the length of a UTF-8 character is 1, we fall through here, and
3269     obey the code as for non-UTF-8 characters below, though in this case the
3270     value of fc will always be < 128. */
3271     }
3272     else
3273     #endif /* SUPPORT_UTF8 */
3274    
3275     /* When not in UTF-8 mode, load a single-byte character. */
3276    
3277 ph10 426 fc = *ecode++;
3278 ph10 443
3279 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3280     may not be in UTF-8 mode. The code is duplicated for the caseless and
3281     caseful cases, for speed, since matching characters is likely to be quite
3282     common. First, ensure the minimum number of matches are present. If min =
3283     max, continue at the same level without recursing. Otherwise, if
3284     minimizing, keep trying the rest of the expression and advancing one
3285     matching character if failing, up to the maximum. Alternatively, if
3286     maximizing, find the maximum number of characters and work backwards. */
3287    
3288     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3289     max, eptr));
3290    
3291 ph10 602 if (op >= OP_STARI) /* Caseless */
3292 nigel 77 {
3293     fc = md->lcc[fc];
3294     for (i = 1; i <= min; i++)
3295 ph10 426 {
3296     if (eptr >= md->end_subject)
3297     {
3298     SCHECK_PARTIAL();
3299 ph10 510 MRRETURN(MATCH_NOMATCH);
3300 ph10 426 }
3301 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3302 ph10 426 }
3303 nigel 77 if (min == max) continue;
3304     if (minimize)
3305     {
3306     for (fi = min;; fi++)
3307     {
3308 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3309 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3310 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3311 ph10 426 if (eptr >= md->end_subject)
3312     {
3313 ph10 427 SCHECK_PARTIAL();
3314 ph10 510 MRRETURN(MATCH_NOMATCH);
3315 ph10 426 }
3316 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3317 nigel 77 }
3318     /* Control never gets here */
3319     }
3320 nigel 93 else /* Maximize */
3321 nigel 77 {
3322     pp = eptr;
3323     for (i = min; i < max; i++)
3324     {
3325 ph10 463 if (eptr >= md->end_subject)
3326 ph10 462 {
3327     SCHECK_PARTIAL();
3328     break;
3329 ph10 463 }
3330 ph10 462 if (fc != md->lcc[*eptr]) break;
3331 nigel 77 eptr++;
3332     }
3333 ph10 427
3334 nigel 93 if (possessive) continue;
3335 ph10 427
3336 nigel 77 while (eptr >= pp)
3337     {
3338 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3339 nigel 77 eptr--;
3340     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3341     }
3342 ph10 510 MRRETURN(MATCH_NOMATCH);
3343 nigel 77 }
3344     /* Control never gets here */
3345     }
3346    
3347     /* Caseful comparisons (includes all multi-byte characters) */
3348    
3349     else
3350     {
3351 ph10 427 for (i = 1; i <= min; i++)
3352 ph10 426 {
3353     if (eptr >= md->end_subject)
3354     {
3355     SCHECK_PARTIAL();
3356 ph10 510 MRRETURN(MATCH_NOMATCH);
3357 ph10 426 }
3358 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3359 ph10 427 }
3360 ph10 443
3361 nigel 77 if (min == max) continue;
3362 ph10 443
3363 nigel 77 if (minimize)
3364     {
3365     for (fi = min;; fi++)
3366     {
3367 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3368 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3370 ph10 426 if (eptr >= md->end_subject)
3371 ph10 427 {
3372 ph10 426 SCHECK_PARTIAL();
3373 ph10 510 MRRETURN(MATCH_NOMATCH);
3374 ph10 427 }
3375 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3376 nigel 77 }
3377     /* Control never gets here */
3378     }
3379 nigel 93 else /* Maximize */
3380 nigel 77 {
3381     pp = eptr;
3382     for (i = min; i < max; i++)
3383     {
3384 ph10 463 if (eptr >= md->end_subject)
3385 ph10 462 {
3386 ph10 463 SCHECK_PARTIAL();
3387 ph10 462 break;
3388 ph10 463 }
3389 ph10 462 if (fc != *eptr) break;
3390 nigel 77 eptr++;
3391     }
3392 nigel 93 if (possessive) continue;
3393 ph10 443
3394 nigel 77 while (eptr >= pp)
3395     {
3396 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3397 nigel 77 eptr--;
3398     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399     }
3400 ph10 510 MRRETURN(MATCH_NOMATCH);
3401 nigel 77 }
3402     }
3403     /* Control never gets here */
3404    
3405     /* Match a negated single one-byte character. The character we are
3406     checking can be multibyte. */
3407    
3408     case OP_NOT:
3409 ph10 625 case OP_NOTI:
3410 ph10 443 if (eptr >= md->end_subject)
3411 ph10 428 {
3412 ph10 443 SCHECK_PARTIAL();
3413 ph10 510 MRRETURN(MATCH_NOMATCH);
3414 ph10 443 }
3415 nigel 77 ecode++;
3416     GETCHARINCTEST(c, eptr);
3417 ph10 602 if (op == OP_NOTI) /* The caseless case */
3418 nigel 77 {
3419     #ifdef SUPPORT_UTF8
3420     if (c < 256)
3421     #endif
3422     c = md->lcc[c];
3423 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3424 nigel 77 }
3425 ph10 602 else /* Caseful */
3426 nigel 77 {
3427 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3428 nigel 77 }
3429     break;
3430    
3431     /* Match a negated single one-byte character repeatedly. This is almost a
3432     repeat of the code for a repeated single character, but I haven't found a
3433     nice way of commoning these up that doesn't require a test of the
3434     positive/negative option for each character match. Maybe that wouldn't add
3435     very much to the time taken, but character matching *is* what this is all
3436     about... */
3437    
3438     case OP_NOTEXACT:
3439 ph10 602 case OP_NOTEXACTI:
3440 nigel 77 min = max = GET2(ecode, 1);
3441     ecode += 3;
3442     goto REPEATNOTCHAR;
3443    
3444     case OP_NOTUPTO:
3445 ph10 602 case OP_NOTUPTOI:
3446 nigel 77 case OP_NOTMINUPTO:
3447 ph10 602 case OP_NOTMINUPTOI:
3448 nigel 77 min = 0;
3449     max = GET2(ecode, 1);
3450 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3451 nigel 77 ecode += 3;
3452     goto REPEATNOTCHAR;
3453    
3454 nigel 93 case OP_NOTPOSSTAR:
3455 ph10 602 case OP_NOTPOSSTARI:
3456 nigel 93 possessive = TRUE;
3457     min = 0;
3458     max = INT_MAX;
3459     ecode++;
3460     goto REPEATNOTCHAR;
3461    
3462     case OP_NOTPOSPLUS:
3463 ph10 602 case OP_NOTPOSPLUSI:
3464 nigel 93 possessive = TRUE;
3465     min = 1;
3466     max = INT_MAX;
3467     ecode++;
3468     goto REPEATNOTCHAR;
3469    
3470     case OP_NOTPOSQUERY:
3471 ph10 602 case OP_NOTPOSQUERYI:
3472 nigel 93 possessive = TRUE;
3473     min = 0;
3474     max = 1;
3475     ecode++;
3476     goto REPEATNOTCHAR;
3477    
3478     case OP_NOTPOSUPTO:
3479 ph10 602 case OP_NOTPOSUPTOI:
3480 nigel 93 possessive = TRUE;
3481     min = 0;
3482     max = GET2(ecode, 1);
3483     ecode += 3;
3484     goto REPEATNOTCHAR;
3485    
3486 nigel 77 case OP_NOTSTAR:
3487 ph10 602 case OP_NOTSTARI:
3488 nigel 77 case OP_NOTMINSTAR:
3489 ph10 602 case OP_NOTMINSTARI:
3490 nigel 77 case OP_NOTPLUS:
3491 ph10 602 case OP_NOTPLUSI:
3492 nigel 77 case OP_NOTMINPLUS:
3493 ph10 602 case OP_NOTMINPLUSI:
3494 nigel 77 case OP_NOTQUERY:
3495 ph10 602 case OP_NOTQUERYI:
3496 nigel 77 case OP_NOTMINQUERY:
3497 ph10 602 case OP_NOTMINQUERYI:
3498     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3499 nigel 77 minimize = (c & 1) != 0;
3500     min = rep_min[c]; /* Pick up values from tables; */
3501     max = rep_max[c]; /* zero for max => infinity */
3502     if (max == 0) max = INT_MAX;
3503    
3504 ph10 426 /* Common code for all repeated negated single-byte matches. */
3505 nigel 77
3506     REPEATNOTCHAR:
3507     fc = *ecode++;
3508    
3509     /* The code is duplicated for the caseless and caseful cases, for speed,
3510     since matching characters is likely to be quite common. First, ensure the
3511     minimum number of matches are present. If min = max, continue at the same
3512     level without recursing. Otherwise, if minimizing, keep trying the rest of
3513     the expression and advancing one matching character if failing, up to the
3514     maximum. Alternatively, if maximizing, find the maximum number of
3515     characters and work backwards. */
3516    
3517     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3518     max, eptr));
3519    
3520 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3521 nigel 77 {
3522     fc = md->lcc[fc];
3523    
3524     #ifdef SUPPORT_UTF8
3525     /* UTF-8 mode */
3526     if (utf8)
3527     {
3528 nigel 93 register unsigned int d;
3529 nigel 77 for (i = 1; i <= min; i++)
3530     {
3531 ph10 426 if (eptr >= md->end_subject)
3532     {
3533     SCHECK_PARTIAL();
3534 ph10 510 MRRETURN(MATCH_NOMATCH);
3535 ph10 427 }
3536 nigel 77 GETCHARINC(d, eptr);
3537     if (d < 256) d = md->lcc[d];
3538 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3539 nigel 77 }
3540     }
3541     else
3542     #endif
3543    
3544     /* Not UTF-8 mode */
3545     {
3546     for (i = 1; i <= min; i++)
3547 ph10 426 {
3548     if (eptr >= md->end_subject)
3549     {
3550     SCHECK_PARTIAL();
3551 ph10 510 MRRETURN(MATCH_NOMATCH);
3552 ph10 427 }
3553 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3554 ph10 427 }
3555 nigel 77 }
3556    
3557     if (min == max) continue;
3558    
3559     if (minimize)
3560     {
3561     #ifdef SUPPORT_UTF8
3562     /* UTF-8 mode */
3563     if (utf8)
3564     {
3565 nigel 93 register unsigned int d;
3566 nigel 77 for (fi = min;; fi++)
3567     {
3568 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3569 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3570 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3571 ph10 427 if (eptr >= md->end_subject)
3572 ph10 426 {
3573 ph10 427 SCHECK_PARTIAL();
3574 ph10 510 MRRETURN(MATCH_NOMATCH);
3575 ph10 427 }
3576 nigel 77 GETCHARINC(d, eptr);
3577     if (d < 256) d = md->lcc[d];
3578 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3579 nigel 77 }
3580     }
3581     else
3582     #endif
3583     /* Not UTF-8 mode */
3584     {
3585     for (fi = min;; fi++)
3586     {
3587 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3588 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3589 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3590 ph10 426 if (eptr >= md->end_subject)
3591     {
3592     SCHECK_PARTIAL();
3593 ph10 510 MRRETURN(MATCH_NOMATCH);
3594 ph10 426 }
3595 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3596 nigel 77 }
3597     }
3598     /* Control never gets here */
3599     }
3600    
3601     /* Maximize case */
3602    
3603     else
3604     {
3605     pp = eptr;
3606    
3607     #ifdef SUPPORT_UTF8
3608     /* UTF-8 mode */
3609     if (utf8)
3610     {
3611 nigel 93 register unsigned int d;
3612 nigel 77 for (i = min; i < max; i++)
3613     {
3614     int len = 1;
3615 ph10 463 if (eptr >= md->end_subject)
3616 ph10 462 {
3617 ph10 463 SCHECK_PARTIAL();
3618 ph10 462 break;
3619 ph10 463 }
3620 nigel 77 GETCHARLEN(d, eptr, len);
3621     if (d < 256) d = md->lcc[d];
3622     if (fc == d) break;
3623     eptr += len;
3624     }
3625 nigel 93 if (possessive) continue;
3626     for(;;)
3627 nigel 77 {
3628 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3629 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3630     if (eptr-- == pp) break; /* Stop if tried at original pos */
3631     BACKCHAR(eptr);
3632     }
3633     }
3634     else
3635     #endif
3636     /* Not UTF-8 mode */
3637     {
3638     for (i = min; i < max; i++)
3639     {
3640 ph10 463 if (eptr >= md->end_subject)
3641 ph10 462 {
3642     SCHECK_PARTIAL();
3643     break;
3644 ph10 463 }
3645 ph10 462 if (fc == md->lcc[*eptr]) break;
3646 nigel 77 eptr++;
3647     }
3648 nigel 93 if (possessive) continue;
3649 nigel 77 while (eptr >= pp)
3650     {
3651 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3652 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3653     eptr--;
3654     }
3655     }
3656    
3657 ph10 510 MRRETURN(MATCH_NOMATCH);
3658 nigel 77 }
3659     /* Control never gets here */
3660     }
3661    
3662     /* Caseful comparisons */
3663    
3664     else
3665     {
3666     #ifdef SUPPORT_UTF8
3667     /* UTF-8 mode */
3668     if (utf8)
3669     {
3670 nigel 93 register unsigned int d;
3671 nigel 77 for (i = 1; i <= min; i++)
3672     {