/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 778 - (hide annotations) (download)
Thu Dec 1 17:38:47 2011 UTC (2 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 202160 byte(s)
Fix bug with caseless matching of characters of different lengths when the 
shorter is right at the end of the subject.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
86     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87     because the offset vector is always a multiple of 3 long. */
88    
89     #define REC_STACK_SAVE_MAX 30
90    
91     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92    
93     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95    
96    
97    
98 ph10 475 #ifdef PCRE_DEBUG
99 nigel 77 /*************************************************
100     * Debugging function to print chars *
101     *************************************************/
102    
103     /* Print a sequence of chars in printable format, stopping at the end of the
104     subject if the requested.
105    
106     Arguments:
107     p points to characters
108     length number to print
109     is_subject TRUE if printing from within md->start_subject
110     md pointer to matching data block, if is_subject is TRUE
111    
112     Returns: nothing
113     */
114    
115     static void
116     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
117     {
118 nigel 93 unsigned int c;
119 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120     while (length-- > 0)
121     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122     }
123     #endif
124    
125    
126    
127     /*************************************************
128     * Match a back-reference *
129     *************************************************/
130    
131 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
132     negative, so the match always fails. However, in JavaScript compatibility mode,
133 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 ph10 595 subject bytes matched may be different to the number of reference bytes.
135 nigel 77
136     Arguments:
137     offset index into the offset vector
138 ph10 595 eptr pointer into the subject
139     length length of reference to be matched (number of bytes)
140 nigel 77 md points to match data block
141 ph10 602 caseless TRUE if caseless
142 nigel 77
143 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 nigel 77 */
145    
146 ph10 595 static int
147 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
148 ph10 602 BOOL caseless)
149 nigel 77 {
150 ph10 595 USPTR eptr_start = eptr;
151     register USPTR p = md->start_subject + md->offset_vector[offset];
152 nigel 77
153 ph10 475 #ifdef PCRE_DEBUG
154 nigel 77 if (eptr >= md->end_subject)
155     printf("matching subject <null>");
156     else
157     {
158     printf("matching subject ");
159     pchars(eptr, length, TRUE, md);
160     }
161     printf(" against backref ");
162     pchars(p, length, FALSE, md);
163     printf("\n");
164     #endif
165    
166 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
167 nigel 77
168 ph10 595 if (length < 0) return -1;
169 nigel 77
170 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171     properly if Unicode properties are supported. Otherwise, we can check only
172     ASCII characters. */
173 nigel 77
174 ph10 602 if (caseless)
175 nigel 77 {
176 ph10 354 #ifdef SUPPORT_UTF8
177     #ifdef SUPPORT_UCP
178     if (md->utf8)
179     {
180 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
181 ph10 595 bytes matched may differ, because there are some characters whose upper and
182     lower case versions code as different numbers of bytes. For example, U+023A
183     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 ph10 625 the latter. It is important, therefore, to check the length along the
186 ph10 595 reference, not along the subject (earlier code did this wrong). */
187 ph10 625
188 ph10 595 USPTR endptr = p + length;
189     while (p < endptr)
190 ph10 354 {
191 ph10 358 int c, d;
192 ph10 597 if (eptr >= md->end_subject) return -1;
193 ph10 354 GETCHARINC(c, eptr);
194     GETCHARINC(d, p);
195 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 ph10 358 }
197     }
198 ph10 354 else
199     #endif
200     #endif
201    
202     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203     is no UCP support. */
204 ph10 597 {
205 ph10 625 if (eptr + length > md->end_subject) return -1;
206 ph10 597 while (length-- > 0)
207     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
208 ph10 625 }
209 nigel 77 }
210 ph10 358
211 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
212     are in UTF-8 mode. */
213 ph10 358
214 nigel 77 else
215 ph10 625 {
216     if (eptr + length > md->end_subject) return -1;
217     while (length-- > 0) if (*p++ != *eptr++) return -1;
218 ph10 597 }
219 nigel 77
220 ph10 595 return eptr - eptr_start;
221 nigel 77 }
222    
223    
224    
225     /***************************************************************************
226     ****************************************************************************
227     RECURSION IN THE match() FUNCTION
228    
229 nigel 87 The match() function is highly recursive, though not every recursive call
230     increases the recursive depth. Nevertheless, some regular expressions can cause
231     it to recurse to a great depth. I was writing for Unix, so I just let it call
232     itself recursively. This uses the stack for saving everything that has to be
233     saved for a recursive call. On Unix, the stack can be large, and this works
234     fine.
235 nigel 77
236 nigel 87 It turns out that on some non-Unix-like systems there are problems with
237     programs that use a lot of stack. (This despite the fact that every last chip
238     has oodles of memory these days, and techniques for extending the stack have
239     been known for decades.) So....
240 nigel 77
241     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
242     calls by keeping local variables that need to be preserved in blocks of memory
243 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
244 nigel 77 achieve this so that the actual code doesn't look very different to what it
245     always used to.
246 ph10 164
247 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
248 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
249     Switzer, the use of longjmp() has been abolished, at the cost of having to
250     provide a unique number for each call to RMATCH. There is no way of generating
251     a sequence of numbers at compile time in C. I have given them names, to make
252     them stand out more clearly.
253    
254     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
255     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
256 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
257     don't have indeterminate values; this has meant that the frame size can be
258 ph10 164 reduced because the result can be "passed back" by straight setting of the
259     variable instead of being passed in the frame.
260 nigel 77 ****************************************************************************
261     ***************************************************************************/
262    
263 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
264     below must be updated in sync. */
265 nigel 77
266 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
267     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
268     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
269     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
270 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
271 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
272 ph10 723 RM61, RM62, RM63, RM64, RM65, RM66 };
273 ph10 164
274 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
275 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
276 ph10 501 actually used in this definition. */
277 nigel 77
278     #ifndef NO_RECURSE
279     #define REGISTER register
280 ph10 164
281 ph10 475 #ifdef PCRE_DEBUG
282 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
283 nigel 87 { \
284     printf("match() called in line %d\n", __LINE__); \
285 ph10 771 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
286 nigel 87 printf("to line %d\n", __LINE__); \
287     }
288     #define RRETURN(ra) \
289     { \
290     printf("match() returned %d from line %d ", ra, __LINE__); \
291     return ra; \
292     }
293     #else
294 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
295 ph10 771 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
296 nigel 77 #define RRETURN(ra) return ra
297 nigel 87 #endif
298    
299 nigel 77 #else
300    
301    
302 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
303     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
304     argument of match(), which never changes. */
305 nigel 77
306     #define REGISTER
307    
/* Heap-frame version of RMATCH: simulate a recursive call to match() without
using the C stack. A new heapframe is obtained from pcre_stack_malloc(); if
that fails, PCRE_ERROR_NOMEMORY is passed back via RRETURN. Otherwise the
resume tag "rw" is stored in the current frame's Xwhere, and the new frame is
loaded with the "arguments" of the simulated call: ra -> Xeptr, rb -> Xecode,
rc -> Xoffset_top, re -> Xeptrb, the current mstart value, and the current
depth plus one. The new frame is linked back to the current one through
Xprevframe, becomes the current frame, and control jumps to HEAP_RECURSE.

The label L_<rw> pasted in at the end of the macro marks where execution
continues after the simulated call (presumably reached from the HEAP_RETURN
dispatch code, which is not visible in this part of the file), so each use of
RMATCH must supply a distinct rw value. The "rd" argument is not referenced. */

308 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
309 nigel 77 {\
310 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
311 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
312 ph10 164 frame->Xwhere = rw; \
313     newframe->Xeptr = ra;\
314     newframe->Xecode = rb;\
315 ph10 168 newframe->Xmstart = mstart;\
316 ph10 164 newframe->Xoffset_top = rc;\
317 ph10 602 newframe->Xeptrb = re;\
318 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
319     newframe->Xprevframe = frame;\
320     frame = newframe;\
321     DPRINTF(("restarting from line %d\n", __LINE__));\
322     goto HEAP_RECURSE;\
323     L_##rw:\
324     DPRINTF(("jumped back to line %d\n", __LINE__));\
325 nigel 77 }
326    
/* Heap-frame version of RRETURN: release the current frame with
pcre_stack_free() and step back to the frame that it was "called" from. If
there is such a frame, the result value is placed in rrc and control goes to
HEAP_RETURN; otherwise the frame just released was the top-level one, and the
value is returned from the enclosing function.

The macro reads frame->Xprevframe unconditionally, so it is only safe to use
while "frame" is non-NULL. NOTE(review): the allocation-failure check for the
top-level frame in match() invokes RRETURN when frame is NULL - confirm
whether that path needs a plain "return" instead. */

327     #define RRETURN(ra)\
328     {\
329 ph10 527 heapframe *oldframe = frame;\
330     frame = oldframe->Xprevframe;\
331     (pcre_stack_free)(oldframe);\
332 nigel 77 if (frame != NULL)\
333     {\
334 ph10 164 rrc = ra;\
335     goto HEAP_RETURN;\
336 nigel 77 }\
337     return ra;\
338     }
339    
340    
341     /* Structure for remembering the local variables in a private frame */
342    
/* One simulated stack frame, used only when NO_RECURSE is defined. Each field
Xname holds the value that match() refers to as "name"; match() sets up macros
that map its argument and local variable names onto the fields of the current
frame. Frames are chained from callee to caller so that RRETURN can unwind
them. */

343     typedef struct heapframe {
/* Frame of the simulated caller; NULL marks the top-level frame. */
344     struct heapframe *Xprevframe;
345    
346     /* Function arguments that may change */
347    
348 ph10 409 USPTR Xeptr;
349 nigel 77 const uschar *Xecode;
350 ph10 409 USPTR Xmstart;
351 nigel 77 int Xoffset_top;
352     eptrblock *Xeptrb;
353 nigel 91 unsigned int Xrdepth;
354 nigel 77
355     /* Function local variables */
356    
357 ph10 409 USPTR Xcallpat;
358 ph10 406 #ifdef SUPPORT_UTF8
359 ph10 409 USPTR Xcharptr;
360 ph10 406 #endif
361 ph10 409 USPTR Xdata;
362     USPTR Xnext;
363     USPTR Xpp;
364     USPTR Xprev;
365     USPTR Xsaved_eptr;
366 nigel 77
367     recursion_info Xnew_recursive;
368    
369     BOOL Xcur_is_word;
370     BOOL Xcondition;
371     BOOL Xprev_is_word;
372    
/* These fields exist only when Unicode property support is compiled in. */
373     #ifdef SUPPORT_UCP
374     int Xprop_type;
375 nigel 87 int Xprop_value;
376 nigel 77 int Xprop_fail_result;
377 ph10 123 int Xoclength;
378     uschar Xocchars[8];
379 nigel 77 #endif
380    
381 ph10 403 int Xcodelink;
382 nigel 77 int Xctype;
383 nigel 93 unsigned int Xfc;
384 nigel 77 int Xfi;
385     int Xlength;
386     int Xmax;
387     int Xmin;
388     int Xnumber;
389     int Xoffset;
390     int Xop;
391     int Xsave_capture_last;
392     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Sized by REC_STACK_SAVE_MAX, the maximum number of offset-vector ints that
are saved in the frame for recursive calls. */
393     int Xstacksave[REC_STACK_SAVE_MAX];
394    
395     eptrblock Xnewptrb;
396    
397 ph10 164 /* Where to jump back to */
398 nigel 77
/* Set by RMATCH to its "rw" argument (one of the RMn numbers) before a
simulated call is made from this frame. */
399 ph10 164 int Xwhere;
400 ph10 165
401 nigel 77 } heapframe;
402    
403     #endif
404    
405    
406     /***************************************************************************
407     ***************************************************************************/
408    
409    
410    
411     /*************************************************
412     * Match from current position *
413     *************************************************/
414    
415 nigel 93 /* This function is called recursively in many circumstances. Whenever it
416 nigel 77 returns a negative (error) response, the outer incarnation must also return the
417 ph10 426 same response. */
418 nigel 77
419 ph10 426 /* These macros pack up tests that are used for partial matching, and which
420 ph10 778 appear several times in the code. We set the "hit end" flag if the pointer is
421 ph10 426 at the end of the subject and also past the start of the subject (i.e.
422 ph10 427 something has been matched). For hard partial matching, we then return
423     immediately. The second one is used when we already know we are past the end of
424     the subject. */
425 ph10 426
426     #define CHECK_PARTIAL()\
427 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
428     eptr > md->start_used_ptr) \
429     { \
430     md->hitend = TRUE; \
431 ph10 771 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
432 ph10 427 }
433 ph10 426
434     #define SCHECK_PARTIAL()\
435 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
436     { \
437     md->hitend = TRUE; \
438 ph10 771 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
439 ph10 427 }
440 ph10 426
441 ph10 427
442 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
443     the md structure (e.g. utf8, end_subject) into individual variables to improve
444 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
445     made performance worse.
446    
447     Arguments:
448 nigel 93 eptr pointer to current character in subject
449     ecode pointer to current position in compiled code
450 ph10 168 mstart pointer to the current match start position (can be modified
451 ph10 172 by encountering \K)
452 nigel 77 offset_top current top pointer
453     md pointer to "static" info for the match
454     eptrb pointer to chain of blocks containing eptr at start of
455     brackets - for testing for empty matches
456 nigel 87 rdepth the recursion depth
457 nigel 77
458     Returns: MATCH_MATCH if matched ) these values are >= 0
459     MATCH_NOMATCH if failed to match )
460 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
461 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
462 nigel 87 (e.g. stopped by repeated call or recursion limit)
463 nigel 77 */
464    
465     static int
466 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
467 ph10 771 int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
468 nigel 77 {
469     /* These variables do not need to be preserved over recursion in this function,
470 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
471     "register" because they are used a lot in loops. */
472 nigel 77
473 nigel 91 register int rrc; /* Returns from recursive calls */
474     register int i; /* Used for loops not involving calls to RMATCH() */
475 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
476 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
477 nigel 77
478 nigel 93 BOOL minimize, possessive; /* Quantifier options */
479 ph10 602 BOOL caseless;
480 ph10 403 int condcode;
481 nigel 93
482 nigel 77 /* When recursion is not being used, all "local" variables that have to be
483     preserved over calls to RMATCH() are part of a "frame" which is obtained from
484     heap storage. Set up the top-level frame here; others are obtained from the
485     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
486    
487     #ifdef NO_RECURSE
488 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
489 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
490 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
491    
492     /* Copy in the original argument variables */
493    
494     frame->Xeptr = eptr;
495     frame->Xecode = ecode;
496 ph10 168 frame->Xmstart = mstart;
497 nigel 77 frame->Xoffset_top = offset_top;
498     frame->Xeptrb = eptrb;
499 nigel 87 frame->Xrdepth = rdepth;
500 nigel 77
501     /* This is where control jumps back to to effect "recursion" */
502    
503     HEAP_RECURSE:
504    
505     /* Macros make the argument variables come from the current frame */
506    
507     #define eptr frame->Xeptr
508     #define ecode frame->Xecode
509 ph10 168 #define mstart frame->Xmstart
510 nigel 77 #define offset_top frame->Xoffset_top
511     #define eptrb frame->Xeptrb
512 nigel 87 #define rdepth frame->Xrdepth
513 nigel 77
514     /* Ditto for the local variables */
515    
516     #ifdef SUPPORT_UTF8
517     #define charptr frame->Xcharptr
518     #endif
519     #define callpat frame->Xcallpat
520 ph10 403 #define codelink frame->Xcodelink
521 nigel 77 #define data frame->Xdata
522     #define next frame->Xnext
523     #define pp frame->Xpp
524     #define prev frame->Xprev
525     #define saved_eptr frame->Xsaved_eptr
526    
527     #define new_recursive frame->Xnew_recursive
528    
529     #define cur_is_word frame->Xcur_is_word
530     #define condition frame->Xcondition
531     #define prev_is_word frame->Xprev_is_word
532    
533     #ifdef SUPPORT_UCP
534     #define prop_type frame->Xprop_type
535 nigel 87 #define prop_value frame->Xprop_value
536 nigel 77 #define prop_fail_result frame->Xprop_fail_result
537 ph10 115 #define oclength frame->Xoclength
538     #define occhars frame->Xocchars
539 nigel 77 #endif
540    
541     #define ctype frame->Xctype
542     #define fc frame->Xfc
543     #define fi frame->Xfi
544     #define length frame->Xlength
545     #define max frame->Xmax
546     #define min frame->Xmin
547     #define number frame->Xnumber
548     #define offset frame->Xoffset
549     #define op frame->Xop
550     #define save_capture_last frame->Xsave_capture_last
551     #define save_offset1 frame->Xsave_offset1
552     #define save_offset2 frame->Xsave_offset2
553     #define save_offset3 frame->Xsave_offset3
554     #define stacksave frame->Xstacksave
555    
556     #define newptrb frame->Xnewptrb
557    
558     /* When recursion is being used, local variables are allocated on the stack and
559     get preserved during recursion in the normal way. In this environment, fi and
560     i, and fc and c, can be the same variables. */
561    
562 nigel 93 #else /* NO_RECURSE not defined */
563 nigel 77 #define fi i
564     #define fc c
565    
566 ph10 604 /* Many of the following variables are used only in small blocks of the code.
567     My normal style of coding would have declared them within each of those blocks.
568     However, in order to accommodate the version of this code that uses an external
569     "stack" implemented on the heap, it is easier to declare them all here, so the
570     declarations can be cut out in a block. The only declarations within blocks
571     below are for variables that do not have to be preserved over a recursive call
572     to RMATCH(). */
573 nigel 77
574 ph10 625 #ifdef SUPPORT_UTF8
575     const uschar *charptr;
576     #endif
577     const uschar *callpat;
578     const uschar *data;
579     const uschar *next;
580     USPTR pp;
581     const uschar *prev;
582     USPTR saved_eptr;
583    
584     recursion_info new_recursive;
585    
586     BOOL cur_is_word;
587 nigel 87 BOOL condition;
588 nigel 77 BOOL prev_is_word;
589    
590     #ifdef SUPPORT_UCP
591     int prop_type;
592 nigel 87 int prop_value;
593 nigel 77 int prop_fail_result;
594 ph10 115 int oclength;
595     uschar occhars[8];
596 nigel 77 #endif
597    
598 ph10 399 int codelink;
599 nigel 77 int ctype;
600     int length;
601     int max;
602     int min;
603     int number;
604     int offset;
605     int op;
606     int save_capture_last;
607     int save_offset1, save_offset2, save_offset3;
608     int stacksave[REC_STACK_SAVE_MAX];
609    
610     eptrblock newptrb;
611 nigel 93 #endif /* NO_RECURSE */
612 nigel 77
613 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
614     of the local variables that are used only in localised parts of the code, but
615     still need to be preserved over recursive calls of match(). These macros define
616 ph10 604 the alternative names that are used. */
617    
618     #define allow_zero cur_is_word
619     #define cbegroup condition
620     #define code_offset codelink
621     #define condassert condition
622     #define matched_once prev_is_word
623    
624 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
625     variables. */
626    
627     #ifdef SUPPORT_UCP
628 nigel 87 prop_value = 0;
629 nigel 77 prop_fail_result = 0;
630     #endif
631    
632 nigel 93
633 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
634     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
635     used. Thanks to Ian Taylor for noticing this possibility and sending the
636     original patch. */
637    
638     TAIL_RECURSE:
639    
640 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
641     are specified by the macro RMATCH and RRETURN is used to return. When
642     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
643 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
644 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
645     complicated macro. It has to be used in one particular way. This shouldn't,
646     however, impact performance when true recursion is being used. */
647 nigel 77
648 ph10 164 #ifdef SUPPORT_UTF8
649     utf8 = md->utf8; /* Local copy of the flag */
650     #else
651     utf8 = FALSE;
652     #endif
653    
654 nigel 87 /* First check that we haven't called match() too many times, or that we
655     haven't exceeded the recursive call limit. */
656    
657 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
658 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
659 nigel 77
660 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
661 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
662     done this way to save having to use another function argument, which would take
663 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
664 nigel 77
665 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
666     such remembered pointers, to be checked when we hit the closing ket, in order
667     to break infinite loops that match no characters. When match() is called in
668     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
669     NOT be used with tail recursion, because the memory block that is used is on
670     the stack, so a new one may be required for each match(). */
671    
672     if (md->match_function_type == MATCH_CBEGROUP)
673 nigel 77 {
674 ph10 197 newptrb.epb_saved_eptr = eptr;
675     newptrb.epb_prev = eptrb;
676     eptrb = &newptrb;
677 ph10 604 md->match_function_type = 0;
678 nigel 77 }
679    
680 nigel 93 /* Now start processing the opcodes. */
681 nigel 77
682     for (;;)
683     {
684 nigel 93 minimize = possessive = FALSE;
685 nigel 77 op = *ecode;
686 ph10 625
687 nigel 93 switch(op)
688     {
689 ph10 510 case OP_MARK:
690 ph10 771 md->nomatch_mark = ecode + 2;
691     md->mark = NULL; /* In case previously set by assertion */
692 ph10 510 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
693 ph10 604 eptrb, RM55);
694 ph10 771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
695     md->mark == NULL) md->mark = ecode + 2;
696 ph10 512
697     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
698     argument, and we must check whether that argument matches this MARK's
699     argument. It is passed back in md->start_match_ptr (an overloading of that
700     variable). If it does match, we reset that variable to the current subject
701     position and return MATCH_SKIP. Otherwise, pass back the return code
702 ph10 510 unaltered. */
703 ph10 512
704 ph10 771 else if (rrc == MATCH_SKIP_ARG &&
705     strcmp((char *)(ecode + 2), (char *)(md->start_match_ptr)) == 0)
706 ph10 510 {
707     md->start_match_ptr = eptr;
708     RRETURN(MATCH_SKIP);
709     }
710     RRETURN(rrc);
711    
712 ph10 210 case OP_FAIL:
713 ph10 771 RRETURN(MATCH_NOMATCH);
714 ph10 211
715 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
716 ph10 553
717 ph10 510 case OP_COMMIT:
718     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 ph10 604 eptrb, RM52);
720 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
721 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
722     rrc != MATCH_THEN)
723 ph10 551 RRETURN(rrc);
724 ph10 771 RRETURN(MATCH_COMMIT);
725 ph10 510
726 ph10 551 /* PRUNE overrides THEN */
727 ph10 553
728 ph10 210 case OP_PRUNE:
729     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
730 ph10 604 eptrb, RM51);
731 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
732 ph10 771 RRETURN(MATCH_PRUNE);
733 ph10 211
734 ph10 510 case OP_PRUNE_ARG:
735 ph10 771 md->nomatch_mark = ecode + 2;
736     md->mark = NULL; /* In case previously set by assertion */
737 ph10 510 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 ph10 604 eptrb, RM56);
739 ph10 771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
740     md->mark == NULL) md->mark = ecode + 2;
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
742 ph10 510 RRETURN(MATCH_PRUNE);
743 ph10 211
744 ph10 551 /* SKIP overrides PRUNE and THEN */
745 ph10 553
746 ph10 210 case OP_SKIP:
747     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
748 ph10 604 eptrb, RM53);
749 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
750 ph10 551 RRETURN(rrc);
751 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
752 ph10 771 RRETURN(MATCH_SKIP);
753 ph10 211
754 ph10 771 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
755     nomatch_mark. There is a flag that disables this opcode when re-matching a
756     pattern that ended with a SKIP for which there was not a matching MARK. */
757    
758 ph10 510 case OP_SKIP_ARG:
759 ph10 771 if (md->ignore_skip_arg)
760     {
761     ecode += _pcre_OP_lengths[*ecode] + ecode[1];
762     break;
763     }
764 ph10 510 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
765 ph10 604 eptrb, RM57);
766 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
767 ph10 551 RRETURN(rrc);
768 ph10 512
769     /* Pass back the current skip name by overloading md->start_match_ptr and
770     returning the special MATCH_SKIP_ARG return code. This will either be
771 ph10 771 caught by a matching MARK, or get to the top, where it causes a rematch
772     with the md->ignore_skip_arg flag set. */
773 ph10 512
774 ph10 510 md->start_match_ptr = ecode + 2;
775 ph10 512 RRETURN(MATCH_SKIP_ARG);
776 ph10 553
777 ph10 716 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
778     the branch in which it occurs can be determined. Overload the start of
779     match pointer to do this. */
780 ph10 512
781 ph10 210 case OP_THEN:
782     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
783 ph10 604 eptrb, RM54);
784 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
785 ph10 716 md->start_match_ptr = ecode;
786 ph10 771 RRETURN(MATCH_THEN);
787 ph10 510
788     case OP_THEN_ARG:
789 ph10 771 md->nomatch_mark = ecode + 2;
790     md->mark = NULL; /* In case previously set by assertion */
791 ph10 733 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
792 ph10 716 md, eptrb, RM58);
793 ph10 771 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
794     md->mark == NULL) md->mark = ecode + 2;
795 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
796 ph10 733 md->start_match_ptr = ecode;
797 ph10 212 RRETURN(MATCH_THEN);
798 ph10 733
799 ph10 723 /* Handle an atomic group that does not contain any capturing parentheses.
800 ph10 733 This can be handled like an assertion. Prior to 8.13, all atomic groups
801     were handled this way. In 8.13, the code was changed as below for ONCE, so
802     that backups pass through the group and thereby reset captured values.
803     However, this uses a lot more stack, so in 8.20, atomic groups that do not
804     contain any captures generate OP_ONCE_NC, which can be handled in the old,
805 ph10 723 less stack intensive way.
806 ph10 211
807 ph10 723 Check the alternative branches in turn - the matching won't pass the KET
808     for this kind of subpattern. If any one branch matches, we carry on as at
809     the end of a normal bracket, leaving the subject pointer, but resetting
810     the start-of-match value in case it was changed by \K. */
811    
812     case OP_ONCE_NC:
813     prev = ecode;
814     saved_eptr = eptr;
815     do
816     {
817     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
818     if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
819     {
820     mstart = md->start_match_ptr;
821     break;
822     }
823     if (rrc == MATCH_THEN)
824     {
825     next = ecode + GET(ecode,1);
826 ph10 733 if (md->start_match_ptr < next &&
827 ph10 723 (*ecode == OP_ALT || *next == OP_ALT))
828     rrc = MATCH_NOMATCH;
829 ph10 733 }
830    
831 ph10 723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832     ecode += GET(ecode,1);
833     }
834     while (*ecode == OP_ALT);
835    
836     /* If we hit the end of the group (which could be repeated), fail */
837    
838     if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
839    
840     /* Continue as from after the group, updating the offsets high water
841     mark, since extracts may have been taken. */
842    
843     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
844    
845     offset_top = md->end_offset_top;
846     eptr = md->end_match_ptr;
847    
848     /* For a non-repeating ket, just continue at this level. This also
849     happens for a repeating ket if no characters were matched in the group.
850     This is the forcible breaking of infinite loops as implemented in Perl
851     5.005. */
852    
853     if (*ecode == OP_KET || eptr == saved_eptr)
854     {
855     ecode += 1+LINK_SIZE;
856     break;
857     }
858    
859     /* The repeating kets try the rest of the pattern or restart from the
860     preceding bracket, in the appropriate order. The second "call" of match()
861     uses tail recursion, to avoid using another stack frame. */
862    
863     if (*ecode == OP_KETRMIN)
864     {
865     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
866     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
867     ecode = prev;
868     goto TAIL_RECURSE;
869     }
870     else /* OP_KETRMAX */
871     {
872 ph10 733 md->match_function_type = MATCH_CBEGROUP;
873 ph10 723 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
874     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875     ecode += 1 + LINK_SIZE;
876     goto TAIL_RECURSE;
877     }
878     /* Control never gets here */
879    
880 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
881     unlimited repeat. If there is space in the offset vector, save the current
882     subject position in the working slot at the top of the vector. We mustn't
883     change the current values of the data slot, because they may be set from a
884     previous iteration of this group, and be referred to by a reference inside
885 ph10 625 the group. A failure to match might occur after the group has succeeded,
886 ph10 617 if something later on doesn't match. For this reason, we need to restore
887     the working value and also the values of the final offsets, in case they
888     were set by a previous iteration of the same bracket.
889 nigel 77
890 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
891     a non-capturing bracket. Don't worry about setting the flag for the error
892     case here; that is handled in the code for KET. */
893 nigel 77
894 nigel 93 case OP_CBRA:
895     case OP_SCBRA:
896     number = GET2(ecode, 1+LINK_SIZE);
897 nigel 77 offset = number << 1;
898 ph10 625
899 ph10 475 #ifdef PCRE_DEBUG
900 nigel 93 printf("start bracket %d\n", number);
901     printf("subject=");
902 nigel 77 pchars(eptr, 16, TRUE, md);
903     printf("\n");
904     #endif
905    
906     if (offset < md->offset_max)
907     {
908     save_offset1 = md->offset_vector[offset];
909     save_offset2 = md->offset_vector[offset+1];
910     save_offset3 = md->offset_vector[md->offset_end - number];
911     save_capture_last = md->capture_last;
912    
913     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
914 ph10 531 md->offset_vector[md->offset_end - number] =
915 ph10 530 (int)(eptr - md->start_subject);
916 nigel 77
917 ph10 604 for (;;)
918 nigel 77 {
919 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
920     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
921 ph10 604 eptrb, RM1);
922 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
923 ph10 733
924     /* If we backed up to a THEN, check whether it is within the current
925     branch by comparing the address of the THEN that is passed back with
926 ph10 716 the end of the branch. If it is within the current branch, and the
927     branch is one of two or more alternatives (it either starts or ends
928 ph10 733 with OP_ALT), we have reached the limit of THEN's action, so convert
929     the return code to NOMATCH, which will cause normal backtracking to
930 ph10 716 happen from now on. Otherwise, THEN is passed back to an outer
931 ph10 733 alternative. This implements Perl's treatment of parenthesized groups,
932     where a group not containing | does not affect the current alternative,
933 ph10 716 that is, (X) is NOT the same as (X|(*F)). */
934    
935     if (rrc == MATCH_THEN)
936     {
937     next = ecode + GET(ecode,1);
938 ph10 733 if (md->start_match_ptr < next &&
939 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
940     rrc = MATCH_NOMATCH;
941 ph10 733 }
942    
943 ph10 716 /* Anything other than NOMATCH is passed back. */
944    
945     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
946 nigel 77 md->capture_last = save_capture_last;
947     ecode += GET(ecode, 1);
948 ph10 625 if (*ecode != OP_ALT) break;
949 nigel 77 }
950    
951     DPRINTF(("bracket %d failed\n", number));
952     md->offset_vector[offset] = save_offset1;
953     md->offset_vector[offset+1] = save_offset2;
954     md->offset_vector[md->offset_end - number] = save_offset3;
955 ph10 625
956 ph10 716 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
957 nigel 77
958 ph10 716 RRETURN(rrc);
959 nigel 77 }
960    
961 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962     as a non-capturing bracket. */
963 nigel 77
964 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966    
967 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968 nigel 77
969 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971    
972 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
973 ph10 723 repeat and ONCE group with no captures. Loop for all the alternatives.
974 ph10 708
975 ph10 702 When we get to the final alternative within the brackets, we used to return
976     the result of a recursive call to match() whatever happened so it was
977     possible to reduce stack usage by turning this into a tail recursion,
978     except in the case of a possibly empty group. However, now that there is
979     the possibility of (*THEN) occurring in the final alternative, this
980     optimization is no longer always possible.
981 ph10 625
982 ph10 708 We can optimize if we know there are no (*THEN)s in the pattern; at present
983     this is the best that can be done.
984    
985 ph10 625 MATCH_ONCE is returned when the end of an atomic group is successfully
986     reached, but subsequent matching fails. It passes back up the tree (causing
987     captured values to be reset) until the original atomic group level is
988 ph10 618 reached. This is tested by comparing md->once_target with the start of the
989     group. At this point, the return is converted into MATCH_NOMATCH so that
990     previous backup points can be taken. */
991 nigel 77
992 ph10 618 case OP_ONCE:
993 nigel 93 case OP_BRA:
994     case OP_SBRA:
995     DPRINTF(("start non-capturing bracket\n"));
996 ph10 618
997 nigel 91 for (;;)
998 nigel 77 {
999 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000 ph10 702
1001     /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 ph10 708 the pattern, and this is the final alternative, optimize as described
1003 ph10 702 above. */
1004    
1005     else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006     {
1007     ecode += _pcre_OP_lengths[*ecode];
1008     goto TAIL_RECURSE;
1009 ph10 708 }
1010 ph10 702
1011     /* In all other cases, we have to make another call to match(). */
1012    
1013 ph10 708 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1014 ph10 604 RM2);
1015 ph10 733
1016 ph10 716 /* See comment in the code for capturing groups above about handling
1017     THEN. */
1018    
1019     if (rrc == MATCH_THEN)
1020 ph10 625 {
1021 ph10 716 next = ecode + GET(ecode,1);
1022 ph10 733 if (md->start_match_ptr < next &&
1023 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1024     rrc = MATCH_NOMATCH;
1025 ph10 733 }
1026    
1027     if (rrc != MATCH_NOMATCH)
1028 ph10 716 {
1029 ph10 618 if (rrc == MATCH_ONCE)
1030     {
1031     const uschar *scode = ecode;
1032     if (*scode != OP_ONCE) /* If not at start, find it */
1033     {
1034     while (*scode == OP_ALT) scode += GET(scode, 1);
1035     scode -= GET(scode, 1);
1036 ph10 625 }
1037 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 ph10 625 }
1039 ph10 550 RRETURN(rrc);
1040 ph10 625 }
1041 nigel 77 ecode += GET(ecode, 1);
1042 ph10 625 if (*ecode != OP_ALT) break;
1043 nigel 77 }
1044 ph10 733
1045 ph10 609 RRETURN(MATCH_NOMATCH);
1046    
1047 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
1048 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1049     handled similarly to the normal case above. However, the matching is
1050     different. The end of these brackets will always be OP_KETRPOS, which
1051     returns MATCH_KETRPOS without going further in the pattern. By this means
1052     we can handle the group by iteration rather than recursion, thereby
1053     reducing the amount of stack needed. */
1054 ph10 625
1055 ph10 604 case OP_CBRAPOS:
1056     case OP_SCBRAPOS:
1057     allow_zero = FALSE;
1058 ph10 625
1059 ph10 604 POSSESSIVE_CAPTURE:
1060     number = GET2(ecode, 1+LINK_SIZE);
1061     offset = number << 1;
1062    
1063     #ifdef PCRE_DEBUG
1064     printf("start possessive bracket %d\n", number);
1065     printf("subject=");
1066     pchars(eptr, 16, TRUE, md);
1067     printf("\n");
1068     #endif
1069    
1070     if (offset < md->offset_max)
1071     {
1072     matched_once = FALSE;
1073 ph10 625 code_offset = ecode - md->start_code;
1074 ph10 604
1075     save_offset1 = md->offset_vector[offset];
1076     save_offset2 = md->offset_vector[offset+1];
1077     save_offset3 = md->offset_vector[md->offset_end - number];
1078     save_capture_last = md->capture_last;
1079    
1080     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1081 ph10 625
1082     /* Each time round the loop, save the current subject position for use
1083     when the group matches. For MATCH_MATCH, the group has matched, so we
1084     restart it with a new subject starting position, remembering that we had
1085     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1086     usual. If we haven't matched any alternatives in any iteration, check to
1087     see if a previous iteration matched. If so, the group has matched;
1088     continue from afterwards. Otherwise it has failed; restore the previous
1089 ph10 604 capture values before returning NOMATCH. */
1090 ph10 625
1091 ph10 604 for (;;)
1092     {
1093     md->offset_vector[md->offset_end - number] =
1094     (int)(eptr - md->start_subject);
1095 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1096 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1097     eptrb, RM63);
1098     if (rrc == MATCH_KETRPOS)
1099     {
1100     offset_top = md->end_offset_top;
1101     eptr = md->end_match_ptr;
1102 ph10 625 ecode = md->start_code + code_offset;
1103 ph10 604 save_capture_last = md->capture_last;
1104 ph10 625 matched_once = TRUE;
1105     continue;
1106     }
1107 ph10 733
1108 ph10 716 /* See comment in the code for capturing groups above about handling
1109     THEN. */
1110    
1111     if (rrc == MATCH_THEN)
1112     {
1113     next = ecode + GET(ecode,1);
1114 ph10 733 if (md->start_match_ptr < next &&
1115 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1116     rrc = MATCH_NOMATCH;
1117 ph10 733 }
1118 ph10 716
1119     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 ph10 604 md->capture_last = save_capture_last;
1121     ecode += GET(ecode, 1);
1122 ph10 625 if (*ecode != OP_ALT) break;
1123 ph10 604 }
1124 ph10 610
1125 ph10 604 if (!matched_once)
1126 ph10 625 {
1127 ph10 604 md->offset_vector[offset] = save_offset1;
1128     md->offset_vector[offset+1] = save_offset2;
1129     md->offset_vector[md->offset_end - number] = save_offset3;
1130     }
1131 ph10 625
1132 ph10 604 if (allow_zero || matched_once)
1133 ph10 625 {
1134 ph10 604 ecode += 1 + LINK_SIZE;
1135     break;
1136 ph10 625 }
1137    
1138 ph10 604 RRETURN(MATCH_NOMATCH);
1139     }
1140 ph10 625
1141 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1142     as a non-capturing bracket. */
1143    
1144     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1145     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1146    
1147     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1148    
1149     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1150     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151    
1152 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1153 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1154     without the capturing complication. It is written out separately for speed
1155     and cleanliness. */
1156    
1157     case OP_BRAPOS:
1158     case OP_SBRAPOS:
1159 ph10 625 allow_zero = FALSE;
1160    
1161 ph10 604 POSSESSIVE_NON_CAPTURE:
1162     matched_once = FALSE;
1163 ph10 625 code_offset = ecode - md->start_code;
1164 ph10 604
1165     for (;;)
1166     {
1167 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1168 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1169 ph10 609 eptrb, RM48);
1170 ph10 604 if (rrc == MATCH_KETRPOS)
1171     {
1172 ph10 610 offset_top = md->end_offset_top;
1173 ph10 604 eptr = md->end_match_ptr;
1174 ph10 625 ecode = md->start_code + code_offset;
1175     matched_once = TRUE;
1176     continue;
1177     }
1178 ph10 733
1179 ph10 716 /* See comment in the code for capturing groups above about handling
1180     THEN. */
1181    
1182     if (rrc == MATCH_THEN)
1183     {
1184     next = ecode + GET(ecode,1);
1185 ph10 733 if (md->start_match_ptr < next &&
1186 ph10 716 (*ecode == OP_ALT || *next == OP_ALT))
1187     rrc = MATCH_NOMATCH;
1188 ph10 733 }
1189 ph10 716
1190     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1191 ph10 604 ecode += GET(ecode, 1);
1192 ph10 625 if (*ecode != OP_ALT) break;
1193 ph10 604 }
1194 ph10 625
1195     if (matched_once || allow_zero)
1196 ph10 604 {
1197     ecode += 1 + LINK_SIZE;
1198     break;
1199 ph10 625 }
1200 ph10 604 RRETURN(MATCH_NOMATCH);
1201    
1202     /* Control never reaches here. */
1203    
1204 nigel 77 /* Conditional group: compilation checked that there are no more than
1205     two branches. If the condition is false, skipping the first branch takes us
1206     past the end if there is only one branch, but that's OK because that is
1207 ph10 609 exactly what going to the ket would do. */
1208 nigel 77
1209     case OP_COND:
1210 nigel 93 case OP_SCOND:
1211 ph10 604 codelink = GET(ecode, 1);
1212 ph10 406
1213 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1214     inserted between OP_COND and an assertion condition. */
1215 ph10 392
1216 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1217     {
1218     if (pcre_callout != NULL)
1219     {
1220     pcre_callout_block cb;
1221 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1222 ph10 381 cb.callout_number = ecode[LINK_SIZE+2];
1223     cb.offset_vector = md->offset_vector;
1224     cb.subject = (PCRE_SPTR)md->start_subject;
1225 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1226     cb.start_match = (int)(mstart - md->start_subject);
1227     cb.current_position = (int)(eptr - md->start_subject);
1228 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1229     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1230     cb.capture_top = offset_top/2;
1231     cb.capture_last = md->capture_last;
1232     cb.callout_data = md->callout_data;
1233 ph10 771 cb.mark = md->nomatch_mark;
1234     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1235 ph10 381 if (rrc < 0) RRETURN(rrc);
1236     }
1237     ecode += _pcre_OP_lengths[OP_CALLOUT];
1238     }
1239 ph10 392
1240 ph10 399 condcode = ecode[LINK_SIZE+1];
1241 ph10 406
1242 ph10 381 /* Now see what the actual condition is */
1243 ph10 392
1244 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1245 nigel 77 {
1246 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1247     {
1248 ph10 461 condition = FALSE;
1249     ecode += GET(ecode, 1);
1250     }
1251 ph10 459 else
1252 ph10 461 {
1253 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1254 ph10 751 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1255 ph10 461
1256 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1257     false, but the test was set up by name, scan the table to see if the
1258     name refers to any other numbers, and test them. The condition is true
1259     if any one is set. */
1260 ph10 461
1261 ph10 751 if (!condition && condcode == OP_NRREF)
1262 ph10 459 {
1263     uschar *slotA = md->name_table;
1264     for (i = 0; i < md->name_count; i++)
1265 ph10 461 {
1266     if (GET2(slotA, 0) == recno) break;
1267 ph10 459 slotA += md->name_entry_size;
1268     }
1269 ph10 461
1270 ph10 459 /* Found a name for the number - there can be only one; duplicate
1271     names for different numbers are allowed, but not vice versa. First
1272     scan down for duplicates. */
1273 ph10 461
1274 ph10 459 if (i < md->name_count)
1275 ph10 461 {
1276 ph10 459 uschar *slotB = slotA;
1277     while (slotB > md->name_table)
1278     {
1279     slotB -= md->name_entry_size;
1280     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1281     {
1282     condition = GET2(slotB, 0) == md->recursive->group_num;
1283 ph10 461 if (condition) break;
1284     }
1285 ph10 459 else break;
1286 ph10 461 }
1287    
1288 ph10 459 /* Scan up for duplicates */
1289 ph10 461
1290 ph10 459 if (!condition)
1291 ph10 461 {
1292 ph10 459 slotB = slotA;
1293     for (i++; i < md->name_count; i++)
1294     {
1295     slotB += md->name_entry_size;
1296     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1297     {
1298     condition = GET2(slotB, 0) == md->recursive->group_num;
1299     if (condition) break;
1300 ph10 461 }
1301 ph10 459 else break;
1302 ph10 461 }
1303     }
1304 ph10 459 }
1305 ph10 461 }
1306    
1307 ph10 459 /* Choose branch according to the condition */
1308 ph10 461
1309 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1310     }
1311 ph10 461 }
1312 nigel 93
1313 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1314 nigel 93 {
1315 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1316 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1317 ph10 461
1318 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1319 ph10 461 scan the table to see if the name refers to any other numbers, and test
1320     them. The condition is true if any one is set. This is tediously similar
1321     to the code above, but not close enough to try to amalgamate. */
1322    
1323 ph10 459 if (!condition && condcode == OP_NCREF)
1324     {
1325 ph10 461 int refno = offset >> 1;
1326 ph10 459 uschar *slotA = md->name_table;
1327 ph10 461
1328 ph10 459 for (i = 0; i < md->name_count; i++)
1329 ph10 461 {
1330     if (GET2(slotA, 0) == refno) break;
1331 ph10 459 slotA += md->name_entry_size;
1332     }
1333 ph10 461
1334     /* Found a name for the number - there can be only one; duplicate names
1335     for different numbers are allowed, but not vice versa. First scan down
1336 ph10 459 for duplicates. */
1337 ph10 461
1338 ph10 459 if (i < md->name_count)
1339 ph10 461 {
1340 ph10 459 uschar *slotB = slotA;
1341     while (slotB > md->name_table)
1342     {
1343     slotB -= md->name_entry_size;
1344     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1345     {
1346     offset = GET2(slotB, 0) << 1;
1347 ph10 461 condition = offset < offset_top &&
1348 ph10 459 md->offset_vector[offset] >= 0;
1349 ph10 461 if (condition) break;
1350     }
1351 ph10 459 else break;
1352 ph10 461 }
1353    
1354 ph10 459 /* Scan up for duplicates */
1355 ph10 461
1356 ph10 459 if (!condition)
1357 ph10 461 {
1358 ph10 459 slotB = slotA;
1359     for (i++; i < md->name_count; i++)
1360     {
1361     slotB += md->name_entry_size;
1362     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1363     {
1364     offset = GET2(slotB, 0) << 1;
1365 ph10 461 condition = offset < offset_top &&
1366 ph10 459 md->offset_vector[offset] >= 0;
1367 ph10 461 if (condition) break;
1368     }
1369 ph10 459 else break;
1370 ph10 461 }
1371     }
1372 ph10 459 }
1373 ph10 461 }
1374    
1375 ph10 459 /* Choose branch according to the condition */
1376    
1377 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1378 nigel 77 }
1379    
1380 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1381 nigel 93 {
1382     condition = FALSE;
1383     ecode += GET(ecode, 1);
1384     }
1385    
1386 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1387 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1388     an assertion. */
1389 nigel 77
1390     else
1391     {
1392 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1393 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1394 nigel 77 if (rrc == MATCH_MATCH)
1395     {
1396 ph10 619 if (md->end_offset_top > offset_top)
1397     offset_top = md->end_offset_top; /* Captures may have happened */
1398 nigel 93 condition = TRUE;
1399     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1400 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401     }
1402 ph10 733
1403 ph10 716 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1404 ph10 733 assertion; it is therefore treated as NOMATCH. */
1405 ph10 716
1406 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1407 nigel 77 {
1408     RRETURN(rrc); /* Need braces because of following else */
1409     }
1410 nigel 93 else
1411     {
1412     condition = FALSE;
1413 ph10 399 ecode += codelink;
1414 nigel 93 }
1415     }
1416 nigel 91
1417 ph10 716 /* We are now at the branch that is to be obeyed. As there is only one, can
1418     use tail recursion to avoid using another stack frame, except when there is
1419     unlimited repeat of a possibly empty group. In the latter case, a recursive
1420     call to match() is always required, unless the second alternative doesn't
1421     exist, in which case we can just plough on. Note that, for compatibility
1422     with Perl, the | in a conditional group is NOT treated as creating two
1423     alternatives. If a THEN is encountered in the branch, it propagates out to
1424     the enclosing alternative (unless nested in a deeper set of alternatives,
1425     of course). */
1426 nigel 91
1427 nigel 93 if (condition || *ecode == OP_ALT)
1428     {
1429 ph10 716 if (op != OP_SCOND)
1430 ph10 702 {
1431     ecode += 1 + LINK_SIZE;
1432     goto TAIL_RECURSE;
1433 ph10 708 }
1434 ph10 733
1435 ph10 716 md->match_function_type = MATCH_CBEGROUP;
1436 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1437     RRETURN(rrc);
1438 nigel 77 }
1439 ph10 708
1440 ph10 702 /* Condition false & no alternative; continue after the group. */
1441 ph10 708
1442 ph10 702 else
1443 nigel 93 {
1444     ecode += 1 + LINK_SIZE;
1445     }
1446     break;
1447 nigel 77
1448 ph10 461
1449 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450     to close any currently open capturing brackets. */
1451 ph10 461
1452 ph10 447 case OP_CLOSE:
1453 ph10 461 number = GET2(ecode, 1);
1454 ph10 447 offset = number << 1;
1455 ph10 461
1456 ph10 475 #ifdef PCRE_DEBUG
1457 ph10 447 printf("end bracket %d at *ACCEPT", number);
1458     printf("\n");
1459     #endif
1460 nigel 77
1461 ph10 447 md->capture_last = number;
1462     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1463     {
1464     md->offset_vector[offset] =
1465     md->offset_vector[md->offset_end - number];
1466 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1468     }
1469     ecode += 3;
1470 ph10 461 break;
1471 ph10 447
1472    
1473 ph10 619 /* End of the pattern, either real or forced. */
1474 nigel 77
1475 ph10 619 case OP_END:
1476 ph10 210 case OP_ACCEPT:
1477 ph10 625 case OP_ASSERT_ACCEPT:
1478    
1479 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1480     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 ph10 613 is set and we have matched at the start of the subject. In both cases,
1482     backtracking will then try other alternatives, if any. */
1483 ph10 443
1484 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 ph10 618 md->recursive == NULL &&
1486 ph10 619 (md->notempty ||
1487     (md->notempty_atstart &&
1488     mstart == md->start_subject + md->start_offset)))
1489 ph10 771 RRETURN(MATCH_NOMATCH);
1490 ph10 443
1491 ph10 442 /* Otherwise, we have a match. */
1492 ph10 625
1493 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1494     md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1496 nigel 77
1497 ph10 512 /* For some reason, the macros don't work properly if an expression is
1498 ph10 771 given as the argument to RRETURN when the heap is in use. */
1499 ph10 512
1500     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 ph10 771 RRETURN(rrc);
1502 ph10 512
1503 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1504     matching won't pass the KET for an assertion. If any one branch matches,
1505     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506     start of each branch to move the current point backwards, so the code at
1507 ph10 625 this level is identical to the lookahead case. When the assertion is part
1508     of a condition, we want to return immediately afterwards. The caller of
1509     this incarnation of the match() function will have set MATCH_CONDASSERT in
1510     md->match_function_type, and one of these opcodes will be the first opcode
1511     that is processed. We use a local variable that is preserved over calls to
1512 ph10 604 match() to remember this case. */
1513 nigel 77
1514     case OP_ASSERT:
1515     case OP_ASSERTBACK:
1516 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1517     {
1518     condassert = TRUE;
1519     md->match_function_type = 0;
1520     }
1521 ph10 625 else condassert = FALSE;
1522    
1523 nigel 77 do
1524     {
1525 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1526 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1527 ph10 500 {
1528     mstart = md->start_match_ptr; /* In case \K reset it */
1529     break;
1530 ph10 501 }
1531 ph10 733
1532     /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1533 ph10 716 as NOMATCH. */
1534 ph10 733
1535 ph10 716 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1536 nigel 77 ecode += GET(ecode, 1);
1537     }
1538     while (*ecode == OP_ALT);
1539 ph10 625
1540 ph10 771 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1541 nigel 77
1542     /* If checking an assertion for a condition, return MATCH_MATCH. */
1543    
1544 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1545 nigel 77
1546     /* Continue from after the assertion, updating the offsets high water
1547     mark, since extracts may have been taken during the assertion. */
1548    
1549     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1550     ecode += 1 + LINK_SIZE;
1551     offset_top = md->end_offset_top;
1552     continue;
1553    
1554 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1555 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1556 ph10 473 branches. */
1557 nigel 77
1558     case OP_ASSERT_NOT:
1559     case OP_ASSERTBACK_NOT:
1560 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1561     {
1562     condassert = TRUE;
1563     md->match_function_type = 0;
1564     }
1565 ph10 625 else condassert = FALSE;
1566 ph10 604
1567 nigel 77 do
1568     {
1569 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1570 ph10 771 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1571 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1572     {
1573     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1574 ph10 482 break;
1575     }
1576 ph10 716
1577 ph10 733 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1578 ph10 716 as NOMATCH. */
1579    
1580     if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1581 nigel 77 ecode += GET(ecode,1);
1582     }
1583     while (*ecode == OP_ALT);
1584    
1585 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1586 ph10 625
1587 nigel 77 ecode += 1 + LINK_SIZE;
1588     continue;
1589    
1590     /* Move the subject pointer back. This occurs only at the start of
1591     each branch of a lookbehind assertion. If we are too close to the start to
1592     move back, this match function fails. When working with UTF-8 we move
1593     back a number of characters, not bytes. */
1594    
1595     case OP_REVERSE:
1596     #ifdef SUPPORT_UTF8
1597     if (utf8)
1598     {
1599 nigel 93 i = GET(ecode, 1);
1600     while (i-- > 0)
1601 nigel 77 {
1602     eptr--;
1603 ph10 771 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1604 ph10 207 BACKCHAR(eptr);
1605 nigel 77 }
1606     }
1607     else
1608     #endif
1609    
1610     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1611    
1612     {
1613 nigel 93 eptr -= GET(ecode, 1);
1614 ph10 771 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1615 nigel 77 }
1616    
1617 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1618 nigel 77
1619 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1620 nigel 77 ecode += 1 + LINK_SIZE;
1621     break;
1622    
1623     /* The callout item calls an external function, if one is provided, passing
1624     details of the match so far. This is mainly for debugging, though the
1625     function is able to force a failure. */
1626    
1627     case OP_CALLOUT:
1628     if (pcre_callout != NULL)
1629     {
1630     pcre_callout_block cb;
1631 ph10 645 cb.version = 2; /* Version 2 of the callout block */
1632 nigel 77 cb.callout_number = ecode[1];
1633     cb.offset_vector = md->offset_vector;
1634 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1635 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1636     cb.start_match = (int)(mstart - md->start_subject);
1637     cb.current_position = (int)(eptr - md->start_subject);
1638 nigel 77 cb.pattern_position = GET(ecode, 2);
1639     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1640     cb.capture_top = offset_top/2;
1641     cb.capture_last = md->capture_last;
1642     cb.callout_data = md->callout_data;
1643 ph10 771 cb.mark = md->nomatch_mark;
1644     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1645 nigel 77 if (rrc < 0) RRETURN(rrc);
1646     }
1647     ecode += 2 + 2*LINK_SIZE;
1648     break;
1649    
1650     /* Recursion either matches the current regex, or some subexpression. The
1651     offset data is the offset to the starting bracket from the start of the
1652     whole pattern. (This is so that it works from duplicated subpatterns.)
1653 ph10 625
1654 ph10 618 The state of the capturing groups is preserved over recursion, and
1655 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1656 ph10 618 finished (offset_top records the completed total) so we just have to save
1657     all the potential data. There may be up to 65535 such values, which is too
1658     large to put on the stack, but using malloc for small numbers seems
1659     expensive. As a compromise, the stack is used when there are no more than
1660     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1661 nigel 77
1662     There are also other values that have to be saved. We use a chained
1663     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1664 ph10 625 for the original version of this logic. It has, however, been hacked around
1665 ph10 618 a lot, so he is not to blame for the current way it works. */
1666 nigel 77
1667     case OP_RECURSE:
1668     {
1669 ph10 642 recursion_info *ri;
1670     int recno;
1671 ph10 654
1672 nigel 77 callpat = md->start_code + GET(ecode, 1);
1673 ph10 642 recno = (callpat == md->start_code)? 0 :
1674 ph10 654 GET2(callpat, 1 + LINK_SIZE);
1675    
1676     /* Check for repeating a recursion without advancing the subject pointer.
1677 ph10 642 This should catch convoluted mutual recursions. (Some simple cases are
1678 ph10 654 caught at compile time.) */
1679    
1680 ph10 642 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1681 ph10 654 if (recno == ri->group_num && eptr == ri->subject_position)
1682 ph10 642 RRETURN(PCRE_ERROR_RECURSELOOP);
1683 nigel 77
1684     /* Add to "recursing stack" */
1685    
1686 ph10 642 new_recursive.group_num = recno;
1687     new_recursive.subject_position = eptr;
1688 nigel 77 new_recursive.prevrec = md->recursive;
1689     md->recursive = &new_recursive;
1690    
1691 ph10 618 /* Where to continue from afterwards */
1692 nigel 77
1693     ecode += 1 + LINK_SIZE;
1694    
1695 ph10 618 /* Now save the offset data */
1696 nigel 77
1697     new_recursive.saved_max = md->offset_end;
1698     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1699     new_recursive.offset_save = stacksave;
1700     else
1701     {
1702     new_recursive.offset_save =
1703     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1704     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1705     }
1706     memcpy(new_recursive.offset_save, md->offset_vector,
1707     new_recursive.saved_max * sizeof(int));
1708 ph10 625
1709 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1710 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1711 ph10 618 might be changed, so reset it before looping. */
1712 nigel 77
1713     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1714 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1715 nigel 77 do
1716     {
1717 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1718 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1719 ph10 604 md, eptrb, RM6);
1720 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1721     new_recursive.saved_max * sizeof(int));
1722 ph10 681 md->recursive = new_recursive.prevrec;
1723 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1724 nigel 77 {
1725 nigel 87 DPRINTF(("Recursion matched\n"));
1726 nigel 77 if (new_recursive.offset_save != stacksave)
1727     (pcre_free)(new_recursive.offset_save);
1728 ph10 618
1729     /* Set where we got to in the subject, and reset the start in case
1730 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1731     for Perl compatibility. */
1732    
1733 ph10 618 eptr = md->end_match_ptr;
1734     mstart = md->start_match_ptr;
1735     goto RECURSION_MATCHED; /* Exit loop; end processing */
1736 nigel 77 }
1737 ph10 716
1738     /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1739     as NOMATCH. */
1740    
1741 ph10 733 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1742 nigel 87 {
1743     DPRINTF(("Recursion gave error %d\n", rrc));
1744 ph10 400 if (new_recursive.offset_save != stacksave)
1745     (pcre_free)(new_recursive.offset_save);
1746 nigel 87 RRETURN(rrc);
1747     }
1748 nigel 77
1749     md->recursive = &new_recursive;
1750     callpat += GET(callpat, 1);
1751     }
1752     while (*callpat == OP_ALT);
1753    
1754     DPRINTF(("Recursion didn't match\n"));
1755     md->recursive = new_recursive.prevrec;
1756     if (new_recursive.offset_save != stacksave)
1757     (pcre_free)(new_recursive.offset_save);
1758 ph10 771 RRETURN(MATCH_NOMATCH);
1759 nigel 77 }
1760 ph10 625
1761 ph10 618 RECURSION_MATCHED:
1762     break;
1763 nigel 77
1764     /* An alternation is the end of a branch; scan along to find the end of the
1765     bracketed group and go to there. */
1766    
1767     case OP_ALT:
1768     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1769     break;
1770    
1771 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1772     indicating that it may occur zero times. It may repeat infinitely, or not
1773     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1774     with fixed upper repeat limits are compiled as a number of copies, with the
1775     optional ones preceded by BRAZERO or BRAMINZERO. */
1776 ph10 625
1777 nigel 77 case OP_BRAZERO:
1778 ph10 604 next = ecode + 1;
1779     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1780     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781     do next += GET(next, 1); while (*next == OP_ALT);
1782     ecode = next + 1 + LINK_SIZE;
1783 nigel 77 break;
1784 ph10 625
1785 nigel 77 case OP_BRAMINZERO:
1786 ph10 604 next = ecode + 1;
1787     do next += GET(next, 1); while (*next == OP_ALT);
1788     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1789     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790     ecode++;
1791 nigel 77 break;
1792    
1793 ph10 335 case OP_SKIPZERO:
1794 ph10 604 next = ecode+1;
1795     do next += GET(next,1); while (*next == OP_ALT);
1796     ecode = next + 1 + LINK_SIZE;
1797 ph10 335 break;
1798 ph10 625
1799 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1800     here; just jump to the group, with allow_zero set TRUE. */
1801 ph10 625
1802 ph10 604 case OP_BRAPOSZERO:
1803 ph10 625 op = *(++ecode);
1804 ph10 604 allow_zero = TRUE;
1805     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1806     goto POSSESSIVE_NON_CAPTURE;
1807 ph10 335
1808 nigel 93 /* End of a group, repeated or non-repeating. */
1809 nigel 77
1810     case OP_KET:
1811     case OP_KETRMIN:
1812     case OP_KETRMAX:
1813 ph10 625 case OP_KETRPOS:
1814 nigel 91 prev = ecode - GET(ecode, 1);
1815 ph10 625
1816 nigel 93 /* If this was a group that remembered the subject start, in order to break
1817     infinite repeats of empty string matches, retrieve the subject start from
1818     the chain. Otherwise, set it NULL. */
1819 nigel 77
1820 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1821 nigel 93 {
1822     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1823     eptrb = eptrb->epb_prev; /* Backup to previous group */
1824     }
1825     else saved_eptr = NULL;
1826 nigel 77
1827 ph10 733 /* If we are at the end of an assertion group or a non-capturing atomic
1828 ph10 723 group, stop matching and return MATCH_MATCH, but record the current high
1829     water mark for use by positive assertions. We also need to record the match
1830     start in case it was changed by \K. */
1831 nigel 93
1832 ph10 723 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1833 ph10 733 *prev == OP_ONCE_NC)
1834 nigel 91 {
1835 ph10 723 md->end_match_ptr = eptr; /* For ONCE_NC */
1836 nigel 91 md->end_offset_top = offset_top;
1837 ph10 500 md->start_match_ptr = mstart;
1838 ph10 771 RRETURN(MATCH_MATCH); /* Sets md->mark */
1839 nigel 91 }
1840 nigel 77
1841 nigel 93 /* For capturing groups we have to check the group number back at the start
1842     and if necessary complete handling an extraction by setting the offsets and
1843 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1844     into group 0, so it won't be picked up here. Instead, we catch it when the
1845     OP_END is reached. Other recursion is handled here. We just have to record
1846     the current subject position and start match pointer and give a MATCH
1847     return. */
1848 nigel 77
1849 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1850     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1851 nigel 91 {
1852 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1853 nigel 91 offset = number << 1;
1854 ph10 461
1855 ph10 475 #ifdef PCRE_DEBUG
1856 nigel 91 printf("end bracket %d", number);
1857     printf("\n");
1858 nigel 77 #endif
1859    
1860 ph10 618 /* Handle a recursively called group. */
1861    
1862     if (md->recursive != NULL && md->recursive->group_num == number)
1863     {
1864     md->end_match_ptr = eptr;
1865     md->start_match_ptr = mstart;
1866     RRETURN(MATCH_MATCH);
1867     }
1868    
1869     /* Deal with capturing */
1870    
1871 nigel 93 md->capture_last = number;
1872     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1873 nigel 91 {
1874 ph10 625 /* If offset is greater than offset_top, it means that we are
1875     "skipping" a capturing group, and that group's offsets must be marked
1876     unset. In earlier versions of PCRE, all the offsets were unset at the
1877     start of matching, but this doesn't work because atomic groups and
1878 ph10 615 assertions can cause a value to be set that should later be unset.
1879     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1880 ph10 625 part of the atomic group, but this is not on the final matching path,
1881     so must be unset when 2 is set. (If there is no group 2, there is no
1882 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1883 ph10 625
1884 ph10 615 if (offset > offset_top)
1885     {
1886     register int *iptr = md->offset_vector + offset_top;
1887     register int *iend = md->offset_vector + offset;
1888     while (iptr < iend) *iptr++ = -1;
1889 ph10 625 }
1890    
1891 ph10 615 /* Now make the extraction */
1892    
1893 nigel 93 md->offset_vector[offset] =
1894     md->offset_vector[md->offset_end - number];
1895 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1896 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1897     }
1898 nigel 91 }
1899 nigel 77
1900 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1901     also happens for a repeating ket if no characters were matched in the
1902     group. This is the forcible breaking of infinite loops as implemented in
1903 ph10 723 Perl 5.005. For a non-repeating atomic group that includes captures,
1904     establish a backup point by processing the rest of the pattern at a lower
1905     level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1906     original OP_ONCE level, thereby bypassing intermediate backup points, but
1907     resetting any captures that happened along the way. */
1908 nigel 77
1909 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1910     {
1911 ph10 618 if (*prev == OP_ONCE)
1912     {
1913     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1914     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1915     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1916 ph10 625 RRETURN(MATCH_ONCE);
1917     }
1918 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1919 nigel 91 break;
1920     }
1921 ph10 625
1922     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1923 ph10 604 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1924     at a time from the outer level, thus saving stack. */
1925 ph10 625
1926 ph10 604 if (*ecode == OP_KETRPOS)
1927 ph10 625 {
1928 ph10 604 md->end_match_ptr = eptr;
1929 ph10 625 md->end_offset_top = offset_top;
1930 ph10 604 RRETURN(MATCH_KETRPOS);
1931 ph10 625 }
1932 nigel 77
1933 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1934     the preceding bracket, in the appropriate order. In the second case, we can
1935     use tail recursion to avoid using another stack frame, unless we have an
1936 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1937     string. */
1938 nigel 77
1939 nigel 91 if (*ecode == OP_KETRMIN)
1940     {
1941 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1942 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943 ph10 618 if (*prev == OP_ONCE)
1944     {
1945 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1946 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1947     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1948 ph10 625 RRETURN(MATCH_ONCE);
1949     }
1950 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1951 ph10 197 {
1952 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1953 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1954 ph10 197 RRETURN(rrc);
1955     }
1956 nigel 91 ecode = prev;
1957     goto TAIL_RECURSE;
1958 nigel 77 }
1959 nigel 91 else /* OP_KETRMAX */
1960     {
1961 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1962 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1963 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1964 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1965 ph10 618 if (*prev == OP_ONCE)
1966     {
1967 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1968 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969     md->once_target = prev;
1970 ph10 625 RRETURN(MATCH_ONCE);
1971     }
1972 nigel 91 ecode += 1 + LINK_SIZE;
1973     goto TAIL_RECURSE;
1974     }
1975     /* Control never gets here */
1976 nigel 77
1977 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1978 nigel 77
1979     case OP_CIRC:
1980 ph10 771 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1981 ph10 625
1982 nigel 77 /* Start of subject assertion */
1983    
1984     case OP_SOD:
1985 ph10 771 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1986 nigel 77 ecode++;
1987     break;
1988 ph10 625
1989 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1990 nigel 77
1991 ph10 602 case OP_CIRCM:
1992 ph10 771 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1993 ph10 602 if (eptr != md->start_subject &&
1994     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1995 ph10 771 RRETURN(MATCH_NOMATCH);
1996 ph10 602 ecode++;
1997     break;
1998    
1999 nigel 77 /* Start of match assertion */
2000    
2001     case OP_SOM:
2002 ph10 771 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2003 nigel 77 ecode++;
2004     break;
2005 ph10 172
2006 ph10 168 /* Reset the start of match point */
2007 ph10 172
2008 ph10 168 case OP_SET_SOM:
2009     mstart = eptr;
2010 ph10 172 ecode++;
2011     break;
2012 nigel 77
2013 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
2014     unless noteol is set. */
2015 nigel 77
2016 ph10 602 case OP_DOLLM:
2017     if (eptr < md->end_subject)
2018 ph10 771 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2019 ph10 602 else
2020 nigel 77 {
2021 ph10 771 if (md->noteol) RRETURN(MATCH_NOMATCH);
2022 ph10 602 SCHECK_PARTIAL();
2023 nigel 77 }
2024 ph10 602 ecode++;
2025     break;
2026 ph10 579
2027 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
2028 ph10 602 subject unless noteol is set. */
2029    
2030     case OP_DOLL:
2031 ph10 771 if (md->noteol) RRETURN(MATCH_NOMATCH);
2032 ph10 602 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2033    
2034 nigel 91 /* ... else fall through for endonly */
2035 nigel 77
2036     /* End of subject assertion (\z) */
2037    
2038     case OP_EOD:
2039 ph10 771 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2040 ph10 553 SCHECK_PARTIAL();
2041 nigel 77 ecode++;
2042     break;
2043    
2044     /* End of subject or ending \n assertion (\Z) */
2045    
2046     case OP_EODN:
2047 ph10 553 ASSERT_NL_OR_EOS:
2048     if (eptr < md->end_subject &&
2049 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2050 ph10 771 RRETURN(MATCH_NOMATCH);
2051 ph10 579
2052 ph10 553 /* Either at end of string or \n before end. */
2053 ph10 579
2054 ph10 553 SCHECK_PARTIAL();
2055 nigel 77 ecode++;
2056     break;
2057    
2058     /* Word boundary assertions */
2059    
2060     case OP_NOT_WORD_BOUNDARY:
2061     case OP_WORD_BOUNDARY:
2062     {
2063    
2064     /* Find out if the previous and current characters are "word" characters.
2065     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2066 ph10 443 be "non-word" characters. Remember the earliest consulted character for
2067 ph10 435 partial matching. */
2068 nigel 77
2069     #ifdef SUPPORT_UTF8
2070     if (utf8)
2071     {
2072 ph10 518 /* Get status of previous character */
2073 ph10 527
2074 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
2075     {
2076 ph10 409 USPTR lastptr = eptr - 1;
2077 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
2078 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2079 nigel 77 GETCHAR(c, lastptr);
2080 ph10 527 #ifdef SUPPORT_UCP
2081 ph10 518 if (md->use_ucp)
2082     {
2083     if (c == '_') prev_is_word = TRUE; else
2084 ph10 527 {
2085 ph10 518 int cat = UCD_CATEGORY(c);
2086     prev_is_word = (cat == ucp_L || cat == ucp_N);
2087 ph10 527 }
2088     }
2089     else
2090     #endif
2091 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2092     }
2093 ph10 527
2094 ph10 518 /* Get status of next character */
2095 ph10 527
2096 ph10 443 if (eptr >= md->end_subject)
2097 nigel 77 {
2098 ph10 443 SCHECK_PARTIAL();
2099     cur_is_word = FALSE;
2100 ph10 428 }
2101     else
2102     {
2103 nigel 77 GETCHAR(c, eptr);
2104 ph10 527 #ifdef SUPPORT_UCP
2105 ph10 518 if (md->use_ucp)
2106     {
2107     if (c == '_') cur_is_word = TRUE; else
2108 ph10 527 {
2109 ph10 518 int cat = UCD_CATEGORY(c);
2110     cur_is_word = (cat == ucp_L || cat == ucp_N);
2111 ph10 527 }
2112     }
2113     else
2114     #endif
2115 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2116     }
2117     }
2118     else
2119     #endif
2120    
2121 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2122 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2123 nigel 77
2124     {
2125 ph10 518 /* Get status of previous character */
2126 ph10 527
2127 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2128     {
2129 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2130 ph10 527 #ifdef SUPPORT_UCP
2131 ph10 518 if (md->use_ucp)
2132     {
2133 ph10 527 c = eptr[-1];
2134 ph10 518 if (c == '_') prev_is_word = TRUE; else
2135 ph10 527 {
2136 ph10 518 int cat = UCD_CATEGORY(c);
2137     prev_is_word = (cat == ucp_L || cat == ucp_N);
2138 ph10 527 }
2139     }
2140     else
2141     #endif
2142 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2143     }
2144 ph10 527
2145 ph10 518 /* Get status of next character */
2146 ph10 527
2147 ph10 443 if (eptr >= md->end_subject)
2148 ph10 428 {
2149 ph10 443 SCHECK_PARTIAL();
2150     cur_is_word = FALSE;
2151 ph10 428 }
2152 ph10 527 else
2153     #ifdef SUPPORT_UCP
2154 ph10 518 if (md->use_ucp)
2155     {
2156 ph10 527 c = *eptr;
2157 ph10 518 if (c == '_') cur_is_word = TRUE; else
2158 ph10 527 {
2159 ph10 518 int cat = UCD_CATEGORY(c);
2160     cur_is_word = (cat == ucp_L || cat == ucp_N);
2161 ph10 527 }
2162     }
2163     else
2164     #endif
2165 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2166 nigel 77 }
2167    
2168     /* Now see if the situation is what we want */
2169    
2170     if ((*ecode++ == OP_WORD_BOUNDARY)?
2171     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2172 ph10 771 RRETURN(MATCH_NOMATCH);
2173 nigel 77 }
2174     break;
2175    
2176     /* Match a single character type; inline for speed */
2177    
2178     case OP_ANY:
2179 ph10 771 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2180 ph10 345 /* Fall through */
2181    
2182 ph10 341 case OP_ALLANY:
2183 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2184     { /* not be updated before SCHECK_PARTIAL. */
2185 ph10 443 SCHECK_PARTIAL();
2186 ph10 771 RRETURN(MATCH_NOMATCH);
2187 ph10 443 }
2188 ph10 648 eptr++;
2189 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2190 nigel 77 ecode++;
2191     break;
2192    
2193     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2194     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2195    
2196     case OP_ANYBYTE:
2197 ph10 648 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2198     { /* not be updated before SCHECK_PARTIAL. */
2199 ph10 443 SCHECK_PARTIAL();
2200 ph10 771 RRETURN(MATCH_NOMATCH);
2201 ph10 443 }
2202 ph10 654 eptr++;
2203 nigel 77 ecode++;
2204     break;
2205    
2206     case OP_NOT_DIGIT:
2207 ph10 443 if (eptr >= md->end_subject)
2208 ph10 428 {
2209 ph10 443 SCHECK_PARTIAL();
2210 ph10 771 RRETURN(MATCH_NOMATCH);
2211 ph10 443 }
2212 nigel 77 GETCHARINCTEST(c, eptr);
2213     if (
2214     #ifdef SUPPORT_UTF8
2215     c < 256 &&
2216     #endif
2217     (md->ctypes[c] & ctype_digit) != 0
2218     )
2219 ph10 771 RRETURN(MATCH_NOMATCH);
2220 nigel 77 ecode++;
2221     break;
2222    
2223     case OP_DIGIT:
2224 ph10 443 if (eptr >= md->end_subject)
2225 ph10 428 {
2226 ph10 443 SCHECK_PARTIAL();
2227 ph10 771 RRETURN(MATCH_NOMATCH);
2228 ph10 443 }
2229 nigel 77 GETCHARINCTEST(c, eptr);
2230     if (
2231     #ifdef SUPPORT_UTF8
2232     c >= 256 ||
2233     #endif
2234     (md->ctypes[c] & ctype_digit) == 0
2235     )
2236 ph10 771 RRETURN(MATCH_NOMATCH);
2237 nigel 77 ecode++;
2238     break;
2239    
2240     case OP_NOT_WHITESPACE:
2241 ph10 443 if (eptr >= md->end_subject)
2242 ph10 428 {
2243 ph10 443 SCHECK_PARTIAL();
2244 ph10 771 RRETURN(MATCH_NOMATCH);
2245 ph10 443 }
2246 nigel 77 GETCHARINCTEST(c, eptr);
2247     if (
2248     #ifdef SUPPORT_UTF8
2249     c < 256 &&
2250     #endif
2251     (md->ctypes[c] & ctype_space) != 0
2252     )
2253 ph10 771 RRETURN(MATCH_NOMATCH);
2254 nigel 77 ecode++;
2255     break;
2256    
2257     case OP_WHITESPACE:
2258 ph10 443 if (eptr >= md->end_subject)
2259 ph10 428 {
2260 ph10 443 SCHECK_PARTIAL();
2261 ph10 771 RRETURN(MATCH_NOMATCH);
2262 ph10 443 }
2263 nigel 77 GETCHARINCTEST(c, eptr);
2264     if (
2265     #ifdef SUPPORT_UTF8
2266     c >= 256 ||
2267     #endif
2268     (md->ctypes[c] & ctype_space) == 0
2269     )
2270 ph10 771 RRETURN(MATCH_NOMATCH);
2271 nigel 77 ecode++;
2272     break;
2273    
2274     case OP_NOT_WORDCHAR:
2275 ph10 443 if (eptr >= md->end_subject)
2276 ph10 428 {
2277 ph10 443 SCHECK_PARTIAL();
2278 ph10 771 RRETURN(MATCH_NOMATCH);
2279 ph10 443 }
2280 nigel 77 GETCHARINCTEST(c, eptr);
2281     if (
2282     #ifdef SUPPORT_UTF8
2283     c < 256 &&
2284     #endif
2285     (md->ctypes[c] & ctype_word) != 0
2286     )
2287 ph10 771 RRETURN(MATCH_NOMATCH);
2288 nigel 77 ecode++;
2289     break;
2290    
2291     case OP_WORDCHAR:
2292 ph10 443 if (eptr >= md->end_subject)
2293 ph10 428 {
2294 ph10 443 SCHECK_PARTIAL();
2295 ph10 771 RRETURN(MATCH_NOMATCH);
2296 ph10 443 }
2297 nigel 77 GETCHARINCTEST(c, eptr);
2298     if (
2299     #ifdef SUPPORT_UTF8
2300     c >= 256 ||
2301     #endif
2302     (md->ctypes[c] & ctype_word) == 0
2303     )
2304 ph10 771 RRETURN(MATCH_NOMATCH);
2305 nigel 77 ecode++;
2306     break;
2307    
2308 nigel 93 case OP_ANYNL:
2309 ph10 443 if (eptr >= md->end_subject)
2310 ph10 428 {
2311 ph10 443 SCHECK_PARTIAL();
2312 ph10 771 RRETURN(MATCH_NOMATCH);
2313 ph10 443 }
2314 nigel 93 GETCHARINCTEST(c, eptr);
2315     switch(c)
2316     {
2317 ph10 771 default: RRETURN(MATCH_NOMATCH);
2318 ph10 625
2319 nigel 93 case 0x000d:
2320     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2321     break;
2322 ph10 231
2323 nigel 93 case 0x000a:
2324 ph10 231 break;
2325    
2326 nigel 93 case 0x000b:
2327     case 0x000c:
2328     case 0x0085:
2329     case 0x2028:
2330     case 0x2029:
2331 ph10 771 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2332 nigel 93 break;
2333     }
2334     ecode++;
2335     break;
2336    
2337 ph10 178 case OP_NOT_HSPACE:
2338 ph10 443 if (eptr >= md->end_subject)
2339 ph10 428 {
2340 ph10 443 SCHECK_PARTIAL();
2341 ph10 771 RRETURN(MATCH_NOMATCH);
2342 ph10 443 }
2343 ph10 178 GETCHARINCTEST(c, eptr);
2344     switch(c)
2345     {
2346     default: break;
2347     case 0x09: /* HT */
2348     case 0x20: /* SPACE */
2349     case 0xa0: /* NBSP */
2350     case 0x1680: /* OGHAM SPACE MARK */
2351     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2352     case 0x2000: /* EN QUAD */
2353     case 0x2001: /* EM QUAD */
2354     case 0x2002: /* EN SPACE */
2355     case 0x2003: /* EM SPACE */
2356     case 0x2004: /* THREE-PER-EM SPACE */
2357     case 0x2005: /* FOUR-PER-EM SPACE */
2358     case 0x2006: /* SIX-PER-EM SPACE */
2359     case 0x2007: /* FIGURE SPACE */
2360     case 0x2008: /* PUNCTUATION SPACE */
2361     case 0x2009: /* THIN SPACE */
2362     case 0x200A: /* HAIR SPACE */
2363     case 0x202f: /* NARROW NO-BREAK SPACE */
2364     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2365     case 0x3000: /* IDEOGRAPHIC SPACE */
2366 ph10 771 RRETURN(MATCH_NOMATCH);
2367 ph10 178 }
2368     ecode++;
2369     break;
2370    
2371     case OP_HSPACE:
2372 ph10 443 if (eptr >= md->end_subject)
2373 ph10 428 {
2374 ph10 443 SCHECK_PARTIAL();
2375 ph10 771 RRETURN(MATCH_NOMATCH);
2376 ph10 443 }
2377 ph10 178 GETCHARINCTEST(c, eptr);
2378     switch(c)
2379     {
2380 ph10 771 default: RRETURN(MATCH_NOMATCH);
2381 ph10 178 case 0x09: /* HT */
2382     case 0x20: /* SPACE */
2383     case 0xa0: /* NBSP */
2384     case 0x1680: /* OGHAM SPACE MARK */
2385     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2386     case 0x2000: /* EN QUAD */
2387     case 0x2001: /* EM QUAD */
2388     case 0x2002: /* EN SPACE */
2389     case 0x2003: /* EM SPACE */
2390     case 0x2004: /* THREE-PER-EM SPACE */
2391     case 0x2005: /* FOUR-PER-EM SPACE */
2392     case 0x2006: /* SIX-PER-EM SPACE */
2393     case 0x2007: /* FIGURE SPACE */
2394     case 0x2008: /* PUNCTUATION SPACE */
2395     case 0x2009: /* THIN SPACE */
2396     case 0x200A: /* HAIR SPACE */
2397     case 0x202f: /* NARROW NO-BREAK SPACE */
2398     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2399     case 0x3000: /* IDEOGRAPHIC SPACE */
2400     break;
2401     }
2402     ecode++;
2403     break;
2404    
2405     case OP_NOT_VSPACE:
2406 ph10 443 if (eptr >= md->end_subject)
2407 ph10 428 {
2408 ph10 443 SCHECK_PARTIAL();
2409 ph10 771 RRETURN(MATCH_NOMATCH);
2410 ph10 443 }
2411 ph10 178 GETCHARINCTEST(c, eptr);
2412     switch(c)
2413     {
2414     default: break;
2415     case 0x0a: /* LF */
2416     case 0x0b: /* VT */
2417     case 0x0c: /* FF */
2418     case 0x0d: /* CR */
2419     case 0x85: /* NEL */
2420     case 0x2028: /* LINE SEPARATOR */
2421     case 0x2029: /* PARAGRAPH SEPARATOR */
2422 ph10 771 RRETURN(MATCH_NOMATCH);
2423 ph10 178 }
2424     ecode++;
2425     break;
2426    
2427     case OP_VSPACE:
2428 ph10 443 if (eptr >= md->end_subject)
2429 ph10 428 {
2430 ph10 443 SCHECK_PARTIAL();
2431 ph10 771 RRETURN(MATCH_NOMATCH);
2432 ph10 443 }
2433 ph10 178 GETCHARINCTEST(c, eptr);
2434     switch(c)
2435     {
2436 ph10 771 default: RRETURN(MATCH_NOMATCH);
2437 ph10 178 case 0x0a: /* LF */
2438     case 0x0b: /* VT */
2439     case 0x0c: /* FF */
2440     case 0x0d: /* CR */
2441     case 0x85: /* NEL */
2442     case 0x2028: /* LINE SEPARATOR */
2443     case 0x2029: /* PARAGRAPH SEPARATOR */
2444     break;
2445     }
2446     ecode++;
2447     break;
2448    
2449 nigel 77 #ifdef SUPPORT_UCP
2450     /* Check the next character by Unicode property. We will get here only
2451     if the support is in the binary; otherwise a compile-time error occurs. */
2452    
2453     case OP_PROP:
2454     case OP_NOTPROP:
2455 ph10 443 if (eptr >= md->end_subject)
2456 ph10 428 {
2457 ph10 443 SCHECK_PARTIAL();
2458 ph10 771 RRETURN(MATCH_NOMATCH);
2459 ph10 443 }
2460 nigel 77 GETCHARINCTEST(c, eptr);
2461     {
2462 ph10 384 const ucd_record *prop = GET_UCD(c);
2463 nigel 77
2464 nigel 87 switch(ecode[1])
2465     {
2466     case PT_ANY:
2467 ph10 771 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2468 nigel 87 break;
2469 nigel 77
2470 nigel 87 case PT_LAMP:
2471 ph10 349 if ((prop->chartype == ucp_Lu ||
2472     prop->chartype == ucp_Ll ||
2473     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2474 ph10 771 RRETURN(MATCH_NOMATCH);
2475 ph10 517 break;
2476 nigel 87
2477     case PT_GC:
2478 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2479 ph10 771 RRETURN(MATCH_NOMATCH);
2480 nigel 87 break;
2481    
2482     case PT_PC:
2483 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2484 ph10 771 RRETURN(MATCH_NOMATCH);
2485 nigel 87 break;
2486    
2487     case PT_SC:
2488 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2489 ph10 771 RRETURN(MATCH_NOMATCH);
2490 nigel 87 break;
2491 ph10 527
2492 ph10 517 /* These are specials */
2493 ph10 527
2494 ph10 517 case PT_ALNUM:
2495     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2496     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2497 ph10 771 RRETURN(MATCH_NOMATCH);
2498 ph10 527 break;
2499    
2500 ph10 517 case PT_SPACE: /* Perl space */
2501     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2502     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2503     == (op == OP_NOTPROP))
2504 ph10 771 RRETURN(MATCH_NOMATCH);
2505 ph10 527 break;
2506    
2507 ph10 517 case PT_PXSPACE: /* POSIX space */
2508     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2509 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2510 ph10 517 c == CHAR_FF || c == CHAR_CR)
2511     == (op == OP_NOTPROP))
2512 ph10 771 RRETURN(MATCH_NOMATCH);
2513 ph10 527 break;
2514 nigel 87
2515 ph10 527 case PT_WORD:
2516 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2517 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2518 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2519 ph10 771 RRETURN(MATCH_NOMATCH);
2520 ph10 527 break;
2521    
2522 ph10 517 /* This should never occur */
2523    
2524 nigel 87 default:
2525     RRETURN(PCRE_ERROR_INTERNAL);
2526 nigel 77 }
2527 nigel 87
2528     ecode += 3;
2529 nigel 77 }
2530     break;
2531    
2532     /* Match an extended Unicode sequence. We will get here only if the support
2533     is in the binary; otherwise a compile-time error occurs. */
2534    
2535     case OP_EXTUNI:
2536 ph10 443 if (eptr >= md->end_subject)
2537 ph10 428 {
2538 ph10 443 SCHECK_PARTIAL();
2539 ph10 771 RRETURN(MATCH_NOMATCH);
2540 ph10 443 }
2541 nigel 77 GETCHARINCTEST(c, eptr);
2542 ph10 771 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2543 ph10 623 while (eptr < md->end_subject)
2544 nigel 77 {
2545 ph10 623 int len = 1;
2546     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2547     if (UCD_CATEGORY(c) != ucp_M) break;
2548     eptr += len;
2549 nigel 77 }
2550     ecode++;
2551     break;
2552     #endif
2553    
2554    
2555     /* Match a back reference, possibly repeatedly. Look past the end of the
2556     item to see if there is repeat information following. The code is similar
2557     to that for character classes, but repeated for efficiency. Then obey
2558     similar code to character type repeats - written out again for speed.
2559     However, if the referenced string is the empty string, always treat
2560     it as matched, any number of times (otherwise there could be infinite
2561     loops). */
2562    
2563     case OP_REF:
2564 ph10 625 case OP_REFI:
2565     caseless = op == OP_REFI;
2566 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2567     ecode += 3;
2568 ph10 345
2569 ph10 595 /* If the reference is unset, there are two possibilities:
2570 ph10 345
2571 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2572     this ensures that every attempt at a match fails. We can't just fail
2573     here, because of the possibility of quantifiers with zero minima.
2574 ph10 345
2575 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2576     so that the back reference matches an empty string.
2577 ph10 345
2578 ph10 595 Otherwise, set the length to the length of what was matched by the
2579     referenced subpattern. */
2580 ph10 345
2581 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2582     length = (md->jscript_compat)? 0 : -1;
2583     else
2584     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2585 nigel 77
2586 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2587 nigel 77
2588 ph10 595 switch (*ecode)
2589     {
2590     case OP_CRSTAR:
2591     case OP_CRMINSTAR:
2592     case OP_CRPLUS:
2593     case OP_CRMINPLUS:
2594     case OP_CRQUERY:
2595     case OP_CRMINQUERY:
2596     c = *ecode++ - OP_CRSTAR;
2597     minimize = (c & 1) != 0;
2598     min = rep_min[c]; /* Pick up values from tables; */
2599     max = rep_max[c]; /* zero for max => infinity */
2600     if (max == 0) max = INT_MAX;
2601     break;
2602 nigel 77
2603 ph10 595 case OP_CRRANGE:
2604     case OP_CRMINRANGE:
2605     minimize = (*ecode == OP_CRMINRANGE);
2606     min = GET2(ecode, 1);
2607     max = GET2(ecode, 3);
2608     if (max == 0) max = INT_MAX;
2609     ecode += 5;
2610     break;
2611 nigel 77
2612 ph10 595 default: /* No repeat follows */
2613 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2614 ph10 595 {
2615     CHECK_PARTIAL();
2616 ph10 771 RRETURN(MATCH_NOMATCH);
2617 nigel 77 }
2618 ph10 595 eptr += length;
2619     continue; /* With the main loop */
2620     }
2621 nigel 77
2622 ph10 595 /* Handle repeated back references. If the length of the reference is
2623     zero, just continue with the main loop. */
2624 ph10 443
2625 ph10 595 if (length == 0) continue;
2626 nigel 77
2627 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2628     the length of the reference string explicitly rather than passing the
2629     address of eptr, so that eptr can be a register variable. */
2630 nigel 77
2631 ph10 595 for (i = 1; i <= min; i++)
2632     {
2633 ph10 625 int slength;
2634 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2635 nigel 77 {
2636 ph10 595 CHECK_PARTIAL();
2637 ph10 771 RRETURN(MATCH_NOMATCH);
2638 nigel 77 }
2639 ph10 595 eptr += slength;
2640     }
2641 nigel 77
2642 ph10 595 /* If min = max, continue at the same level without recursion.
2643     They are not both allowed to be zero. */
2644 nigel 77
2645 ph10 595 if (min == max) continue;
2646 nigel 77
2647 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2648 nigel 77
2649 ph10 595 if (minimize)
2650     {
2651     for (fi = min;; fi++)
2652 nigel 77 {
2653 ph10 625 int slength;
2654 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2655 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
2657 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2658 nigel 77 {
2659 ph10 595 CHECK_PARTIAL();
2660 ph10 771 RRETURN(MATCH_NOMATCH);
2661 nigel 77 }
2662 ph10 595 eptr += slength;
2663 nigel 77 }
2664 ph10 595 /* Control never gets here */
2665     }
2666 nigel 77
2667 ph10 595 /* If maximizing, find the longest string and work backwards */
2668 nigel 77
2669 ph10 595 else
2670     {
2671     pp = eptr;
2672     for (i = min; i < max; i++)
2673 nigel 77 {
2674 ph10 625 int slength;
2675 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2676 nigel 77 {
2677 ph10 595 CHECK_PARTIAL();
2678     break;
2679 nigel 77 }
2680 ph10 595 eptr += slength;
2681 nigel 77 }
2682 ph10 595 while (eptr >= pp)
2683     {
2684 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2685 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2686     eptr -= length;
2687     }
2688 ph10 771 RRETURN(MATCH_NOMATCH);
2689 nigel 77 }
2690     /* Control never gets here */
2691    
2692     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2693     used when all the characters in the class have values in the range 0-255,
2694     and either the matching is caseful, or the characters are in the range
2695     0-127 when UTF-8 processing is enabled. The only difference between
2696     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2697     encountered.
2698    
2699     First, look past the end of the item to see if there is repeat information
2700     following. Then obey similar code to character type repeats - written out
2701     again for speed. */
2702    
2703     case OP_NCLASS:
2704     case OP_CLASS:
2705     {
2706     data = ecode + 1; /* Save for matching */
2707     ecode += 33; /* Advance past the item */
2708    
2709     switch (*ecode)
2710     {
2711     case OP_CRSTAR:
2712     case OP_CRMINSTAR:
2713     case OP_CRPLUS:
2714     case OP_CRMINPLUS:
2715     case OP_CRQUERY:
2716     case OP_CRMINQUERY:
2717     c = *ecode++ - OP_CRSTAR;
2718     minimize = (c & 1) != 0;
2719     min = rep_min[c]; /* Pick up values from tables; */
2720     max = rep_max[c]; /* zero for max => infinity */
2721     if (max == 0) max = INT_MAX;
2722     break;
2723    
2724     case OP_CRRANGE:
2725     case OP_CRMINRANGE:
2726     minimize = (*ecode == OP_CRMINRANGE);
2727     min = GET2(ecode, 1);
2728     max = GET2(ecode, 3);
2729     if (max == 0) max = INT_MAX;
2730     ecode += 5;
2731     break;
2732    
2733     default: /* No repeat follows */
2734     min = max = 1;
2735     break;
2736     }
2737    
2738     /* First, ensure the minimum number of matches are present. */
2739    
2740     #ifdef SUPPORT_UTF8
2741     /* UTF-8 mode */
2742     if (utf8)
2743     {
2744     for (i = 1; i <= min; i++)
2745     {
2746 ph10 427 if (eptr >= md->end_subject)
2747 ph10 426 {
2748 ph10 428 SCHECK_PARTIAL();
2749 ph10 771 RRETURN(MATCH_NOMATCH);
2750 ph10 427 }
2751 nigel 77 GETCHARINC(c, eptr);
2752     if (c > 255)
2753     {
2754 ph10 771 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2755 nigel 77 }
2756     else
2757     {
2758 ph10 771 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2759 nigel 77 }
2760     }
2761     }
2762     else
2763     #endif
2764     /* Not UTF-8 mode */
2765     {
2766     for (i = 1; i <= min; i++)
2767     {
2768 ph10 427 if (eptr >= md->end_subject)
2769 ph10 426 {
2770 ph10 428 SCHECK_PARTIAL();
2771 ph10 771 RRETURN(MATCH_NOMATCH);
2772 ph10 427 }
2773 nigel 77 c = *eptr++;
2774 ph10 771 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2775 nigel 77 }
2776     }
2777    
2778     /* If max == min we can continue with the main loop without the
2779     need to recurse. */
2780    
2781     if (min == max) continue;
2782    
2783     /* If minimizing, keep testing the rest of the expression and advancing
2784     the pointer while it matches the class. */
2785    
2786     if (minimize)
2787     {
2788     #ifdef SUPPORT_UTF8
2789     /* UTF-8 mode */
2790     if (utf8)
2791     {
2792     for (fi = min;; fi++)
2793     {
2794 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2795 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
2797 ph10 427 if (eptr >= md->end_subject)
2798 ph10 426 {
2799 ph10 427 SCHECK_PARTIAL();
2800 ph10 771 RRETURN(MATCH_NOMATCH);
2801 ph10 427 }
2802 nigel 77 GETCHARINC(c, eptr);
2803     if (c > 255)
2804     {
2805 ph10 771 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2806 nigel 77 }
2807     else
2808     {
2809 ph10 771 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2810 nigel 77 }
2811     }
2812     }
2813     else
2814     #endif
2815     /* Not UTF-8 mode */
2816     {
2817     for (fi = min;; fi++)
2818     {
2819 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2820 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
2822 ph10 427 if (eptr >= md->end_subject)
2823 ph10 426 {
2824 ph10 427 SCHECK_PARTIAL();
2825 ph10 771 RRETURN(MATCH_NOMATCH);
2826 ph10 427 }
2827 nigel 77 c = *eptr++;
2828 ph10 771 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2829 nigel 77 }
2830     }
2831     /* Control never gets here */
2832     }
2833    
2834     /* If maximizing, find the longest possible run, then work backwards. */
2835    
2836     else
2837     {
2838     pp = eptr;
2839    
2840     #ifdef SUPPORT_UTF8
2841     /* UTF-8 mode */
2842     if (utf8)
2843     {
2844     for (i = min; i < max; i++)
2845     {
2846     int len = 1;
2847 ph10 463 if (eptr >= md->end_subject)
2848 ph10 462 {
2849 ph10 463 SCHECK_PARTIAL();
2850 ph10 462 break;
2851 ph10 463 }
2852 nigel 77 GETCHARLEN(c, eptr, len);
2853     if (c > 255)
2854     {
2855     if (op == OP_CLASS) break;
2856     }
2857     else
2858     {
2859     if ((data[c/8] & (1 << (c&7))) == 0) break;
2860     }
2861     eptr += len;
2862     }
2863     for (;;)
2864     {
2865 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2866 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2867     if (eptr-- == pp) break; /* Stop if tried at original pos */
2868     BACKCHAR(eptr);
2869     }
2870     }
2871     else
2872     #endif
2873     /* Not UTF-8 mode */
2874     {
2875     for (i = min; i < max; i++)
2876     {
2877 ph10 463 if (eptr >= md->end_subject)
2878 ph10 462 {
2879 ph10 463 SCHECK_PARTIAL();
2880 ph10 462 break;
2881 ph10 463 }
2882 nigel 77 c = *eptr;
2883     if ((data[c/8] & (1 << (c&7))) == 0) break;
2884     eptr++;
2885     }
2886     while (eptr >= pp)
2887     {
2888 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2889 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2890 nigel 77 eptr--;
2891     }
2892     }
2893    
2894 ph10 771 RRETURN(MATCH_NOMATCH);
2895 nigel 77 }
2896     }
2897     /* Control never gets here */
2898    
2899    
2900     /* Match an extended character class. This opcode is encountered only
2901 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2902     mode, because Unicode properties are supported in non-UTF-8 mode. */
2903 nigel 77
2904     #ifdef SUPPORT_UTF8
2905     case OP_XCLASS:
2906     {
2907     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2908     ecode += GET(ecode, 1); /* Advance past the item */
2909    
2910     switch (*ecode)
2911     {
2912     case OP_CRSTAR:
2913     case OP_CRMINSTAR:
2914     case OP_CRPLUS:
2915     case OP_CRMINPLUS:
2916     case OP_CRQUERY:
2917     case OP_CRMINQUERY:
2918     c = *ecode++ - OP_CRSTAR;
2919     minimize = (c & 1) != 0;
2920     min = rep_min[c]; /* Pick up values from tables; */
2921     max = rep_max[c]; /* zero for max => infinity */
2922     if (max == 0) max = INT_MAX;
2923     break;
2924    
2925     case OP_CRRANGE:
2926     case OP_CRMINRANGE:
2927     minimize = (*ecode == OP_CRMINRANGE);
2928     min = GET2(ecode, 1);
2929     max = GET2(ecode, 3);
2930     if (max == 0) max = INT_MAX;
2931     ecode += 5;
2932     break;
2933    
2934     default: /* No repeat follows */
2935     min = max = 1;
2936     break;
2937     }
2938    
2939     /* First, ensure the minimum number of matches are present. */
2940    
2941     for (i = 1; i <= min; i++)
2942     {
2943 ph10 427 if (eptr >= md->end_subject)
2944 ph10 426 {
2945     SCHECK_PARTIAL();
2946 ph10 771 RRETURN(MATCH_NOMATCH);
2947 ph10 427 }
2948 ph10 384 GETCHARINCTEST(c, eptr);
2949 ph10 771 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2950 nigel 77 }
2951    
2952     /* If max == min we can continue with the main loop without the
2953     need to recurse. */
2954    
2955     if (min == max) continue;
2956    
2957     /* If minimizing, keep testing the rest of the expression and advancing
2958     the pointer while it matches the class. */
2959    
2960     if (minimize)
2961     {
2962     for (fi = min;; fi++)
2963     {
2964 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2965 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2966 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
2967 ph10 427 if (eptr >= md->end_subject)
2968 ph10 426 {
2969 ph10 427 SCHECK_PARTIAL();
2970 ph10 771 RRETURN(MATCH_NOMATCH);
2971 ph10 427 }
2972 ph10 384 GETCHARINCTEST(c, eptr);
2973 ph10 771 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2974 nigel 77 }
2975     /* Control never gets here */
2976     }
2977    
2978     /* If maximizing, find the longest possible run, then work backwards. */
2979    
2980     else
2981     {
2982     pp = eptr;
2983     for (i = min; i < max; i++)
2984     {
2985     int len = 1;
2986 ph10 463 if (eptr >= md->end_subject)
2987 ph10 462 {
2988 ph10 463 SCHECK_PARTIAL();
2989 ph10 462 break;
2990 ph10 463 }
2991 ph10 384 GETCHARLENTEST(c, eptr, len);
2992 nigel 77 if (!_pcre_xclass(c, data)) break;
2993     eptr += len;
2994     }
2995     for(;;)
2996     {
2997 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2998 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999     if (eptr-- == pp) break; /* Stop if tried at original pos */
3000 ph10 214 if (utf8) BACKCHAR(eptr);
3001 nigel 77 }
3002 ph10 771 RRETURN(MATCH_NOMATCH);
3003 nigel 77 }
3004    
3005     /* Control never gets here */
3006     }
3007     #endif /* End of XCLASS */
3008    
3009     /* Match a single character, casefully */
3010    
3011     case OP_CHAR:
3012     #ifdef SUPPORT_UTF8
3013     if (utf8)
3014     {
3015     length = 1;
3016     ecode++;
3017     GETCHARLEN(fc, ecode, length);
3018 ph10 443 if (length > md->end_subject - eptr)
3019 ph10 428 {
3020     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3021 ph10 771 RRETURN(MATCH_NOMATCH);
3022 ph10 443 }
3023 ph10 771 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3024 nigel 77 }
3025     else
3026     #endif
3027    
3028     /* Non-UTF-8 mode */
3029     {
3030 ph10 443 if (md->end_subject - eptr < 1)
3031 ph10 428 {
3032     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3033 ph10 771 RRETURN(MATCH_NOMATCH);
3034 ph10 443 }
3035 ph10 771 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3036 nigel 77 ecode += 2;
3037     }
3038     break;
3039    
3040 ph10 778 /* Match a single character, caselessly. If we are at the end of the
3041     subject, give up immediately. */
3042 nigel 77
3043 ph10 602 case OP_CHARI:
3044 ph10 778 if (eptr >= md->end_subject)
3045     {
3046     SCHECK_PARTIAL();
3047     RRETURN(MATCH_NOMATCH);
3048     }
3049    
3050 nigel 77 #ifdef SUPPORT_UTF8
3051     if (utf8)
3052     {
3053     length = 1;
3054     ecode++;
3055     GETCHARLEN(fc, ecode, length);
3056 ph10 778
3057 nigel 77 /* If the pattern character's value is < 128, we have only one byte, and
3058 ph10 778 we know that its other case must also be one byte long, so we can use the
3059     fast lookup table. We know that there is at least one byte left in the
3060     subject. */
3061 nigel 77
3062     if (fc < 128)
3063     {
3064 ph10 771 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3065 nigel 77 }
3066    
3067 ph10 778 /* Otherwise we must pick up the subject character. Note that we cannot
3068     use the value of "length" to check for sufficient bytes left, because the
3069     other case of the character may have more or fewer bytes. */
3070 nigel 77
3071     else
3072     {
3073 nigel 93 unsigned int dc;
3074 nigel 77 GETCHARINC(dc, eptr);
3075     ecode += length;
3076    
3077     /* If we have Unicode property support, we can use it to test the other
3078 nigel 87 case of the character, if there is one. */
3079 nigel 77
3080     if (fc != dc)
3081     {
3082     #ifdef SUPPORT_UCP
3083 ph10 349 if (dc != UCD_OTHERCASE(fc))
3084 nigel 77 #endif
3085 ph10 771 RRETURN(MATCH_NOMATCH);
3086 nigel 77 }
3087     }
3088     }
3089     else
3090     #endif /* SUPPORT_UTF8 */
3091    
3092     /* Non-UTF-8 mode */
3093     {
3094 ph10 771 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3095 nigel 77 ecode += 2;
3096     }
3097     break;
3098    
3099 nigel 93 /* Match a single character repeatedly. */
3100 nigel 77
3101     case OP_EXACT:
3102 ph10 602 case OP_EXACTI:
3103 nigel 77 min = max = GET2(ecode, 1);
3104     ecode += 3;
3105     goto REPEATCHAR;
3106    
3107 nigel 93 case OP_POSUPTO:
3108 ph10 602 case OP_POSUPTOI:
3109 nigel 93 possessive = TRUE;
3110     /* Fall through */
3111    
3112 nigel 77 case OP_UPTO:
3113 ph10 602 case OP_UPTOI:
3114 nigel 77 case OP_MINUPTO:
3115 ph10 602 case OP_MINUPTOI:
3116 nigel 77 min = 0;
3117     max = GET2(ecode, 1);
3118 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3119 nigel 77 ecode += 3;
3120     goto REPEATCHAR;
3121    
3122 nigel 93 case OP_POSSTAR:
3123 ph10 602 case OP_POSSTARI:
3124 nigel 93 possessive = TRUE;
3125     min = 0;
3126     max = INT_MAX;
3127     ecode++;
3128     goto REPEATCHAR;
3129    
3130     case OP_POSPLUS:
3131 ph10 602 case OP_POSPLUSI:
3132 nigel 93 possessive = TRUE;
3133     min = 1;
3134     max = INT_MAX;
3135     ecode++;
3136     goto REPEATCHAR;
3137    
3138     case OP_POSQUERY:
3139 ph10 602 case OP_POSQUERYI:
3140 nigel 93 possessive = TRUE;
3141     min = 0;
3142     max = 1;
3143     ecode++;
3144     goto REPEATCHAR;
3145    
3146 nigel 77 case OP_STAR:
3147 ph10 602 case OP_STARI:
3148 nigel 77 case OP_MINSTAR:
3149 ph10 602 case OP_MINSTARI:
3150 nigel 77 case OP_PLUS:
3151 ph10 602 case OP_PLUSI:
3152 nigel 77 case OP_MINPLUS:
3153 ph10 602 case OP_MINPLUSI:
3154 nigel 77 case OP_QUERY:
3155 ph10 602 case OP_QUERYI:
3156 nigel 77 case OP_MINQUERY:
3157 ph10 602 case OP_MINQUERYI:
3158     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3159 nigel 77 minimize = (c & 1) != 0;
3160     min = rep_min[c]; /* Pick up values from tables; */
3161     max = rep_max[c]; /* zero for max => infinity */
3162     if (max == 0) max = INT_MAX;
3163    
3164 ph10 426 /* Common code for all repeated single-character matches. */
3165 nigel 77
3166     REPEATCHAR:
3167     #ifdef SUPPORT_UTF8
3168     if (utf8)
3169     {
3170     length = 1;
3171     charptr = ecode;
3172     GETCHARLEN(fc, ecode, length);
3173     ecode += length;
3174    
3175     /* Handle multibyte character matching specially here. There is
3176     support for caseless matching if UCP support is present. */
3177    
3178     if (length > 1)
3179     {
3180     #ifdef SUPPORT_UCP
3181 nigel 93 unsigned int othercase;
3182 ph10 602 if (op >= OP_STARI && /* Caseless */
3183 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3184 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3185 ph10 115 else oclength = 0;
3186 nigel 77 #endif /* SUPPORT_UCP */
3187    
3188     for (i = 1; i <= min; i++)
3189     {
3190 ph10 426 if (eptr <= md->end_subject - length &&
3191     memcmp(eptr, charptr, length) == 0) eptr += length;
3192 ph10 123 #ifdef SUPPORT_UCP
3193 ph10 426 else if (oclength > 0 &&
3194     eptr <= md->end_subject - oclength &&
3195     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3196     #endif /* SUPPORT_UCP */
3197 nigel 77 else
3198     {
3199 ph10 426 CHECK_PARTIAL();
3200 ph10 771 RRETURN(MATCH_NOMATCH);
3201 nigel 77 }
3202     }
3203    
3204     if (min == max) continue;
3205    
3206     if (minimize)
3207     {
3208     for (fi = min;; fi++)
3209     {
3210 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3211 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3212 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3213 ph10 426 if (eptr <= md->end_subject - length &&
3214     memcmp(eptr, charptr, length) == 0) eptr += length;
3215 ph10 123 #ifdef SUPPORT_UCP
3216 ph10 426 else if (oclength > 0 &&
3217     eptr <= md->end_subject - oclength &&
3218     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3219     #endif /* SUPPORT_UCP */
3220 nigel 77 else
3221     {
3222 ph10 426 CHECK_PARTIAL();
3223 ph10 771 RRETURN(MATCH_NOMATCH);
3224 nigel 77 }
3225     }
3226     /* Control never gets here */
3227     }
3228 nigel 93
3229     else /* Maximize */
3230 nigel 77 {
3231     pp = eptr;
3232     for (i = min; i < max; i++)
3233     {
3234 ph10 426 if (eptr <= md->end_subject - length &&
3235     memcmp(eptr, charptr, length) == 0) eptr += length;
3236 ph10 123 #ifdef SUPPORT_UCP
3237 ph10 426 else if (oclength > 0 &&
3238     eptr <= md->end_subject - oclength &&
3239     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3240     #endif /* SUPPORT_UCP */
3241 ph10 463 else
3242 ph10 462 {
3243 ph10 463 CHECK_PARTIAL();
3244 ph10 462 break;
3245 ph10 463 }
3246 nigel 77 }
3247 nigel 93
3248     if (possessive) continue;
3249 ph10 427
3250 ph10 120 for(;;)
3251 ph10 426 {
3252 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3253 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3254 ph10 771 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3255 ph10 115 #ifdef SUPPORT_UCP
3256 ph10 426 eptr--;
3257     BACKCHAR(eptr);
3258 ph10 123 #else /* without SUPPORT_UCP */
3259 ph10 426 eptr -= length;
3260 ph10 123 #endif /* SUPPORT_UCP */
3261 ph10 426 }
3262 nigel 77 }
3263     /* Control never gets here */
3264     }
3265    
3266     /* If the length of a UTF-8 character is 1, we fall through here, and
3267     obey the code as for non-UTF-8 characters below, though in this case the
3268     value of fc will always be < 128. */
3269     }
3270     else
3271     #endif /* SUPPORT_UTF8 */
3272    
3273     /* When not in UTF-8 mode, load a single-byte character. */
3274    
3275 ph10 426 fc = *ecode++;
3276 ph10 443
3277 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3278     may not be in UTF-8 mode. The code is duplicated for the caseless and
3279     caseful cases, for speed, since matching characters is likely to be quite
3280     common. First, ensure the minimum number of matches are present. If min =
3281     max, continue at the same level without recursing. Otherwise, if
3282     minimizing, keep trying the rest of the expression and advancing one
3283     matching character if failing, up to the maximum. Alternatively, if
3284     maximizing, find the maximum number of characters and work backwards. */
3285    
3286     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3287     max, eptr));
3288    
3289 ph10 602 if (op >= OP_STARI) /* Caseless */
3290 nigel 77 {
3291     fc = md->lcc[fc];
3292     for (i = 1; i <= min; i++)
3293 ph10 426 {
3294     if (eptr >= md->end_subject)
3295     {
3296     SCHECK_PARTIAL();
3297 ph10 771 RRETURN(MATCH_NOMATCH);
3298 ph10 426 }
3299 ph10 771 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3300 ph10 426 }
3301 nigel 77 if (min == max) continue;
3302     if (minimize)
3303     {
3304     for (fi = min;; fi++)
3305     {
3306 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3307 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3309 ph10 426 if (eptr >= md->end_subject)
3310     {
3311 ph10 427 SCHECK_PARTIAL();
3312 ph10 771 RRETURN(MATCH_NOMATCH);
3313 ph10 426 }
3314 ph10 771 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3315 nigel 77 }
3316     /* Control never gets here */
3317     }
3318 nigel 93 else /* Maximize */
3319 nigel 77 {
3320     pp = eptr;
3321     for (i = min; i < max; i++)
3322     {
3323 ph10 463 if (eptr >= md->end_subject)
3324 ph10 462 {
3325     SCHECK_PARTIAL();
3326     break;
3327 ph10 463 }
3328 ph10 462 if (fc != md->lcc[*eptr]) break;
3329 nigel 77 eptr++;
3330     }
3331 ph10 427
3332 nigel 93 if (possessive) continue;
3333 ph10 427
3334 nigel 77 while (eptr >= pp)
3335     {
3336 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3337 nigel 77 eptr--;
3338     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3339     }
3340 ph10 771 RRETURN(MATCH_NOMATCH);
3341 nigel 77 }
3342     /* Control never gets here */
3343     }
3344    
3345     /* Caseful comparisons (includes all multi-byte characters) */
3346    
3347     else
3348     {
3349 ph10 427 for (i = 1; i <= min; i++)
3350 ph10 426 {
3351     if (eptr >= md->end_subject)
3352     {
3353     SCHECK_PARTIAL();
3354 ph10 771 RRETURN(MATCH_NOMATCH);
3355 ph10 426 }
3356 ph10 771 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3357 ph10 427 }
3358 ph10 443
3359 nigel 77 if (min == max) continue;
3360 ph10 443
3361 nigel 77 if (minimize)
3362     {
3363     for (fi = min;; fi++)
3364     {
3365 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3366 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3368 ph10 426 if (eptr >= md->end_subject)
3369 ph10 427 {
3370 ph10 426 SCHECK_PARTIAL();
3371 ph10 771 RRETURN(MATCH_NOMATCH);
3372 ph10 427 }
3373 ph10 771 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3374 nigel 77 }
3375     /* Control never gets here */
3376     }
3377 nigel 93 else /* Maximize */
3378 nigel 77 {
3379     pp = eptr;
3380     for (i = min; i < max; i++)
3381     {
3382 ph10 463 if (eptr >= md->end_subject)
3383 ph10 462 {
3384 ph10 463 SCHECK_PARTIAL();
3385 ph10 462 break;
3386 ph10 463 }
3387 ph10 462 if (fc != *eptr) break;
3388 nigel 77 eptr++;
3389     }
3390 nigel 93 if (possessive) continue;
3391 ph10 443
3392 nigel 77 while (eptr >= pp)
3393     {
3394 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3395 nigel 77 eptr--;
3396     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397     }
3398 ph10 771 RRETURN(MATCH_NOMATCH);
3399 nigel 77 }
3400     }
3401     /* Control never gets here */
3402    
3403     /* Match a negated single one-byte character. The character we are
3404     checking can be multibyte. */
3405    
3406     case OP_NOT:
3407 ph10 625 case OP_NOTI:
3408 ph10 443 if (eptr >= md->end_subject)
3409 ph10 428 {
3410 ph10 443 SCHECK_PARTIAL();
3411 ph10 771 RRETURN(MATCH_NOMATCH);
3412 ph10 443 }
3413 nigel 77 ecode++;
3414     GETCHARINCTEST(c, eptr);
3415 ph10 602 if (op == OP_NOTI) /* The caseless case */
3416 nigel 77 {
3417     #ifdef SUPPORT_UTF8
3418     if (c < 256)
3419     #endif
3420     c = md->lcc[c];
3421 ph10 771 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
3422 nigel 77 }
3423 ph10 602 else /* Caseful */
3424 nigel 77 {
3425 ph10 771 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3426 nigel 77 }
3427     break;
3428    
3429     /* Match a negated single one-byte character repeatedly. This is almost a
3430     repeat of the code for a repeated single character, but I haven't found a
3431     nice way of commoning these up that doesn't require a test of the
3432     positive/negative option for each character match. Maybe that wouldn't add
3433     very much to the time taken, but character matching *is* what this is all
3434     about... */
3435    
3436     case OP_NOTEXACT:
3437 ph10 602 case OP_NOTEXACTI:
3438 nigel 77 min = max = GET2(ecode, 1);
3439     ecode += 3;
3440     goto REPEATNOTCHAR;
3441    
3442     case OP_NOTUPTO:
3443 ph10 602 case OP_NOTUPTOI:
3444 nigel 77 case OP_NOTMINUPTO:
3445 ph10 602 case OP_NOTMINUPTOI:
3446 nigel 77 min = 0;
3447     max = GET2(ecode, 1);
3448 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3449 nigel 77 ecode += 3;
3450     goto REPEATNOTCHAR;
3451    
3452 nigel 93 case OP_NOTPOSSTAR:
3453 ph10 602 case OP_NOTPOSSTARI:
3454 nigel 93 possessive = TRUE;
3455     min = 0;
3456     max = INT_MAX;
3457     ecode++;
3458     goto REPEATNOTCHAR;
3459    
3460     case OP_NOTPOSPLUS:
3461 ph10 602 case OP_NOTPOSPLUSI:
3462 nigel 93 possessive = TRUE;
3463     min = 1;
3464     max = INT_MAX;
3465     ecode++;
3466     goto REPEATNOTCHAR;
3467    
3468     case OP_NOTPOSQUERY:
3469 ph10 602 case OP_NOTPOSQUERYI:
3470 nigel 93 possessive = TRUE;
3471     min = 0;
3472     max = 1;
3473     ecode++;
3474     goto REPEATNOTCHAR;
3475    
3476     case OP_NOTPOSUPTO:
3477 ph10 602 case OP_NOTPOSUPTOI:
3478 nigel 93 possessive = TRUE;
3479     min = 0;
3480     max = GET2(ecode, 1);
3481     ecode += 3;
3482     goto REPEATNOTCHAR;
3483    
3484 nigel 77 case OP_NOTSTAR:
3485 ph10 602 case OP_NOTSTARI:
3486 nigel 77 case OP_NOTMINSTAR:
3487 ph10 602 case OP_NOTMINSTARI:
3488 nigel 77 case OP_NOTPLUS:
3489 ph10 602 case OP_NOTPLUSI:
3490 nigel 77 case OP_NOTMINPLUS:
3491 ph10 602 case OP_NOTMINPLUSI:
3492 nigel 77 case OP_NOTQUERY:
3493 ph10 602 case OP_NOTQUERYI:
3494 nigel 77 case OP_NOTMINQUERY:
3495 ph10 602 case OP_NOTMINQUERYI:
3496     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3497 nigel 77 minimize = (c & 1) != 0;
3498     min = rep_min[c]; /* Pick up values from tables; */
3499     max = rep_max[c]; /* zero for max => infinity */
3500     if (max == 0) max = INT_MAX;
3501    
3502 ph10 426 /* Common code for all repeated single-byte matches. */
3503 nigel 77
3504     REPEATNOTCHAR:
3505     fc = *ecode++;
3506    
3507     /* The code is duplicated for the caseless and caseful cases, for speed,
3508     since matching characters is likely to be quite common. First, ensure the
3509     minimum number of matches are present. If min = max, continue at the same
3510     level without recursing. Otherwise, if minimizing, keep trying the rest of
3511     the expression and advancing one matching character if failing, up to the
3512     maximum. Alternatively, if maximizing, find the maximum number of
3513     characters and work backwards. */
3514    
3515     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3516     max, eptr));
3517    
3518 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3519 nigel 77 {
3520     fc = md->lcc[fc];
3521    
3522     #ifdef SUPPORT_UTF8
3523     /* UTF-8 mode */
3524     if (utf8)
3525     {
3526 nigel 93 register unsigned int d;
3527 nigel 77 for (i = 1; i <= min; i++)
3528     {
3529 ph10 426 if (eptr >= md->end_subject)
3530     {
3531     SCHECK_PARTIAL();
3532 ph10 771 RRETURN(MATCH_NOMATCH);
3533 ph10 427 }
3534 nigel 77 GETCHARINC(d, eptr);
3535     if (d < 256) d = md->lcc[d];
3536 ph10 771 if (fc == d) RRETURN(MATCH_NOMATCH);
3537 nigel 77 }
3538     }
3539     else
3540     #endif
3541    
3542     /* Not UTF-8 mode */
3543     {
3544     for (i = 1; i <= min; i++)
3545 ph10 426 {
3546     if (eptr >= md->end_subject)
3547     {
3548     SCHECK_PARTIAL();
3549 ph10 771 RRETURN(MATCH_NOMATCH);
3550 ph10 427 }
3551 ph10 771 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3552 ph10 427 }
3553 nigel 77 }
3554    
3555     if (min == max) continue;
3556    
3557     if (minimize)
3558     {
3559     #ifdef SUPPORT_UTF8
3560     /* UTF-8 mode */
3561     if (utf8)
3562     {
3563 nigel 93 register unsigned int d;
3564 nigel 77 for (fi = min;; fi++)
3565     {
3566 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3567 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3568 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3569 ph10 427 if (eptr >= md->end_subject)
3570 ph10 426 {
3571 ph10 427 SCHECK_PARTIAL();
3572 ph10 771 RRETURN(MATCH_NOMATCH);
3573 ph10 427 }
3574 nigel 77 GETCHARINC(d, eptr);
3575     if (d < 256) d = md->lcc[d];
3576 ph10 771 if (fc == d) RRETURN(MATCH_NOMATCH);
3577 nigel 77 }
3578     }
3579     else
3580     #endif
3581     /* Not UTF-8 mode */
3582     {
3583     for (fi = min;; fi++)
3584     {
3585 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3586 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3588 ph10 426 if (eptr >= md->end_subject)
3589     {
3590     SCHECK_PARTIAL();
3591 ph10 771 RRETURN(MATCH_NOMATCH);
3592 ph10 426 }
3593 ph10 771 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3594 nigel 77 }
3595     }
3596     /* Control never gets here */
3597     }
3598    
3599     /* Maximize case */
3600    
3601     else
3602     {
3603     pp = eptr;
3604    
3605     #ifdef SUPPORT_UTF8
3606     /* UTF-8 mode */
3607     if (utf8)
3608     {
3609 nigel 93 register unsigned int d;
3610 nigel 77 for (i = min; i < max; i++)
3611     {
3612     int len = 1;
3613 ph10 463 if (eptr >= md->end_subject)
3614 ph10 462 {
3615 ph10 463 SCHECK_PARTIAL();
3616 ph10 462 break;
3617 ph10 463 }
3618 nigel 77 GETCHARLEN(d, eptr, len);
3619     if (d < 256) d = md->lcc[d];
3620     if (fc == d) break;
3621     eptr += len;
3622     }
3623 nigel 93 if (possessive) continue;
3624     for(;;)
3625 nigel 77 {
3626 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3627 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628     if (eptr-- == pp) break; /* Stop if tried at original pos */
3629     BACKCHAR(eptr);
3630     }
3631     }
3632     else
3633     #endif
3634     /* Not UTF-8 mode */
3635     {
3636     for (i = min; i < max; i++)
3637     {
3638 ph10 463 if (eptr >= md->end_subject)
3639 ph10 462 {
3640     SCHECK_PARTIAL();
3641     break;
3642 ph10 463 }
3643 ph10 462 if (fc == md->lcc[*eptr]) break;
3644 nigel 77 eptr++;
3645     }
3646 nigel 93 if (possessive) continue;
3647 nigel 77 while (eptr >= pp)
3648     {
3649 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3650 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651     eptr--;
3652     }
3653     }
3654    
3655 ph10 771 RRETURN(MATCH_NOMATCH);
3656 nigel 77 }
3657     /* Control never gets here */
3658     }
3659    
3660     /* Caseful comparisons */
3661    
3662     else
3663     {
3664     #ifdef SUPPORT_UTF8
3665     /* UTF-8 mode */
3666     if (utf8)
3667     {
3668 nigel 93 register unsigned int d;
3669 nigel 77 for (i = 1; i <= min; i++)
3670     {
3671 ph10 426 if (eptr >= md->end_subject)
3672     {
3673     SCHECK_PARTIAL();
3674 ph10 771 RRETURN(MATCH_NOMATCH);
3675 ph10 427 }
3676 nigel 77 GETCHARINC(d, eptr);
3677 ph10 771 if (fc == d) RRETURN(MATCH_NOMATCH);
3678 nigel 77 }
3679     }
3680     else
3681     #endif
3682     /* Not UTF-8 mode */
3683     {
3684     for (i = 1; i <= min; i++)
3685 ph10 426 {
3686     if (eptr >= md->end_subject)
3687     {
3688     SCHECK_PARTIAL();
3689 ph10 771 RRETURN(MATCH_NOMATCH);
3690 ph10 427 }
3691 ph10 771 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3692 ph10 427 }
3693 nigel 77 }
3694    
3695     if (min == max) continue;
3696    
3697     if (minimize)
3698     {
3699     #ifdef SUPPORT_UTF8
3700     /* UTF-8 mode */
3701     if (utf8)
3702     {
3703 nigel 93 register unsigned int d;
3704 nigel 77 for (fi = min;; fi++)
3705     {
3706 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3707 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3708 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3709 ph10 427 if (eptr >= md->end_subject)
3710 ph10 426 {
3711 ph10 427 SCHECK_PARTIAL();
3712 ph10 771 RRETURN(MATCH_NOMATCH);
3713 ph10 427 }
3714 nigel 77 GETCHARINC(d, eptr);
3715 ph10 771 if (fc == d) RRETURN(MATCH_NOMATCH);
3716 nigel 77 }
3717     }
3718     else
3719     #endif
3720     /* Not UTF-8 mode */
3721     {
3722     for (fi = min;; fi++)
3723     {
3724 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3725 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3726 ph10 771 if (fi >= max) RRETURN(MATCH_NOMATCH);
3727 ph10 426 if (eptr >= md->end_subject)
3728     {
3729     SCHECK_PARTIAL();
3730 ph10 771 RRETURN(MATCH_NOMATCH);
3731 ph10 427 }
3732 ph10 771 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3733 nigel 77 }
3734     }
3735     /* Control never gets here */
3736     }
3737    
3738     /* Maximize case */
3739    
3740     else
3741     {
3742     pp = eptr;
3743    
3744     #ifdef SUPPORT_UTF8
3745     /* UTF-8 mode */
3746     if (utf8)
3747     {
3748 nigel 93 register unsigned int d;
3749 nigel 77 for (i = min; i < max; i++)
3750     {
3751     int len = 1;
3752 ph10 463 if (eptr >= md->end_subject)
3753 ph10 462 {
3754 ph10 463 SCHECK_PARTIAL();
3755 ph10 462 break;
3756 ph10 463 }
3757 nigel 77 GETCHARLEN(d, eptr, len);
3758     if (fc == d) break;
3759     eptr += len;
3760     }
3761 nigel 93 if (possessive) continue;
3762 nigel 77 for(;;)
3763     {
3764 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);