/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 614 - (hide annotations) (download)
Sat Jul 9 10:48:16 2011 UTC (3 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 196987 byte(s)
Fix atomic group and assertion capturing problems.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 604 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62     as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79     #define MATCH_PRUNE (-996)
80     #define MATCH_SKIP (-995)
81     #define MATCH_SKIP_ARG (-994)
82     #define MATCH_THEN (-993)
83 ph10 210
84 ph10 510 /* This is a convenience macro for code that occurs many times. */
85    
/* MRRETURN(ra): store the current markptr in md->mark and then leave through
RRETURN(ra). The expansion is a bare { } block that refers to md, markptr and
RRETURN by name, so all three must be in scope wherever the macro is used. */
86     #define MRRETURN(ra) \
87     { \
88     md->mark = markptr; \
89     RRETURN(ra); \
90     }
91    
92 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
93     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94     because the offset vector is always a multiple of 3 long. */
95    
96     #define REC_STACK_SAVE_MAX 30
97    
98     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99    
100     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102    
103    
104    
105 ph10 475 #ifdef PCRE_DEBUG
106 nigel 77 /*************************************************
107     * Debugging function to print chars *
108     *************************************************/
109    
110     /* Print a sequence of chars in printable format, stopping at the end of the
111     subject if the requested.
112    
113     Arguments:
114     p points to characters
115     length number to print
116     is_subject TRUE if printing from within md->start_subject
117     md pointer to matching data block, if is_subject is TRUE
118    
119     Returns: nothing
120     */
121    
122     static void
123     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124     {
125 nigel 93 unsigned int c;
126 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127     while (length-- > 0)
128     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129     }
130     #endif
131    
132    
133    
134     /*************************************************
135     * Match a back-reference *
136     *************************************************/
137    
138 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
139     negative, so the match always fails. However, in JavaScript compatibility mode,
140     the length passed is zero. Note that in caseless UTF-8 mode, the number of
141     subject bytes matched may be different to the number of reference bytes.
142 nigel 77
143     Arguments:
144     offset index into the offset vector
145 ph10 595 eptr pointer into the subject
146     length length of reference to be matched (number of bytes)
147 nigel 77 md points to match data block
148 ph10 602 caseless TRUE if caseless
149 nigel 77
150 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 nigel 77 */
152    
153 ph10 595 static int
154 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 ph10 602 BOOL caseless)
156 nigel 77 {
157 ph10 595 USPTR eptr_start = eptr;
158     register USPTR p = md->start_subject + md->offset_vector[offset];
159 nigel 77
160 ph10 475 #ifdef PCRE_DEBUG
161 nigel 77 if (eptr >= md->end_subject)
162     printf("matching subject <null>");
163     else
164     {
165     printf("matching subject ");
166     pchars(eptr, length, TRUE, md);
167     }
168     printf(" against backref ");
169     pchars(p, length, FALSE, md);
170     printf("\n");
171     #endif
172    
173 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
174 nigel 77
175 ph10 595 if (length < 0) return -1;
176 nigel 77
177 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178     properly if Unicode properties are supported. Otherwise, we can check only
179     ASCII characters. */
180 nigel 77
181 ph10 602 if (caseless)
182 nigel 77 {
183 ph10 354 #ifdef SUPPORT_UTF8
184     #ifdef SUPPORT_UCP
185     if (md->utf8)
186     {
187 ph10 595 /* Match characters up to the end of the reference. NOTE: the number of
188     bytes matched may differ, because there are some characters whose upper and
189     lower case versions code as different numbers of bytes. For example, U+023A
190     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192     the latter. It is important, therefore, to check the length along the
193     reference, not along the subject (earlier code did this wrong). */
194    
195     USPTR endptr = p + length;
196     while (p < endptr)
197 ph10 354 {
198 ph10 358 int c, d;
199 ph10 597 if (eptr >= md->end_subject) return -1;
200 ph10 354 GETCHARINC(c, eptr);
201     GETCHARINC(d, p);
202 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 ph10 358 }
204     }
205 ph10 354 else
206     #endif
207     #endif
208    
209     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210     is no UCP support. */
211 ph10 597 {
212     if (eptr + length > md->end_subject) return -1;
213     while (length-- > 0)
214     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215     }
216 nigel 77 }
217 ph10 358
218 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
219     are in UTF-8 mode. */
220 ph10 358
221 nigel 77 else
222 ph10 597 {
223     if (eptr + length > md->end_subject) return -1;
224     while (length-- > 0) if (*p++ != *eptr++) return -1;
225     }
226 nigel 77
227 ph10 595 return eptr - eptr_start;
228 nigel 77 }
229    
230    
231    
232     /***************************************************************************
233     ****************************************************************************
234     RECURSION IN THE match() FUNCTION
235    
236 nigel 87 The match() function is highly recursive, though not every recursive call
237     increases the recursive depth. Nevertheless, some regular expressions can cause
238     it to recurse to a great depth. I was writing for Unix, so I just let it call
239     itself recursively. This uses the stack for saving everything that has to be
240     saved for a recursive call. On Unix, the stack can be large, and this works
241     fine.
242 nigel 77
243 nigel 87 It turns out that on some non-Unix-like systems there are problems with
244     programs that use a lot of stack. (This despite the fact that every last chip
245     has oodles of memory these days, and techniques for extending the stack have
246     been known for decades.) So....
247 nigel 77
248     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249     calls by keeping local variables that need to be preserved in blocks of memory
250 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
251 nigel 77 achieve this so that the actual code doesn't look very different to what it
252     always used to.
253 ph10 164
254 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
255 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
256     Switzer, the use of longjmp() has been abolished, at the cost of having to
257     provide a unique number for each call to RMATCH. There is no way of generating
258     a sequence of numbers at compile time in C. I have given them names, to make
259     them stand out more clearly.
260    
261     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
264     don't have indeterminate values; this has meant that the frame size can be
265 ph10 164 reduced because the result can be "passed back" by straight setting of the
266     variable instead of being passed in the frame.
267 nigel 77 ****************************************************************************
268     ***************************************************************************/
269    
270 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271     below must be updated in sync. */
272 nigel 77
/* One identifier per RMATCH call site; it is passed as the final ("rw")
argument of RMATCH. In the NO_RECURSE build, RMATCH stores it in the current
frame's Xwhere field and pastes it into a label name (L_RMn) that marks where
execution resumes; the recursive build ignores it. Numbering starts at 1. */
273 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 ph10 609 RM61, RM62, RM63};
280 ph10 164
281 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
282 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 ph10 501 actually used in this definition. */
284 nigel 77
285     #ifndef NO_RECURSE
286     #define REGISTER register
287 ph10 164
/* True-recursion forms of RMATCH and RRETURN.

RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra and rb as the first two
arguments, then the caller's mstart and markptr, then rc, rd, re, and finally
rdepth+1; the result is left in rrc. The rw argument does not appear in the
expansion. RRETURN(ra) is a plain "return ra". Both macros refer to rrc,
mstart, markptr and rdepth by name, so those must be in scope at the point of
use.

The PCRE_DEBUG forms do the same work but also print the source line number
around the call and on return. */
288 ph10 475 #ifdef PCRE_DEBUG
289 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 nigel 87 { \
291     printf("match() called in line %d\n", __LINE__); \
292 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 nigel 87 printf("to line %d\n", __LINE__); \
294     }
295     #define RRETURN(ra) \
296     { \
297     printf("match() returned %d from line %d ", ra, __LINE__); \
298     return ra; \
299     }
300     #else
/* Production forms: no tracing, and no enclosing braces. */
301 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
302     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 nigel 77 #define RRETURN(ra) return ra
304 nigel 87 #endif
305    
306 nigel 77 #else
307    
308    
309 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
310     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311     argument of match(), which never changes. */
312 nigel 77
313     #define REGISTER
314    
/* Heap-frame form of RMATCH(ra,rb,rc,rd,re,rw). In order, the expansion:

  1. obtains a new heapframe from pcre_stack_malloc; if that fails it leaves
     through RRETURN(PCRE_ERROR_NOMEMORY);
  2. records rw in the CURRENT frame's Xwhere field;
  3. fills in the new frame's argument fields: Xeptr=ra, Xecode=rb,
     Xoffset_top=rc, Xeptrb=re, with Xmstart and Xmarkptr taken from the
     mstart and markptr names in scope, and Xrdepth set to one more than the
     current frame's Xrdepth;
  4. links the new frame to the current one through Xprevframe and makes it
     the current frame;
  5. jumps to the HEAP_RECURSE label.

The label L_<rw> is defined immediately after the goto and is the point at
which execution resumes; the jump that targets it is not part of this macro.
The rd argument does not appear in the expansion. Because a label is
generated from rw, each rw value can be used at only one RMATCH site within a
function. */
315 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 nigel 77 {\
317 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 ph10 164 frame->Xwhere = rw; \
320     newframe->Xeptr = ra;\
321     newframe->Xecode = rb;\
322 ph10 168 newframe->Xmstart = mstart;\
323 ph10 501 newframe->Xmarkptr = markptr;\
324 ph10 164 newframe->Xoffset_top = rc;\
325 ph10 602 newframe->Xeptrb = re;\
326 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
327     newframe->Xprevframe = frame;\
328     frame = newframe;\
329     DPRINTF(("restarting from line %d\n", __LINE__));\
330     goto HEAP_RECURSE;\
331     L_##rw:\
332     DPRINTF(("jumped back to line %d\n", __LINE__));\
333 nigel 77 }
334    
/* Heap-frame form of RRETURN(ra): pop the current frame and release it with
pcre_stack_free. If a previous frame exists, it becomes the current frame, ra
is stored in rrc, and control jumps to the HEAP_RETURN label. If there is no
previous frame (Xprevframe was NULL), ra is returned from the enclosing
function. Note that ra is evaluated after the frame has been freed and
"frame" has been moved back to the previous frame. */
335     #define RRETURN(ra)\
336     {\
337 ph10 527 heapframe *oldframe = frame;\
338     frame = oldframe->Xprevframe;\
339     (pcre_stack_free)(oldframe);\
340 nigel 77 if (frame != NULL)\
341     {\
342 ph10 164 rrc = ra;\
343     goto HEAP_RETURN;\
344 nigel 77 }\
345     return ra;\
346     }
347    
348    
349     /* Structure for remembering the local variables in a private frame */
350    
/* One heapframe holds the state of one level of match() "recursion" in the
NO_RECURSE build. Frames are obtained with pcre_stack_malloc (by RMATCH and at
the top of match()) and released by RRETURN. Inside match(), each X-prefixed
field is reached through a #define of the same name without the X (for example
eptr expands to frame->Xeptr). */
351     typedef struct heapframe {
/* Link to the frame that was current when this one was created; the top-level
frame has NULL here. */
352     struct heapframe *Xprevframe;
353    
354     /* Function arguments that may change */
355    
356 ph10 409 USPTR Xeptr;
357 nigel 77 const uschar *Xecode;
358 ph10 409 USPTR Xmstart;
359 ph10 501 USPTR Xmarkptr;
360 nigel 77 int Xoffset_top;
361     eptrblock *Xeptrb;
362 nigel 91 unsigned int Xrdepth;
363 nigel 77
364     /* Function local variables */
365    
366 ph10 409 USPTR Xcallpat;
367 ph10 406 #ifdef SUPPORT_UTF8
368 ph10 409 USPTR Xcharptr;
369 ph10 406 #endif
370 ph10 409 USPTR Xdata;
371     USPTR Xnext;
372     USPTR Xpp;
373     USPTR Xprev;
374     USPTR Xsaved_eptr;
375 nigel 77
376     recursion_info Xnew_recursive;
377    
378     BOOL Xcur_is_word;
379     BOOL Xcondition;
380     BOOL Xprev_is_word;
381    
382     #ifdef SUPPORT_UCP
383     int Xprop_type;
384 nigel 87 int Xprop_value;
385 nigel 77 int Xprop_fail_result;
386     int Xprop_category;
387     int Xprop_chartype;
388 nigel 87 int Xprop_script;
389 ph10 123 int Xoclength;
390     uschar Xocchars[8];
391 nigel 77 #endif
392    
393 ph10 403 int Xcodelink;
394 nigel 77 int Xctype;
395 nigel 93 unsigned int Xfc;
396 nigel 77 int Xfi;
397     int Xlength;
398     int Xmax;
399     int Xmin;
400     int Xnumber;
401     int Xoffset;
402     int Xop;
403     int Xsave_capture_last;
404     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405     int Xstacksave[REC_STACK_SAVE_MAX];
406    
407     eptrblock Xnewptrb;
408    
409 ph10 164 /* Where to jump back to */
410 nigel 77
/* Set by RMATCH, in the frame that makes the call, to the RMn identifier of
the call site. */
411 ph10 164 int Xwhere;
412 ph10 165
413 nigel 77 } heapframe;
414    
415     #endif
416    
417    
418     /***************************************************************************
419     ***************************************************************************/
420    
421    
422    
423     /*************************************************
424     * Match from current position *
425     *************************************************/
426    
427 nigel 93 /* This function is called recursively in many circumstances. Whenever it
428 nigel 77 returns a negative (error) response, the outer incarnation must also return the
429 ph10 426 same response. */
430 nigel 77
431 ph10 426 /* These macros pack up tests that are used for partial matching, and which
432     appear several times in the code. We set the "hit end" flag if the pointer is
433     at the end of the subject and also past the start of the subject (i.e.
434 ph10 427 something has been matched). For hard partial matching, we then return
435     immediately. The second one is used when we already know we are past the end of
436     the subject. */
437 ph10 426
/* CHECK_PARTIAL(): acts only when md->partial is non-zero, eptr has reached
(or passed) md->end_subject, and eptr is beyond md->start_used_ptr. In that
case it sets md->hitend to TRUE and, if md->partial is greater than 1, leaves
through MRRETURN(PCRE_ERROR_PARTIAL). The expansion is an unbraced "if"
statement that refers to md and eptr by name. */
438     #define CHECK_PARTIAL()\
439 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
440     eptr > md->start_used_ptr) \
441     { \
442     md->hitend = TRUE; \
443     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 ph10 427 }
445 ph10 426
/* SCHECK_PARTIAL(): the same action as CHECK_PARTIAL but without the
eptr >= md->end_subject test, so the caller must already know that the end of
the subject has been reached. Acts when md->partial is non-zero and eptr is
beyond md->start_used_ptr: sets md->hitend to TRUE and, if md->partial is
greater than 1, leaves through MRRETURN(PCRE_ERROR_PARTIAL). */
446     #define SCHECK_PARTIAL()\
447 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
448     { \
449     md->hitend = TRUE; \
450     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 ph10 427 }
452 ph10 426
453 ph10 427
454 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
455     the md structure (e.g. utf8, end_subject) into individual variables to improve
456 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457     made performance worse.
458    
459     Arguments:
460 nigel 93 eptr pointer to current character in subject
461     ecode pointer to current position in compiled code
462 ph10 168 mstart pointer to the current match start position (can be modified
463 ph10 172 by encountering \K)
464 ph10 501 markptr pointer to the most recent MARK name, or NULL
465 nigel 77 offset_top current top pointer
466     md pointer to "static" info for the match
467     eptrb pointer to chain of blocks containing eptr at start of
468     brackets - for testing for empty matches
469 nigel 87 rdepth the recursion depth
470 nigel 77
471     Returns: MATCH_MATCH if matched ) these values are >= 0
472     MATCH_NOMATCH if failed to match )
473 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 nigel 87 (e.g. stopped by repeated call or recursion limit)
476 nigel 77 */
477    
478     static int
479 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 ph10 602 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 ph10 604 unsigned int rdepth)
482 nigel 77 {
483     /* These variables do not need to be preserved over recursion in this function,
484 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
485     "register" because they are used a lot in loops. */
486 nigel 77
487 nigel 91 register int rrc; /* Returns from recursive calls */
488     register int i; /* Used for loops not involving calls to RMATCH() */
489 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491 nigel 77
492 nigel 93 BOOL minimize, possessive; /* Quantifier options */
493 ph10 602 BOOL caseless;
494 ph10 403 int condcode;
495 nigel 93
496 nigel 77 /* When recursion is not being used, all "local" variables that have to be
497     preserved over calls to RMATCH() are part of a "frame" which is obtained from
498     heap storage. Set up the top-level frame here; others are obtained from the
499     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500    
501     #ifdef NO_RECURSE
502 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
505    
506     /* Copy in the original argument variables */
507    
508     frame->Xeptr = eptr;
509     frame->Xecode = ecode;
510 ph10 168 frame->Xmstart = mstart;
511 ph10 501 frame->Xmarkptr = markptr;
512 nigel 77 frame->Xoffset_top = offset_top;
513     frame->Xeptrb = eptrb;
514 nigel 87 frame->Xrdepth = rdepth;
515 nigel 77
516     /* This is where control jumps back to to effect "recursion" */
517    
518     HEAP_RECURSE:
519    
520     /* Macros make the argument variables come from the current frame */
521    
522     #define eptr frame->Xeptr
523     #define ecode frame->Xecode
524 ph10 168 #define mstart frame->Xmstart
525 ph10 501 #define markptr frame->Xmarkptr
526 nigel 77 #define offset_top frame->Xoffset_top
527     #define eptrb frame->Xeptrb
528 nigel 87 #define rdepth frame->Xrdepth
529 nigel 77
530     /* Ditto for the local variables */
531    
532     #ifdef SUPPORT_UTF8
533     #define charptr frame->Xcharptr
534     #endif
535     #define callpat frame->Xcallpat
536 ph10 403 #define codelink frame->Xcodelink
537 nigel 77 #define data frame->Xdata
538     #define next frame->Xnext
539     #define pp frame->Xpp
540     #define prev frame->Xprev
541     #define saved_eptr frame->Xsaved_eptr
542    
543     #define new_recursive frame->Xnew_recursive
544    
545     #define cur_is_word frame->Xcur_is_word
546     #define condition frame->Xcondition
547     #define prev_is_word frame->Xprev_is_word
548    
549     #ifdef SUPPORT_UCP
550     #define prop_type frame->Xprop_type
551 nigel 87 #define prop_value frame->Xprop_value
552 nigel 77 #define prop_fail_result frame->Xprop_fail_result
553     #define prop_category frame->Xprop_category
554     #define prop_chartype frame->Xprop_chartype
555 nigel 87 #define prop_script frame->Xprop_script
556 ph10 115 #define oclength frame->Xoclength
557     #define occhars frame->Xocchars
558 nigel 77 #endif
559    
560     #define ctype frame->Xctype
561     #define fc frame->Xfc
562     #define fi frame->Xfi
563     #define length frame->Xlength
564     #define max frame->Xmax
565     #define min frame->Xmin
566     #define number frame->Xnumber
567     #define offset frame->Xoffset
568     #define op frame->Xop
569     #define save_capture_last frame->Xsave_capture_last
570     #define save_offset1 frame->Xsave_offset1
571     #define save_offset2 frame->Xsave_offset2
572     #define save_offset3 frame->Xsave_offset3
573     #define stacksave frame->Xstacksave
574    
575     #define newptrb frame->Xnewptrb
576    
577     /* When recursion is being used, local variables are allocated on the stack and
578     get preserved during recursion in the normal way. In this environment, fi and
579     i, and fc and c, can be the same variables. */
580    
581 nigel 93 #else /* NO_RECURSE not defined */
582 nigel 77 #define fi i
583     #define fc c
584    
585 ph10 604 /* Many of the following variables are used only in small blocks of the code.
586     My normal style of coding would have declared them within each of those blocks.
587     However, in order to accommodate the version of this code that uses an external
588     "stack" implemented on the heap, it is easier to declare them all here, so the
589     declarations can be cut out in a block. The only declarations within blocks
590     below are for variables that do not have to be preserved over a recursive call
591     to RMATCH(). */
592 nigel 77
593 ph10 604 #ifdef SUPPORT_UTF8
594     const uschar *charptr;
595     #endif
596     const uschar *callpat;
597     const uschar *data;
598     const uschar *next;
599     USPTR pp;
600     const uschar *prev;
601     USPTR saved_eptr;
602    
603     recursion_info new_recursive;
604    
605     BOOL cur_is_word;
606 nigel 87 BOOL condition;
607 nigel 77 BOOL prev_is_word;
608    
609     #ifdef SUPPORT_UCP
610     int prop_type;
611 nigel 87 int prop_value;
612 nigel 77 int prop_fail_result;
613     int prop_category;
614     int prop_chartype;
615 nigel 87 int prop_script;
616 ph10 115 int oclength;
617     uschar occhars[8];
618 nigel 77 #endif
619    
620 ph10 399 int codelink;
621 nigel 77 int ctype;
622     int length;
623     int max;
624     int min;
625     int number;
626     int offset;
627     int op;
628     int save_capture_last;
629     int save_offset1, save_offset2, save_offset3;
630     int stacksave[REC_STACK_SAVE_MAX];
631    
632     eptrblock newptrb;
633 nigel 93 #endif /* NO_RECURSE */
634 nigel 77
635 ph10 604 /* To save space on the stack and in the heap frame, I have doubled up on some
636     of the local variables that are used only in localised parts of the code, but
637     still need to be preserved over recursive calls of match(). These macros define
638     the alternative names that are used. */
639    
640     #define allow_zero cur_is_word
641     #define cbegroup condition
642     #define code_offset codelink
643     #define condassert condition
644     #define matched_once prev_is_word
645    
646 nigel 77 /* These statements are here to stop the compiler complaining about unitialized
647     variables. */
648    
649     #ifdef SUPPORT_UCP
650 nigel 87 prop_value = 0;
651 nigel 77 prop_fail_result = 0;
652     #endif
653    
654 nigel 93
655 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
656     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657     used. Thanks to Ian Taylor for noticing this possibility and sending the
658     original patch. */
659    
660     TAIL_RECURSE:
661    
662 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
663     are specified by the macro RMATCH and RRETURN is used to return. When
664     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
667     complicated macro. It has to be used in one particular way. This shouldn't,
668     however, impact performance when true recursion is being used. */
669 nigel 77
670 ph10 164 #ifdef SUPPORT_UTF8
671     utf8 = md->utf8; /* Local copy of the flag */
672     #else
673     utf8 = FALSE;
674     #endif
675    
676 nigel 87 /* First check that we haven't called match() too many times, or that we
677     haven't exceeded the recursive call limit. */
678    
679 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681 nigel 77
682 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
683 ph10 604 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684     done this way to save having to use another function argument, which would take
685     up space on the stack. See also MATCH_CONDASSERT below.
686 nigel 77
687 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688     such remembered pointers, to be checked when we hit the closing ket, in order
689     to break infinite loops that match no characters. When match() is called in
690     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691     NOT be used with tail recursion, because the memory block that is used is on
692     the stack, so a new one may be required for each match(). */
693    
694     if (md->match_function_type == MATCH_CBEGROUP)
695 nigel 77 {
696 ph10 197 newptrb.epb_saved_eptr = eptr;
697     newptrb.epb_prev = eptrb;
698     eptrb = &newptrb;
699 ph10 604 md->match_function_type = 0;
700 nigel 77 }
701    
702 nigel 93 /* Now start processing the opcodes. */
703 nigel 77
704     for (;;)
705     {
706 nigel 93 minimize = possessive = FALSE;
707 nigel 77 op = *ecode;
708 ph10 604
709 nigel 93 switch(op)
710     {
711 ph10 510 case OP_MARK:
712     markptr = ecode + 2;
713     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 ph10 604 eptrb, RM55);
715 ph10 512
716     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717     argument, and we must check whether that argument matches this MARK's
718     argument. It is passed back in md->start_match_ptr (an overloading of that
719     variable). If it does match, we reset that variable to the current subject
720     position and return MATCH_SKIP. Otherwise, pass back the return code
721 ph10 510 unaltered. */
722 ph10 512
723     if (rrc == MATCH_SKIP_ARG &&
724 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725     {
726     md->start_match_ptr = eptr;
727     RRETURN(MATCH_SKIP);
728     }
729    
730 ph10 512 if (md->mark == NULL) md->mark = markptr;
731 ph10 510 RRETURN(rrc);
732    
733 ph10 210 case OP_FAIL:
734 ph10 510 MRRETURN(MATCH_NOMATCH);
735 ph10 211
736 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
737 ph10 553
738 ph10 510 case OP_COMMIT:
739     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 ph10 604 eptrb, RM52);
741 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743     rrc != MATCH_THEN)
744 ph10 551 RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_COMMIT);
746    
747 ph10 551 /* PRUNE overrides THEN */
748 ph10 553
749 ph10 210 case OP_PRUNE:
750     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ph10 604 eptrb, RM51);
752 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ph10 510 MRRETURN(MATCH_PRUNE);
754 ph10 211
755 ph10 510 case OP_PRUNE_ARG:
756     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ph10 604 eptrb, RM56);
758 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 ph10 510 md->mark = ecode + 2;
760     RRETURN(MATCH_PRUNE);
761 ph10 211
762 ph10 551 /* SKIP overrides PRUNE and THEN */
763 ph10 553
764 ph10 210 case OP_SKIP:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 ph10 604 eptrb, RM53);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
770 ph10 510 MRRETURN(MATCH_SKIP);
771 ph10 211
772 ph10 510 case OP_SKIP_ARG:
773     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 ph10 604 eptrb, RM57);
775 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 ph10 551 RRETURN(rrc);
777 ph10 512
778     /* Pass back the current skip name by overloading md->start_match_ptr and
779     returning the special MATCH_SKIP_ARG return code. This will either be
780     caught by a matching MARK, or get to the top, where it is treated the same
781 ph10 510 as PRUNE. */
782 ph10 512
783 ph10 510 md->start_match_ptr = ecode + 2;
784 ph10 512 RRETURN(MATCH_SKIP_ARG);
785 ph10 553
786 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 ph10 553 the alt that is at the start of the current branch. This makes it possible
788     to skip back past alternatives that precede the THEN within the current
789     branch. */
790 ph10 512
791 ph10 210 case OP_THEN:
792     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 ph10 604 eptrb, RM54);
794 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
796 ph10 510 MRRETURN(MATCH_THEN);
797    
798     case OP_THEN_ARG:
799 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 ph10 604 offset_top, md, eptrb, RM58);
801 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
803     md->mark = ecode + LINK_SIZE + 2;
804 ph10 212 RRETURN(MATCH_THEN);
805 ph10 211
806 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
807     unlimited repeat. If there is space in the offset vector, save the current
808     subject position in the working slot at the top of the vector. We mustn't
809     change the current values of the data slot, because they may be set from a
810     previous iteration of this group, and be referred to by a reference inside
811     the group. If we fail to match, we need to restore this value and also the
812 nigel 93 values of the final offsets, in case they were set by a previous iteration
813     of the same bracket.
814 nigel 77
815 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
816     a non-capturing bracket. Don't worry about setting the flag for the error
817     case here; that is handled in the code for KET. */
818 nigel 77
819 nigel 93 case OP_CBRA:
820     case OP_SCBRA:
821     number = GET2(ecode, 1+LINK_SIZE);
822 nigel 77 offset = number << 1;
823 ph10 604
824 ph10 475 #ifdef PCRE_DEBUG
825 nigel 93 printf("start bracket %d\n", number);
826     printf("subject=");
827 nigel 77 pchars(eptr, 16, TRUE, md);
828     printf("\n");
829     #endif
830    
831     if (offset < md->offset_max)
832     {
833     save_offset1 = md->offset_vector[offset];
834     save_offset2 = md->offset_vector[offset+1];
835     save_offset3 = md->offset_vector[md->offset_end - number];
836     save_capture_last = md->capture_last;
837    
838     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 ph10 531 md->offset_vector[md->offset_end - number] =
840 ph10 530 (int)(eptr - md->start_subject);
841 nigel 77
842 ph10 604 for (;;)
843 nigel 77 {
844 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846     eptrb, RM1);
847 ph10 550 if (rrc != MATCH_NOMATCH &&
848     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849     RRETURN(rrc);
850 ph10 614
851     /* If md->end_offset_top is greater than offset_top, it means that the
852     branch we have just failed to match did manage to match some capturing
853     parentheses within an atomic group or an assertion. Although offset_top
854     reverts to its original value at this level, we must unset the captured
855     values in case a later match sets a higher capturing number. Example:
856     matching /((?>(a))b|(a)c)/ against "ac". This captures 3, but we need
857     to ensure that 2 - which was captured in the atomic matching - is
858     unset. */
859    
860     if (md->end_offset_top > offset_top)
861     {
862     register int *iptr = md->offset_vector + offset_top;
863     register int *iend = md->offset_vector + md->end_offset_top;
864     while (iptr < iend) *iptr++ = -1;
865     }
866    
867 nigel 77 md->capture_last = save_capture_last;
868     ecode += GET(ecode, 1);
869 ph10 604 if (*ecode != OP_ALT) break;
870 nigel 77 }
871    
872     DPRINTF(("bracket %d failed\n", number));
873    
874     md->offset_vector[offset] = save_offset1;
875     md->offset_vector[offset+1] = save_offset2;
876     md->offset_vector[md->offset_end - number] = save_offset3;
877    
878 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
879 nigel 77 RRETURN(MATCH_NOMATCH);
880     }
881    
882 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
883     as a non-capturing bracket. */
884 nigel 77
885 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
886     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
887    
888 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
889 nigel 77
890 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
891     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
892    
893 ph10 604 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
894     for all the alternatives. When we get to the final alternative within the
895 ph10 609 brackets, we used to return the result of a recursive call to match()
896     whatever happened so it was possible to reduce stack usage by turning this
897     into a tail recursion, except in the case of a possibly empty group.
898     However, now that there is the possibility of (*THEN) occurring in the final
899     alternative, this optimization is no longer possible. */
900 nigel 77
901 nigel 93 case OP_BRA:
902     case OP_SBRA:
903     DPRINTF(("start non-capturing bracket\n"));
904 nigel 91 for (;;)
905 nigel 77 {
906 ph10 604 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 ph10 602 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 ph10 604 RM2);
909 ph10 550 if (rrc != MATCH_NOMATCH &&
910     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911     RRETURN(rrc);
912 ph10 614
913     /* See explanatory comment above under OP_CBRA. */
914    
915     if (md->end_offset_top > offset_top)
916     {
917     register int *iptr = md->offset_vector + offset_top;
918     register int *iend = md->offset_vector + md->end_offset_top;
919     while (iptr < iend) *iptr++ = -1;
920     }
921    
922 nigel 77 ecode += GET(ecode, 1);
923 ph10 609 if (*ecode != OP_ALT) break;
924 nigel 77 }
925    
926 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
927     RRETURN(MATCH_NOMATCH);
928    
929 ph10 604 /* Handle possessive capturing brackets with an unlimited repeat. We come
930     here from BRAZERO with allow_zero set TRUE. The offset_vector values are
931     handled similarly to the normal case above. However, the matching is
932     different. The end of these brackets will always be OP_KETRPOS, which
933     returns MATCH_KETRPOS without going further in the pattern. By this means
934     we can handle the group by iteration rather than recursion, thereby
935     reducing the amount of stack needed. */
936    
937     case OP_CBRAPOS:
938     case OP_SCBRAPOS:
939     allow_zero = FALSE;
940    
941     POSSESSIVE_CAPTURE:
942     number = GET2(ecode, 1+LINK_SIZE);
943     offset = number << 1;
944    
945     #ifdef PCRE_DEBUG
946     printf("start possessive bracket %d\n", number);
947     printf("subject=");
948     pchars(eptr, 16, TRUE, md);
949     printf("\n");
950     #endif
951    
952     if (offset < md->offset_max)
953     {
954     matched_once = FALSE;
955     code_offset = ecode - md->start_code;
956    
957     save_offset1 = md->offset_vector[offset];
958     save_offset2 = md->offset_vector[offset+1];
959     save_offset3 = md->offset_vector[md->offset_end - number];
960     save_capture_last = md->capture_last;
961    
962     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
963    
964     /* Each time round the loop, save the current subject position for use
965     when the group matches. For MATCH_MATCH, the group has matched, so we
966     restart it with a new subject starting position, remembering that we had
967     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
968     usual. If we haven't matched any alternatives in any iteration, check to
969     see if a previous iteration matched. If so, the group has matched;
970     continue from afterwards. Otherwise it has failed; restore the previous
971     capture values before returning NOMATCH. */
972    
973     for (;;)
974     {
975     md->offset_vector[md->offset_end - number] =
976     (int)(eptr - md->start_subject);
977     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
978     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
979     eptrb, RM63);
980     if (rrc == MATCH_KETRPOS)
981     {
982     offset_top = md->end_offset_top;
983     eptr = md->end_match_ptr;
984     ecode = md->start_code + code_offset;
985     save_capture_last = md->capture_last;
986     matched_once = TRUE;
987     continue;
988     }
989     if (rrc != MATCH_NOMATCH &&
990     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
991     RRETURN(rrc);
992 ph10 614
993     /* See explanatory comment above under OP_CBRA. */
994    
995     if (md->end_offset_top > offset_top)
996     {
997     register int *iptr = md->offset_vector + offset_top;
998     register int *iend = md->offset_vector + md->end_offset_top;
999     while (iptr < iend) *iptr++ = -1;
1000     }
1001    
1002 ph10 604 md->capture_last = save_capture_last;
1003     ecode += GET(ecode, 1);
1004     if (*ecode != OP_ALT) break;
1005     }
1006 ph10 610
1007 ph10 604 if (!matched_once)
1008     {
1009     md->offset_vector[offset] = save_offset1;
1010     md->offset_vector[offset+1] = save_offset2;
1011     md->offset_vector[md->offset_end - number] = save_offset3;
1012     }
1013    
1014 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1015 ph10 604 if (allow_zero || matched_once)
1016     {
1017     ecode += 1 + LINK_SIZE;
1018     break;
1019     }
1020    
1021     RRETURN(MATCH_NOMATCH);
1022     }
1023    
1024     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1025     as a non-capturing bracket. */
1026    
1027     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1028     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029    
1030     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1031    
1032     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034    
1035     /* Non-capturing possessive bracket with unlimited repeat. We come here
1036     from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1037     without the capturing complication. It is written out separately for speed
1038     and cleanliness. */
1039    
1040     case OP_BRAPOS:
1041     case OP_SBRAPOS:
1042     allow_zero = FALSE;
1043    
1044     POSSESSIVE_NON_CAPTURE:
1045     matched_once = FALSE;
1046     code_offset = ecode - md->start_code;
1047    
1048     for (;;)
1049     {
1050     if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1051     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1052 ph10 609 eptrb, RM48);
1053 ph10 604 if (rrc == MATCH_KETRPOS)
1054     {
1055 ph10 610 offset_top = md->end_offset_top;
1056 ph10 604 eptr = md->end_match_ptr;
1057     ecode = md->start_code + code_offset;
1058     matched_once = TRUE;
1059     continue;
1060     }
1061     if (rrc != MATCH_NOMATCH &&
1062     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1063     RRETURN(rrc);
1064 ph10 614
1065     /* See explanatory comment above under OP_CBRA. */
1066    
1067     if (md->end_offset_top > offset_top)
1068     {
1069     register int *iptr = md->offset_vector + offset_top;
1070     register int *iend = md->offset_vector + md->end_offset_top;
1071     while (iptr < iend) *iptr++ = -1;
1072     }
1073    
1074 ph10 604 ecode += GET(ecode, 1);
1075     if (*ecode != OP_ALT) break;
1076     }
1077 ph10 610
1078 ph10 604 if (matched_once || allow_zero)
1079     {
1080     ecode += 1 + LINK_SIZE;
1081     break;
1082     }
1083     RRETURN(MATCH_NOMATCH);
1084    
1085     /* Control never reaches here. */
1086    
1087 nigel 77 /* Conditional group: compilation checked that there are no more than
1088     two branches. If the condition is false, skipping the first branch takes us
1089     past the end if there is only one branch, but that's OK because that is
1090 ph10 609 exactly what going to the ket would do. */
1091 nigel 77
1092     case OP_COND:
1093 nigel 93 case OP_SCOND:
1094 ph10 604 codelink = GET(ecode, 1);
1095 ph10 406
1096 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1097     inserted between OP_COND and an assertion condition. */
1098 ph10 392
1099 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1100     {
1101     if (pcre_callout != NULL)
1102     {
1103     pcre_callout_block cb;
1104     cb.version = 1; /* Version 1 of the callout block */
1105     cb.callout_number = ecode[LINK_SIZE+2];
1106     cb.offset_vector = md->offset_vector;
1107     cb.subject = (PCRE_SPTR)md->start_subject;
1108 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1109     cb.start_match = (int)(mstart - md->start_subject);
1110     cb.current_position = (int)(eptr - md->start_subject);
1111 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1112     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1113     cb.capture_top = offset_top/2;
1114     cb.capture_last = md->capture_last;
1115     cb.callout_data = md->callout_data;
1116 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1117 ph10 381 if (rrc < 0) RRETURN(rrc);
1118     }
1119     ecode += _pcre_OP_lengths[OP_CALLOUT];
1120     }
1121 ph10 392
1122 ph10 399 condcode = ecode[LINK_SIZE+1];
1123 ph10 406
1124 ph10 381 /* Now see what the actual condition is */
1125 ph10 392
1126 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1127 nigel 77 {
1128 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1129     {
1130 ph10 461 condition = FALSE;
1131     ecode += GET(ecode, 1);
1132     }
1133 ph10 459 else
1134 ph10 461 {
1135 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1136     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1137 ph10 461
1138 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1139     false, but the test was set up by name, scan the table to see if the
1140     name refers to any other numbers, and test them. The condition is true
1141     if any one is set. */
1142 ph10 461
1143 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1144     {
1145     uschar *slotA = md->name_table;
1146     for (i = 0; i < md->name_count; i++)
1147 ph10 461 {
1148     if (GET2(slotA, 0) == recno) break;
1149 ph10 459 slotA += md->name_entry_size;
1150     }
1151 ph10 461
1152 ph10 459 /* Found a name for the number - there can be only one; duplicate
1153     names for different numbers are allowed, but not vice versa. First
1154     scan down for duplicates. */
1155 ph10 461
1156 ph10 459 if (i < md->name_count)
1157 ph10 461 {
1158 ph10 459 uschar *slotB = slotA;
1159     while (slotB > md->name_table)
1160     {
1161     slotB -= md->name_entry_size;
1162     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1163     {
1164     condition = GET2(slotB, 0) == md->recursive->group_num;
1165 ph10 461 if (condition) break;
1166     }
1167 ph10 459 else break;
1168 ph10 461 }
1169    
1170 ph10 459 /* Scan up for duplicates */
1171 ph10 461
1172 ph10 459 if (!condition)
1173 ph10 461 {
1174 ph10 459 slotB = slotA;
1175     for (i++; i < md->name_count; i++)
1176     {
1177     slotB += md->name_entry_size;
1178     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1179     {
1180     condition = GET2(slotB, 0) == md->recursive->group_num;
1181     if (condition) break;
1182 ph10 461 }
1183 ph10 459 else break;
1184 ph10 461 }
1185     }
1186 ph10 459 }
1187 ph10 461 }
1188    
1189 ph10 459 /* Choose branch according to the condition */
1190 ph10 461
1191 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1192     }
1193 ph10 461 }
1194 nigel 93
1195 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1196 nigel 93 {
1197 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1198 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1199 ph10 461
1200 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1201 ph10 461 scan the table to see if the name refers to any other numbers, and test
1202     them. The condition is true if any one is set. This is tediously similar
1203     to the code above, but not close enough to try to amalgamate. */
1204    
1205 ph10 459 if (!condition && condcode == OP_NCREF)
1206     {
1207 ph10 461 int refno = offset >> 1;
1208 ph10 459 uschar *slotA = md->name_table;
1209 ph10 461
1210 ph10 459 for (i = 0; i < md->name_count; i++)
1211 ph10 461 {
1212     if (GET2(slotA, 0) == refno) break;
1213 ph10 459 slotA += md->name_entry_size;
1214     }
1215 ph10 461
1216     /* Found a name for the number - there can be only one; duplicate names
1217     for different numbers are allowed, but not vice versa. First scan down
1218 ph10 459 for duplicates. */
1219 ph10 461
1220 ph10 459 if (i < md->name_count)
1221 ph10 461 {
1222 ph10 459 uschar *slotB = slotA;
1223     while (slotB > md->name_table)
1224     {
1225     slotB -= md->name_entry_size;
1226     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1227     {
1228     offset = GET2(slotB, 0) << 1;
1229 ph10 461 condition = offset < offset_top &&
1230 ph10 459 md->offset_vector[offset] >= 0;
1231 ph10 461 if (condition) break;
1232     }
1233 ph10 459 else break;
1234 ph10 461 }
1235    
1236 ph10 459 /* Scan up for duplicates */
1237 ph10 461
1238 ph10 459 if (!condition)
1239 ph10 461 {
1240 ph10 459 slotB = slotA;
1241     for (i++; i < md->name_count; i++)
1242     {
1243     slotB += md->name_entry_size;
1244     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1245     {
1246     offset = GET2(slotB, 0) << 1;
1247 ph10 461 condition = offset < offset_top &&
1248 ph10 459 md->offset_vector[offset] >= 0;
1249 ph10 461 if (condition) break;
1250     }
1251 ph10 459 else break;
1252 ph10 461 }
1253     }
1254 ph10 459 }
1255 ph10 461 }
1256    
1257 ph10 459 /* Choose branch according to the condition */
1258    
1259 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1260 nigel 77 }
1261    
1262 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1263 nigel 93 {
1264     condition = FALSE;
1265     ecode += GET(ecode, 1);
1266     }
1267    
1268 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1269 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1270     an assertion. */
1271 nigel 77
1272     else
1273     {
1274 ph10 604 md->match_function_type = MATCH_CONDASSERT;
1275     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1276 nigel 77 if (rrc == MATCH_MATCH)
1277     {
1278 nigel 93 condition = TRUE;
1279     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1280 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1281     }
1282 ph10 550 else if (rrc != MATCH_NOMATCH &&
1283     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1284 nigel 77 {
1285     RRETURN(rrc); /* Need braces because of following else */
1286     }
1287 nigel 93 else
1288     {
1289     condition = FALSE;
1290 ph10 399 ecode += codelink;
1291 nigel 93 }
1292     }
1293 nigel 91
1294 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1295 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1296     when there was unlimited repeat of a possibly empty group. However, that
1297     strategy no longer works because of the possibility of (*THEN) being
1298     encountered in the branch. A recursive call to match() is always required,
1299     unless the second alternative doesn't exist, in which case we can just
1300     plough on. */
1301 nigel 91
1302 nigel 93 if (condition || *ecode == OP_ALT)
1303     {
1304 ph10 609 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1305     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1306     if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1307     rrc = MATCH_NOMATCH;
1308     RRETURN(rrc);
1309 nigel 77 }
1310 ph10 395 else /* Condition false & no alternative */
1311 nigel 93 {
1312     ecode += 1 + LINK_SIZE;
1313     }
1314     break;
1315 nigel 77
1316 ph10 461
1317 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1318     to close any currently open capturing brackets. */
1319 ph10 461
1320 ph10 447 case OP_CLOSE:
1321 ph10 461 number = GET2(ecode, 1);
1322 ph10 447 offset = number << 1;
1323 ph10 461
1324 ph10 475 #ifdef PCRE_DEBUG
1325 ph10 447 printf("end bracket %d at *ACCEPT", number);
1326     printf("\n");
1327     #endif
1328 nigel 77
1329 ph10 447 md->capture_last = number;
1330     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1331     {
1332     md->offset_vector[offset] =
1333     md->offset_vector[md->offset_end - number];
1334 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1335 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1336     }
1337     ecode += 3;
1338 ph10 461 break;
1339 ph10 447
1340    
1341 ph10 608 /* End of the pattern, either real or forced. If we are in a recursion, we
1342     should restore the offsets appropriately, and if it's a top-level
1343     recursion, continue from after the call. */
1344 nigel 77
1345 ph10 210 case OP_ACCEPT:
1346 ph10 613 case OP_ASSERT_ACCEPT:
1347 nigel 77 case OP_END:
1348 ph10 608 if (md->recursive != NULL)
1349 nigel 77 {
1350     recursion_info *rec = md->recursive;
1351     md->recursive = rec->prevrec;
1352 ph10 608 memmove(md->offset_vector, rec->offset_save,
1353 nigel 77 rec->saved_max * sizeof(int));
1354 ph10 461 offset_top = rec->save_offset_top;
1355 ph10 608 if (rec->group_num == 0)
1356     {
1357     ecode = rec->after_call;
1358     break;
1359     }
1360 nigel 77 }
1361    
1362 ph10 613 /* Otherwise, if we have matched an empty string, fail if not in an
1363     assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1364     is set and we have matched at the start of the subject. In both cases,
1365     backtracking will then try other alternatives, if any. */
1366 ph10 443
1367 ph10 613 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1368 ph10 442 (md->notempty ||
1369 ph10 443 (md->notempty_atstart &&
1370 ph10 442 mstart == md->start_subject + md->start_offset)))
1371 ph10 510 MRRETURN(MATCH_NOMATCH);
1372 ph10 443
1373 ph10 442 /* Otherwise, we have a match. */
1374 ph10 608
1375 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1376     md->end_offset_top = offset_top; /* and how many extracts were taken */
1377 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1378 nigel 77
1379 ph10 512 /* For some reason, the macros don't work properly if an expression is
1380     given as the argument to MRRETURN when the heap is in use. */
1381    
1382     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1383     MRRETURN(rrc);
1384    
1385 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1386     matching won't pass the KET for an assertion. If any one branch matches,
1387     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1388     start of each branch to move the current point backwards, so the code at
1389 ph10 604 this level is identical to the lookahead case. When the assertion is part
1390     of a condition, we want to return immediately afterwards. The caller of
1391     this incarnation of the match() function will have set MATCH_CONDASSERT in
1392     md->match_function_type, and one of these opcodes will be the first opcode
1393     that is processed. We use a local variable that is preserved over calls to
1394     match() to remember this case. */
1395 nigel 77
1396     case OP_ASSERT:
1397     case OP_ASSERTBACK:
1398 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1399     {
1400     condassert = TRUE;
1401     md->match_function_type = 0;
1402     }
1403     else condassert = FALSE;
1404    
1405 nigel 77 do
1406     {
1407 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1408 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1409 ph10 500 {
1410     mstart = md->start_match_ptr; /* In case \K reset it */
1411     break;
1412 ph10 501 }
1413 ph10 550 if (rrc != MATCH_NOMATCH &&
1414     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1415     RRETURN(rrc);
1416 ph10 614
1417     /* See explanatory comment above under OP_CBRA. */
1418    
1419     if (md->end_offset_top > offset_top)
1420     {
1421     register int *iptr = md->offset_vector + offset_top;
1422     register int *iend = md->offset_vector + md->end_offset_top;
1423     while (iptr < iend) *iptr++ = -1;
1424     }
1425    
1426 nigel 77 ecode += GET(ecode, 1);
1427     }
1428     while (*ecode == OP_ALT);
1429 ph10 604
1430 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1431 nigel 77
1432     /* If checking an assertion for a condition, return MATCH_MATCH. */
1433    
1434 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1435 nigel 77
1436     /* Continue from after the assertion, updating the offsets high water
1437     mark, since extracts may have been taken during the assertion. */
1438    
1439     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1440     ecode += 1 + LINK_SIZE;
1441     offset_top = md->end_offset_top;
1442     continue;
1443    
1444 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1445 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1446 ph10 473 branches. */
1447 nigel 77
1448     case OP_ASSERT_NOT:
1449     case OP_ASSERTBACK_NOT:
1450 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1451     {
1452     condassert = TRUE;
1453     md->match_function_type = 0;
1454     }
1455     else condassert = FALSE;
1456    
1457 nigel 77 do
1458     {
1459 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1460 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1461 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1462     {
1463     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1464 ph10 482 break;
1465     }
1466 ph10 550 if (rrc != MATCH_NOMATCH &&
1467     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1468     RRETURN(rrc);
1469 nigel 77 ecode += GET(ecode,1);
1470     }
1471     while (*ecode == OP_ALT);
1472    
1473 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1474    
1475 nigel 77 ecode += 1 + LINK_SIZE;
1476     continue;
1477    
1478     /* Move the subject pointer back. This occurs only at the start of
1479     each branch of a lookbehind assertion. If we are too close to the start to
1480     move back, this match function fails. When working with UTF-8 we move
1481     back a number of characters, not bytes. */
1482    
1483     case OP_REVERSE:
1484     #ifdef SUPPORT_UTF8
1485     if (utf8)
1486     {
1487 nigel 93 i = GET(ecode, 1);
1488     while (i-- > 0)
1489 nigel 77 {
1490     eptr--;
1491 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1492 ph10 207 BACKCHAR(eptr);
1493 nigel 77 }
1494     }
1495     else
1496     #endif
1497    
1498     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1499    
1500     {
1501 nigel 93 eptr -= GET(ecode, 1);
1502 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1503 nigel 77 }
1504    
1505 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1506 nigel 77
1507 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1508 nigel 77 ecode += 1 + LINK_SIZE;
1509     break;
1510    
1511     /* The callout item calls an external function, if one is provided, passing
1512     details of the match so far. This is mainly for debugging, though the
1513     function is able to force a failure. */
1514    
1515     case OP_CALLOUT:
1516     if (pcre_callout != NULL)
1517     {
1518     pcre_callout_block cb;
1519     cb.version = 1; /* Version 1 of the callout block */
1520     cb.callout_number = ecode[1];
1521     cb.offset_vector = md->offset_vector;
1522 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1523 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1524     cb.start_match = (int)(mstart - md->start_subject);
1525     cb.current_position = (int)(eptr - md->start_subject);
1526 nigel 77 cb.pattern_position = GET(ecode, 2);
1527     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1528     cb.capture_top = offset_top/2;
1529     cb.capture_last = md->capture_last;
1530     cb.callout_data = md->callout_data;
1531 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1532 nigel 77 if (rrc < 0) RRETURN(rrc);
1533     }
1534     ecode += 2 + 2*LINK_SIZE;
1535     break;
1536    
1537     /* Recursion either matches the current regex, or some subexpression. The
1538     offset data is the offset to the starting bracket from the start of the
1539     whole pattern. (This is so that it works from duplicated subpatterns.)
1540    
1541     If there are any capturing brackets started but not finished, we have to
1542     save their starting points and reinstate them after the recursion. However,
1543     we don't know how many such there are (offset_top records the completed
1544     total) so we just have to save all the potential data. There may be up to
1545     65535 such values, which is too large to put on the stack, but using malloc
1546     for small numbers seems expensive. As a compromise, the stack is used when
1547     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1548     is used. If the malloc fails, the code below gives up and returns
1549     PCRE_ERROR_NOMEMORY rather than saving only part of the data on the
1550     stack.
1551    
1552     There are also other values that have to be saved. We use a chained
1553     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1554     for the original version of this logic. */
1555    
1556     case OP_RECURSE:
1557     {
1558     callpat = md->start_code + GET(ecode, 1);
1559 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1560     GET2(callpat, 1 + LINK_SIZE);
1561 nigel 77
1562     /* Add to "recursing stack" */
1563    
1564     new_recursive.prevrec = md->recursive;
1565     md->recursive = &new_recursive;
1566    
1567     /* Find where to continue from afterwards */
1568    
1569     ecode += 1 + LINK_SIZE;
1570     new_recursive.after_call = ecode;
1571    
1572     /* Now save the offset data. */
1573    
1574     new_recursive.saved_max = md->offset_end;
1575     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1576     new_recursive.offset_save = stacksave;
1577     else
1578     {
1579     new_recursive.offset_save =
1580     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1581     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1582     }
1583    
1584     memcpy(new_recursive.offset_save, md->offset_vector,
1585     new_recursive.saved_max * sizeof(int));
1586 ph10 461 new_recursive.save_offset_top = offset_top;
1587 ph10 608
1588 nigel 77 /* OK, now we can do the recursion. For each top-level alternative we
1589     restore the offset and recursion data. */
1590    
1591     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1592 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1593 nigel 77 do
1594     {
1595 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1596 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1597 ph10 604 md, eptrb, RM6);
1598 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1599 nigel 77 {
1600 nigel 87 DPRINTF(("Recursion matched\n"));
1601 nigel 77 md->recursive = new_recursive.prevrec;
1602     if (new_recursive.offset_save != stacksave)
1603     (pcre_free)(new_recursive.offset_save);
1604 ph10 510 MRRETURN(MATCH_MATCH);
1605 nigel 77 }
1606 ph10 550 else if (rrc != MATCH_NOMATCH &&
1607     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1608 nigel 87 {
1609     DPRINTF(("Recursion gave error %d\n", rrc));
1610 ph10 400 if (new_recursive.offset_save != stacksave)
1611     (pcre_free)(new_recursive.offset_save);
1612 nigel 87 RRETURN(rrc);
1613     }
1614 nigel 77
1615     md->recursive = &new_recursive;
1616     memcpy(md->offset_vector, new_recursive.offset_save,
1617     new_recursive.saved_max * sizeof(int));
1618     callpat += GET(callpat, 1);
1619     }
1620     while (*callpat == OP_ALT);
1621    
1622     DPRINTF(("Recursion didn't match\n"));
1623     md->recursive = new_recursive.prevrec;
1624     if (new_recursive.offset_save != stacksave)
1625     (pcre_free)(new_recursive.offset_save);
1626 ph10 510 MRRETURN(MATCH_NOMATCH);
1627 nigel 77 }
1628     /* Control never reaches here */
1629    
1630     /* "Once" brackets are like assertion brackets except that after a match,
1631     the point in the subject string is not moved back. Thus there can never be
1632     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1633     Check the alternative branches in turn - the matching won't pass the KET
1634     for this kind of subpattern. If any one branch matches, we carry on as at
1635 ph10 500 the end of a normal bracket, leaving the subject pointer, but resetting
1636     the start-of-match value in case it was changed by \K. */
1637 nigel 77
1638     case OP_ONCE:
1639 nigel 91 prev = ecode;
1640     saved_eptr = eptr;
1641    
1642     do
1643 nigel 77 {
1644 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1645 ph10 511 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1646 ph10 500 {
1647     mstart = md->start_match_ptr;
1648     break;
1649 ph10 501 }
1650 ph10 550 if (rrc != MATCH_NOMATCH &&
1651     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1652     RRETURN(rrc);
1653 ph10 614
1654     /* See explanatory comment above under OP_CBRA. */
1655    
1656     if (md->end_offset_top > offset_top)
1657     {
1658     register int *iptr = md->offset_vector + offset_top;
1659     register int *iend = md->offset_vector + md->end_offset_top;
1660     while (iptr < iend) *iptr++ = -1;
1661     }
1662    
1663 nigel 91 ecode += GET(ecode,1);
1664     }
1665     while (*ecode == OP_ALT);
1666 nigel 77
1667 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1668 nigel 77
1669 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1670 nigel 77
1671 ph10 614 /* Continue after the group, updating the offsets high water mark, since
1672     extracts may have been taken. */
1673 nigel 77
1674 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1675 nigel 77
1676 nigel 91 offset_top = md->end_offset_top;
1677     eptr = md->end_match_ptr;
1678 nigel 77
1679 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1680     happens for a repeating ket if no characters were matched in the group.
1681     This is the forcible breaking of infinite loops as implemented in Perl
1682     5.005. If there is an options reset, it will get obeyed in the normal
1683     course of events. */
1684 nigel 77
1685 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1686     {
1687     ecode += 1+LINK_SIZE;
1688     break;
1689     }
1690 nigel 77
1691 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1692     preceding bracket, in the appropriate order. The second "call" of match()
1693 ph10 602 uses tail recursion, to avoid using another stack frame. */
1694 nigel 77
1695 nigel 91 if (*ecode == OP_KETRMIN)
1696     {
1697 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1698 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1699     ecode = prev;
1700     goto TAIL_RECURSE;
1701 nigel 77 }
1702 nigel 91 else /* OP_KETRMAX */
1703     {
1704 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1705     RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1706 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1707     ecode += 1 + LINK_SIZE;
1708     goto TAIL_RECURSE;
1709     }
1710     /* Control never gets here */
1711 nigel 77
1712     /* An alternation is the end of a branch; scan along to find the end of the
1713     bracketed group and go to there. */
1714    
1715     case OP_ALT:
1716     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1717     break;
1718    
1719 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1720     indicating that it may occur zero times. It may repeat infinitely, or not
1721     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1722     with fixed upper repeat limits are compiled as a number of copies, with the
1723     optional ones preceded by BRAZERO or BRAMINZERO. */
1724 ph10 604
1725 nigel 77 case OP_BRAZERO:
1726 ph10 604 next = ecode + 1;
1727     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1728     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1729     do next += GET(next, 1); while (*next == OP_ALT);
1730     ecode = next + 1 + LINK_SIZE;
1731 nigel 77 break;
1732 ph10 604
1733 nigel 77 case OP_BRAMINZERO:
1734 ph10 604 next = ecode + 1;
1735     do next += GET(next, 1); while (*next == OP_ALT);
1736     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1737     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1738     ecode++;
1739 nigel 77 break;
1740    
1741 ph10 335 case OP_SKIPZERO:
1742 ph10 604 next = ecode+1;
1743     do next += GET(next,1); while (*next == OP_ALT);
1744     ecode = next + 1 + LINK_SIZE;
1745 ph10 335 break;
1746 ph10 604
1747     /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1748     here; just jump to the group, with allow_zero set TRUE. */
1749    
1750     case OP_BRAPOSZERO:
1751     op = *(++ecode);
1752     allow_zero = TRUE;
1753     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1754     goto POSSESSIVE_NON_CAPTURE;
1755 ph10 335
1756 nigel 93 /* End of a group, repeated or non-repeating. */
1757 nigel 77
1758     case OP_KET:
1759     case OP_KETRMIN:
1760     case OP_KETRMAX:
1761 ph10 604 case OP_KETRPOS:
1762 nigel 91 prev = ecode - GET(ecode, 1);
1763 nigel 77
1764 nigel 93 /* If this was a group that remembered the subject start, in order to break
1765     infinite repeats of empty string matches, retrieve the subject start from
1766     the chain. Otherwise, set it NULL. */
1767 nigel 77
1768 nigel 93 if (*prev >= OP_SBRA)
1769     {
1770     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1771     eptrb = eptrb->epb_prev; /* Backup to previous group */
1772     }
1773     else saved_eptr = NULL;
1774 nigel 77
1775 ph10 500 /* If we are at the end of an assertion group or an atomic group, stop
1776     matching and return MATCH_MATCH, but record the current high water mark for
1777     use by positive assertions. We also need to record the match start in case
1778     it was changed by \K. */
1779 nigel 93
1780 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1781     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1782     *prev == OP_ONCE)
1783     {
1784     md->end_match_ptr = eptr; /* For ONCE */
1785     md->end_offset_top = offset_top;
1786 ph10 500 md->start_match_ptr = mstart;
1787 ph10 510 MRRETURN(MATCH_MATCH);
1788 nigel 91 }
1789 nigel 77
1790 nigel 93 /* For capturing groups we have to check the group number back at the start
1791     and if necessary complete handling an extraction by setting the offsets and
1792     bumping the high water mark. Note that whole-pattern recursion is coded as
1793     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1794     when the OP_END is reached. Other recursion is handled here. */
1795 nigel 77
1796 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1797     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1798 nigel 91 {
1799 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1800 nigel 91 offset = number << 1;
1801 ph10 461
1802 ph10 475 #ifdef PCRE_DEBUG
1803 nigel 91 printf("end bracket %d", number);
1804     printf("\n");
1805 nigel 77 #endif
1806    
1807 nigel 93 md->capture_last = number;
1808     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1809 nigel 91 {
1810 nigel 93 md->offset_vector[offset] =
1811     md->offset_vector[md->offset_end - number];
1812 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1813 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1814     }
1815 nigel 77
1816 nigel 93 /* Handle a recursively called group. Restore the offsets
1817     appropriately and continue from after the call. */
1818 nigel 77
1819 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1820     {
1821     recursion_info *rec = md->recursive;
1822     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1823     md->recursive = rec->prevrec;
1824     memcpy(md->offset_vector, rec->offset_save,
1825     rec->saved_max * sizeof(int));
1826 ph10 461 offset_top = rec->save_offset_top;
1827 nigel 93 ecode = rec->after_call;
1828     break;
1829 nigel 77 }
1830 nigel 91 }
1831 nigel 77
1832 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1833     happens for a repeating ket if no characters were matched in the group.
1834     This is the forcible breaking of infinite loops as implemented in Perl
1835     5.005. If there is an options reset, it will get obeyed in the normal
1836     course of events. */
1837 nigel 77
1838 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1839     {
1840     ecode += 1 + LINK_SIZE;
1841     break;
1842     }
1843 ph10 604
1844     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1845     and return MATCH_KETRPOS. This makes it possible to do the repeats one
1846     at a time from the outer level, thus saving stack. */
1847    
1848     if (*ecode == OP_KETRPOS)
1849     {
1850     md->end_match_ptr = eptr;
1851     md->end_offset_top = offset_top;
1852     RRETURN(MATCH_KETRPOS);
1853     }
1854 nigel 77
1855 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1856     the preceding bracket, in the appropriate order. In the second case, we can
1857     use tail recursion to avoid using another stack frame, unless we have an
1858 ph10 197 unlimited repeat of a group that can match an empty string. */
1859 nigel 77
1860 nigel 91 if (*ecode == OP_KETRMIN)
1861     {
1862 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1863 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1865 ph10 197 {
1866 ph10 604 md->match_function_type = MATCH_CBEGROUP;
1867     RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1868 ph10 197 RRETURN(rrc);
1869     }
1870 nigel 91 ecode = prev;
1871     goto TAIL_RECURSE;
1872 nigel 77 }
1873 nigel 91 else /* OP_KETRMAX */
1874     {
1875 ph10 604 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1876     RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1877 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1878     ecode += 1 + LINK_SIZE;
1879     goto TAIL_RECURSE;
1880     }
1881     /* Control never gets here */
1882 nigel 77
1883 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1884 nigel 77
1885     case OP_CIRC:
1886 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1887 ph10 602
1888 nigel 77 /* Start of subject assertion (OP_CIRC falls through to here) */
1889    
1890     case OP_SOD:
1891 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1892 nigel 77 ecode++;
1893     break;
1894 ph10 602
1895     /* Multiline mode: start of subject unless notbol, or after any newline. */
1896 nigel 77
1897 ph10 602 case OP_CIRCM:
1898     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1899     if (eptr != md->start_subject &&
1900     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1901     MRRETURN(MATCH_NOMATCH);
1902     ecode++;
1903     break;
1904    
1905 nigel 77 /* Start of match assertion */
1906    
1907     case OP_SOM:
1908 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1909 nigel 77 ecode++;
1910     break;
1911 ph10 172
1912 ph10 168 /* Reset the start of match point */
1913 ph10 172
1914 ph10 168 case OP_SET_SOM:
1915     mstart = eptr;
1916 ph10 172 ecode++;
1917     break;
1918 nigel 77
1919 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1920     unless noteol is set. */
1921 nigel 77
1922 ph10 602 case OP_DOLLM:
1923     if (eptr < md->end_subject)
1924     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1925     else
1926 nigel 77 {
1927 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1928 ph10 602 SCHECK_PARTIAL();
1929 nigel 77 }
1930 ph10 602 ecode++;
1931     break;
1932 ph10 579
1933 ph10 602 /* Not multiline mode: assert before a terminating newline or before end of
1934     subject unless noteol is set. */
1935    
1936     case OP_DOLL:
1937     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1938     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1939    
1940 nigel 91 /* ... else fall through for endonly */
1941 nigel 77
1942     /* End of subject assertion (\z) */
1943    
1944     case OP_EOD:
1945 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1946 ph10 553 SCHECK_PARTIAL();
1947 nigel 77 ecode++;
1948     break;
1949    
1950     /* End of subject or ending \n assertion (\Z) */
1951    
1952     case OP_EODN:
1953 ph10 553 ASSERT_NL_OR_EOS:
1954     if (eptr < md->end_subject &&
1955 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1956 ph10 510 MRRETURN(MATCH_NOMATCH);
1957 ph10 579
1958 ph10 553 /* Either at end of string or \n before end. */
1959 ph10 579
1960 ph10 553 SCHECK_PARTIAL();
1961 nigel 77 ecode++;
1962     break;
1963    
1964     /* Word boundary assertions */
1965    
1966     case OP_NOT_WORD_BOUNDARY:
1967     case OP_WORD_BOUNDARY:
1968     {
1969    
1970     /* Find out if the previous and current characters are "word" characters.
1971     It takes a bit more work in UTF-8 mode. Unless Unicode properties are in
1972 ph10 443 use (md->use_ucp), characters > 255 are assumed to be "non-word"
1973 ph10 435 characters. Remember the earliest consulted character for partial matching. */
1974 nigel 77
1975     #ifdef SUPPORT_UTF8
1976     if (utf8)
1977     {
1978 ph10 518 /* Get status of previous character */
1979 ph10 527
1980 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1981     {
1982 ph10 409 USPTR lastptr = eptr - 1;
1983 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1984 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1985 nigel 77 GETCHAR(c, lastptr);
1986 ph10 527 #ifdef SUPPORT_UCP
1987 ph10 518 if (md->use_ucp)
1988     {
1989     if (c == '_') prev_is_word = TRUE; else
1990 ph10 527 {
1991 ph10 518 int cat = UCD_CATEGORY(c);
1992     prev_is_word = (cat == ucp_L || cat == ucp_N);
1993 ph10 527 }
1994     }
1995     else
1996     #endif
1997 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1998     }
1999 ph10 527
2000 ph10 518 /* Get status of next character */
2001 ph10 527
2002 ph10 443 if (eptr >= md->end_subject)
2003 nigel 77 {
2004 ph10 443 SCHECK_PARTIAL();
2005     cur_is_word = FALSE;
2006 ph10 428 }
2007     else
2008     {
2009 nigel 77 GETCHAR(c, eptr);
2010 ph10 527 #ifdef SUPPORT_UCP
2011 ph10 518 if (md->use_ucp)
2012     {
2013     if (c == '_') cur_is_word = TRUE; else
2014 ph10 527 {
2015 ph10 518 int cat = UCD_CATEGORY(c);
2016     cur_is_word = (cat == ucp_L || cat == ucp_N);
2017 ph10 527 }
2018     }
2019     else
2020     #endif
2021 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2022     }
2023     }
2024     else
2025     #endif
2026    
2027 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2028 ph10 518 consistency with the behaviour of \w we do use it in this case. */
2029 nigel 77
2030     {
2031 ph10 518 /* Get status of previous character */
2032 ph10 527
2033 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
2034     {
2035 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2036 ph10 527 #ifdef SUPPORT_UCP
2037 ph10 518 if (md->use_ucp)
2038     {
2039 ph10 527 c = eptr[-1];
2040 ph10 518 if (c == '_') prev_is_word = TRUE; else
2041 ph10 527 {
2042 ph10 518 int cat = UCD_CATEGORY(c);
2043     prev_is_word = (cat == ucp_L || cat == ucp_N);
2044 ph10 527 }
2045     }
2046     else
2047     #endif
2048 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2049     }
2050 ph10 527
2051 ph10 518 /* Get status of next character */
2052 ph10 527
2053 ph10 443 if (eptr >= md->end_subject)
2054 ph10 428 {
2055 ph10 443 SCHECK_PARTIAL();
2056     cur_is_word = FALSE;
2057 ph10 428 }
2058 ph10 527 else
2059     #ifdef SUPPORT_UCP
2060 ph10 518 if (md->use_ucp)
2061     {
2062 ph10 527 c = *eptr;
2063 ph10 518 if (c == '_') cur_is_word = TRUE; else
2064 ph10 527 {
2065 ph10 518 int cat = UCD_CATEGORY(c);
2066     cur_is_word = (cat == ucp_L || cat == ucp_N);
2067 ph10 527 }
2068     }
2069     else
2070     #endif
2071 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2072 nigel 77 }
2073    
2074     /* Now see if the situation is what we want */
2075    
2076     if ((*ecode++ == OP_WORD_BOUNDARY)?
2077     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2078 ph10 510 MRRETURN(MATCH_NOMATCH);
2079 nigel 77 }
2080     break;
2081    
2082     /* Match a single character type; inline for speed */
2083    
2084     case OP_ANY:
2085 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2086 ph10 345 /* Fall through */
2087    
2088 ph10 341 case OP_ALLANY:
2089 ph10 443 if (eptr++ >= md->end_subject)
2090 ph10 428 {
2091 ph10 443 SCHECK_PARTIAL();
2092 ph10 510 MRRETURN(MATCH_NOMATCH);
2093 ph10 443 }
2094 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2095 nigel 77 ecode++;
2096     break;
2097    
2098     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2099     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2100    
2101     case OP_ANYBYTE:
2102 ph10 443 if (eptr++ >= md->end_subject)
2103 ph10 428 {
2104 ph10 443 SCHECK_PARTIAL();
2105 ph10 510 MRRETURN(MATCH_NOMATCH);
2106 ph10 443 }
2107 nigel 77 ecode++;
2108     break;
2109    
2110     case OP_NOT_DIGIT:
2111 ph10 443 if (eptr >= md->end_subject)
2112 ph10 428 {
2113 ph10 443 SCHECK_PARTIAL();
2114 ph10 510 MRRETURN(MATCH_NOMATCH);
2115 ph10 443 }
2116 nigel 77 GETCHARINCTEST(c, eptr);
2117     if (
2118     #ifdef SUPPORT_UTF8
2119     c < 256 &&
2120     #endif
2121     (md->ctypes[c] & ctype_digit) != 0
2122     )
2123 ph10 510 MRRETURN(MATCH_NOMATCH);
2124 nigel 77 ecode++;
2125     break;
2126    
2127     case OP_DIGIT:
2128 ph10 443 if (eptr >= md->end_subject)
2129 ph10 428 {
2130 ph10 443 SCHECK_PARTIAL();
2131 ph10 510 MRRETURN(MATCH_NOMATCH);
2132 ph10 443 }
2133 nigel 77 GETCHARINCTEST(c, eptr);
2134     if (
2135     #ifdef SUPPORT_UTF8
2136     c >= 256 ||
2137     #endif
2138     (md->ctypes[c] & ctype_digit) == 0
2139     )
2140 ph10 510 MRRETURN(MATCH_NOMATCH);
2141 nigel 77 ecode++;
2142     break;
2143    
2144     case OP_NOT_WHITESPACE:
2145 ph10 443 if (eptr >= md->end_subject)
2146 ph10 428 {
2147 ph10 443 SCHECK_PARTIAL();
2148 ph10 510 MRRETURN(MATCH_NOMATCH);
2149 ph10 443 }
2150 nigel 77 GETCHARINCTEST(c, eptr);
2151     if (
2152     #ifdef SUPPORT_UTF8
2153     c < 256 &&
2154     #endif
2155     (md->ctypes[c] & ctype_space) != 0
2156     )
2157 ph10 510 MRRETURN(MATCH_NOMATCH);
2158 nigel 77 ecode++;
2159     break;
2160    
2161     case OP_WHITESPACE:
2162 ph10 443 if (eptr >= md->end_subject)
2163 ph10 428 {
2164 ph10 443 SCHECK_PARTIAL();
2165 ph10 510 MRRETURN(MATCH_NOMATCH);
2166 ph10 443 }
2167 nigel 77 GETCHARINCTEST(c, eptr);
2168     if (
2169     #ifdef SUPPORT_UTF8
2170     c >= 256 ||
2171     #endif
2172     (md->ctypes[c] & ctype_space) == 0
2173     )
2174 ph10 510 MRRETURN(MATCH_NOMATCH);
2175 nigel 77 ecode++;
2176     break;
2177    
2178     case OP_NOT_WORDCHAR:
2179 ph10 443 if (eptr >= md->end_subject)
2180 ph10 428 {
2181 ph10 443 SCHECK_PARTIAL();
2182 ph10 510 MRRETURN(MATCH_NOMATCH);
2183 ph10 443 }
2184 nigel 77 GETCHARINCTEST(c, eptr);
2185     if (
2186     #ifdef SUPPORT_UTF8
2187     c < 256 &&
2188     #endif
2189     (md->ctypes[c] & ctype_word) != 0
2190     )
2191 ph10 510 MRRETURN(MATCH_NOMATCH);
2192 nigel 77 ecode++;
2193     break;
2194    
2195     case OP_WORDCHAR:
2196 ph10 443 if (eptr >= md->end_subject)
2197 ph10 428 {
2198 ph10 443 SCHECK_PARTIAL();
2199 ph10 510 MRRETURN(MATCH_NOMATCH);
2200 ph10 443 }
2201 nigel 77 GETCHARINCTEST(c, eptr);
2202     if (
2203     #ifdef SUPPORT_UTF8
2204     c >= 256 ||
2205     #endif
2206     (md->ctypes[c] & ctype_word) == 0
2207     )
2208 ph10 510 MRRETURN(MATCH_NOMATCH);
2209 nigel 77 ecode++;
2210     break;
2211    
2212 nigel 93 case OP_ANYNL:
2213 ph10 443 if (eptr >= md->end_subject)
2214 ph10 428 {
2215 ph10 443 SCHECK_PARTIAL();
2216 ph10 510 MRRETURN(MATCH_NOMATCH);
2217 ph10 443 }
2218 nigel 93 GETCHARINCTEST(c, eptr);
2219     switch(c)
2220     {
2221 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2222 ph10 600
2223 nigel 93 case 0x000d:
2224     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2225     break;
2226 ph10 231
2227 nigel 93 case 0x000a:
2228 ph10 231 break;
2229    
2230 nigel 93 case 0x000b:
2231     case 0x000c:
2232     case 0x0085:
2233     case 0x2028:
2234     case 0x2029:
2235 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2236 nigel 93 break;
2237     }
2238     ecode++;
2239     break;
2240    
2241 ph10 178 case OP_NOT_HSPACE:
2242 ph10 443 if (eptr >= md->end_subject)
2243 ph10 428 {
2244 ph10 443 SCHECK_PARTIAL();
2245 ph10 510 MRRETURN(MATCH_NOMATCH);
2246 ph10 443 }
2247 ph10 178 GETCHARINCTEST(c, eptr);
2248     switch(c)
2249     {
2250     default: break;
2251     case 0x09: /* HT */
2252     case 0x20: /* SPACE */
2253     case 0xa0: /* NBSP */
2254     case 0x1680: /* OGHAM SPACE MARK */
2255     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2256     case 0x2000: /* EN QUAD */
2257     case 0x2001: /* EM QUAD */
2258     case 0x2002: /* EN SPACE */
2259     case 0x2003: /* EM SPACE */
2260     case 0x2004: /* THREE-PER-EM SPACE */
2261     case 0x2005: /* FOUR-PER-EM SPACE */
2262     case 0x2006: /* SIX-PER-EM SPACE */
2263     case 0x2007: /* FIGURE SPACE */
2264     case 0x2008: /* PUNCTUATION SPACE */
2265     case 0x2009: /* THIN SPACE */
2266     case 0x200A: /* HAIR SPACE */
2267     case 0x202f: /* NARROW NO-BREAK SPACE */
2268     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2269     case 0x3000: /* IDEOGRAPHIC SPACE */
2270 ph10 510 MRRETURN(MATCH_NOMATCH);
2271 ph10 178 }
2272     ecode++;
2273     break;
2274    
2275     case OP_HSPACE:
2276 ph10 443 if (eptr >= md->end_subject)
2277 ph10 428 {
2278 ph10 443 SCHECK_PARTIAL();
2279 ph10 510 MRRETURN(MATCH_NOMATCH);
2280 ph10 443 }
2281 ph10 178 GETCHARINCTEST(c, eptr);
2282     switch(c)
2283     {
2284 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2285 ph10 178 case 0x09: /* HT */
2286     case 0x20: /* SPACE */
2287     case 0xa0: /* NBSP */
2288     case 0x1680: /* OGHAM SPACE MARK */
2289     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2290     case 0x2000: /* EN QUAD */
2291     case 0x2001: /* EM QUAD */
2292     case 0x2002: /* EN SPACE */
2293     case 0x2003: /* EM SPACE */
2294     case 0x2004: /* THREE-PER-EM SPACE */
2295     case 0x2005: /* FOUR-PER-EM SPACE */
2296     case 0x2006: /* SIX-PER-EM SPACE */
2297     case 0x2007: /* FIGURE SPACE */
2298     case 0x2008: /* PUNCTUATION SPACE */
2299     case 0x2009: /* THIN SPACE */
2300     case 0x200A: /* HAIR SPACE */
2301     case 0x202f: /* NARROW NO-BREAK SPACE */
2302     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2303     case 0x3000: /* IDEOGRAPHIC SPACE */
2304     break;
2305     }
2306     ecode++;
2307     break;
2308    
2309     case OP_NOT_VSPACE:
2310 ph10 443 if (eptr >= md->end_subject)
2311 ph10 428 {
2312 ph10 443 SCHECK_PARTIAL();
2313 ph10 510 MRRETURN(MATCH_NOMATCH);
2314 ph10 443 }
2315 ph10 178 GETCHARINCTEST(c, eptr);
2316     switch(c)
2317     {
2318     default: break;
2319     case 0x0a: /* LF */
2320     case 0x0b: /* VT */
2321     case 0x0c: /* FF */
2322     case 0x0d: /* CR */
2323     case 0x85: /* NEL */
2324     case 0x2028: /* LINE SEPARATOR */
2325     case 0x2029: /* PARAGRAPH SEPARATOR */
2326 ph10 510 MRRETURN(MATCH_NOMATCH);
2327 ph10 178 }
2328     ecode++;
2329     break;
2330    
2331     case OP_VSPACE:
2332 ph10 443 if (eptr >= md->end_subject)
2333 ph10 428 {
2334 ph10 443 SCHECK_PARTIAL();
2335 ph10 510 MRRETURN(MATCH_NOMATCH);
2336 ph10 443 }
2337 ph10 178 GETCHARINCTEST(c, eptr);
2338     switch(c)
2339     {
2340 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2341 ph10 178 case 0x0a: /* LF */
2342     case 0x0b: /* VT */
2343     case 0x0c: /* FF */
2344     case 0x0d: /* CR */
2345     case 0x85: /* NEL */
2346     case 0x2028: /* LINE SEPARATOR */
2347     case 0x2029: /* PARAGRAPH SEPARATOR */
2348     break;
2349     }
2350     ecode++;
2351     break;
2352    
2353 nigel 77 #ifdef SUPPORT_UCP
2354     /* Check the next character by Unicode property. We will get here only
2355     if the support is in the binary; otherwise a compile-time error occurs. */
2356    
2357     case OP_PROP:
2358     case OP_NOTPROP:
2359 ph10 443 if (eptr >= md->end_subject)
2360 ph10 428 {
2361 ph10 443 SCHECK_PARTIAL();
2362 ph10 510 MRRETURN(MATCH_NOMATCH);
2363 ph10 443 }
2364 nigel 77 GETCHARINCTEST(c, eptr);
2365     {
2366 ph10 384 const ucd_record *prop = GET_UCD(c);
2367 nigel 77
2368 nigel 87 switch(ecode[1])
2369     {
2370     case PT_ANY:
2371 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2372 nigel 87 break;
2373 nigel 77
2374 nigel 87 case PT_LAMP:
2375 ph10 349 if ((prop->chartype == ucp_Lu ||
2376     prop->chartype == ucp_Ll ||
2377     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2378 ph10 510 MRRETURN(MATCH_NOMATCH);
2379 ph10 517 break;
2380 nigel 87
2381     case PT_GC:
2382 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2383 ph10 510 MRRETURN(MATCH_NOMATCH);
2384 nigel 87 break;
2385    
2386     case PT_PC:
2387 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2388 ph10 510 MRRETURN(MATCH_NOMATCH);
2389 nigel 87 break;
2390    
2391     case PT_SC:
2392 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2393 ph10 510 MRRETURN(MATCH_NOMATCH);
2394 nigel 87 break;
2395 ph10 527
2396 ph10 517 /* These are specials */
2397 ph10 527
2398 ph10 517 case PT_ALNUM:
2399     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2400     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2401     MRRETURN(MATCH_NOMATCH);
2402 ph10 527 break;
2403    
2404 ph10 517 case PT_SPACE: /* Perl space */
2405     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2406     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2407     == (op == OP_NOTPROP))
2408     MRRETURN(MATCH_NOMATCH);
2409 ph10 527 break;
2410    
2411 ph10 517 case PT_PXSPACE: /* POSIX space */
2412     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2413 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2414 ph10 517 c == CHAR_FF || c == CHAR_CR)
2415     == (op == OP_NOTPROP))
2416     MRRETURN(MATCH_NOMATCH);
2417 ph10 527 break;
2418 nigel 87
2419 ph10 527 case PT_WORD:
2420 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2421 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2422 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2423     MRRETURN(MATCH_NOMATCH);
2424 ph10 527 break;
2425    
2426 ph10 517 /* This should never occur */
2427    
2428 nigel 87 default:
2429     RRETURN(PCRE_ERROR_INTERNAL);
2430 nigel 77 }
2431 nigel 87
2432     ecode += 3;
2433 nigel 77 }
2434     break;
2435    
2436     /* Match an extended Unicode sequence. We will get here only if the support
2437     is in the binary; otherwise a compile-time error occurs. */
2438    
2439     case OP_EXTUNI:
2440 ph10 443 if (eptr >= md->end_subject)
2441 ph10 428 {
2442 ph10 443 SCHECK_PARTIAL();
2443 ph10 510 MRRETURN(MATCH_NOMATCH);
2444 ph10 443 }
2445 nigel 77 GETCHARINCTEST(c, eptr);
2446     {
2447 ph10 349 int category = UCD_CATEGORY(c);
2448 ph10 510 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2449 nigel 77 while (eptr < md->end_subject)
2450     {
2451     int len = 1;
2452     if (!utf8) c = *eptr; else
2453     {
2454     GETCHARLEN(c, eptr, len);
2455     }
2456 ph10 349 category = UCD_CATEGORY(c);
2457 nigel 77 if (category != ucp_M) break;
2458     eptr += len;
2459     }
2460     }
2461     ecode++;
2462     break;
2463     #endif
2464    
2465    
2466     /* Match a back reference, possibly repeatedly. Look past the end of the
2467     item to see if there is repeat information following. The code is similar
2468     to that for character classes, but repeated for efficiency. Then obey
2469     similar code to character type repeats - written out again for speed.
2470     However, if the referenced string is the empty string, always treat
2471     it as matched, any number of times (otherwise there could be infinite
2472     loops). */
2473    
2474     case OP_REF:
2475 ph10 602 case OP_REFI:
2476     caseless = op == OP_REFI;
2477 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2478     ecode += 3;
2479 ph10 345
2480 ph10 595 /* If the reference is unset, there are two possibilities:
2481 ph10 345
2482 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2483     this ensures that every attempt at a match fails. We can't just fail
2484     here, because of the possibility of quantifiers with zero minima.
2485 ph10 345
2486 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2487     so that the back reference matches an empty string.
2488 ph10 345
2489 ph10 595 Otherwise, set the length to the length of what was matched by the
2490     referenced subpattern. */
2491 ph10 345
2492 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2493     length = (md->jscript_compat)? 0 : -1;
2494     else
2495     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2496 nigel 77
2497 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2498 nigel 77
2499 ph10 595 switch (*ecode)
2500     {
2501     case OP_CRSTAR:
2502     case OP_CRMINSTAR:
2503     case OP_CRPLUS:
2504     case OP_CRMINPLUS:
2505     case OP_CRQUERY:
2506     case OP_CRMINQUERY:
2507     c = *ecode++ - OP_CRSTAR;
2508     minimize = (c & 1) != 0;
2509     min = rep_min[c]; /* Pick up values from tables; */
2510     max = rep_max[c]; /* zero for max => infinity */
2511     if (max == 0) max = INT_MAX;
2512     break;
2513 nigel 77
2514 ph10 595 case OP_CRRANGE:
2515     case OP_CRMINRANGE:
2516     minimize = (*ecode == OP_CRMINRANGE);
2517     min = GET2(ecode, 1);
2518     max = GET2(ecode, 3);
2519     if (max == 0) max = INT_MAX;
2520     ecode += 5;
2521     break;
2522 nigel 77
2523 ph10 595 default: /* No repeat follows */
2524 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2525 ph10 595 {
2526     CHECK_PARTIAL();
2527     MRRETURN(MATCH_NOMATCH);
2528 nigel 77 }
2529 ph10 595 eptr += length;
2530     continue; /* With the main loop */
2531     }
2532 nigel 77
2533 ph10 595 /* Handle repeated back references. If the length of the reference is
2534     zero, just continue with the main loop. */
2535 ph10 443
2536 ph10 595 if (length == 0) continue;
2537 nigel 77
2538 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2539     the length of the reference string explicitly rather than passing the
2540     address of eptr, so that eptr can be a register variable. */
2541 nigel 77
2542 ph10 595 for (i = 1; i <= min; i++)
2543     {
2544     int slength;
2545 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2546 nigel 77 {
2547 ph10 595 CHECK_PARTIAL();
2548     MRRETURN(MATCH_NOMATCH);
2549 nigel 77 }
2550 ph10 595 eptr += slength;
2551     }
2552 nigel 77
2553 ph10 595 /* If min = max, continue at the same level without recursion.
2554     They are not both allowed to be zero. */
2555 nigel 77
2556 ph10 595 if (min == max) continue;
2557 nigel 77
2558 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2559 nigel 77
2560 ph10 595 if (minimize)
2561     {
2562     for (fi = min;; fi++)
2563 nigel 77 {
2564 ph10 595 int slength;
2565 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2566 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2567     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2568 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2569 nigel 77 {
2570 ph10 595 CHECK_PARTIAL();
2571     MRRETURN(MATCH_NOMATCH);
2572 nigel 77 }
2573 ph10 595 eptr += slength;
2574 nigel 77 }
2575 ph10 595 /* Control never gets here */
2576     }
2577 nigel 77
2578 ph10 595 /* If maximizing, find the longest string and work backwards */
2579 nigel 77
2580 ph10 595 else
2581     {
2582     pp = eptr;
2583     for (i = min; i < max; i++)
2584 nigel 77 {
2585 ph10 595 int slength;
2586 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2587 nigel 77 {
2588 ph10 595 CHECK_PARTIAL();
2589     break;
2590 nigel 77 }
2591 ph10 595 eptr += slength;
2592 nigel 77 }
2593 ph10 595 while (eptr >= pp)
2594     {
2595 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2596 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2597     eptr -= length;
2598     }
2599     MRRETURN(MATCH_NOMATCH);
2600 nigel 77 }
2601     /* Control never gets here */
2602    
2603     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2604     used when all the characters in the class have values in the range 0-255,
2605     and either the matching is caseful, or the characters are in the range
2606     0-127 when UTF-8 processing is enabled. The only difference between
2607     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2608     encountered.
2609    
2610     First, look past the end of the item to see if there is repeat information
2611     following. Then obey similar code to character type repeats - written out
2612     again for speed. */
2613    
2614     case OP_NCLASS:
2615     case OP_CLASS:
2616     {
2617     data = ecode + 1; /* Save for matching */
2618     ecode += 33; /* Advance past the item */
2619    
2620     switch (*ecode)
2621     {
2622     case OP_CRSTAR:
2623     case OP_CRMINSTAR:
2624     case OP_CRPLUS:
2625     case OP_CRMINPLUS:
2626     case OP_CRQUERY:
2627     case OP_CRMINQUERY:
2628     c = *ecode++ - OP_CRSTAR;
2629     minimize = (c & 1) != 0;
2630     min = rep_min[c]; /* Pick up values from tables; */
2631     max = rep_max[c]; /* zero for max => infinity */
2632     if (max == 0) max = INT_MAX;
2633     break;
2634    
2635     case OP_CRRANGE:
2636     case OP_CRMINRANGE:
2637     minimize = (*ecode == OP_CRMINRANGE);
2638     min = GET2(ecode, 1);
2639     max = GET2(ecode, 3);
2640     if (max == 0) max = INT_MAX;
2641     ecode += 5;
2642     break;
2643    
2644     default: /* No repeat follows */
2645     min = max = 1;
2646     break;
2647     }
2648    
2649     /* First, ensure the minimum number of matches are present. */
2650    
2651     #ifdef SUPPORT_UTF8
2652     /* UTF-8 mode */
2653     if (utf8)
2654     {
2655     for (i = 1; i <= min; i++)
2656     {
2657 ph10 427 if (eptr >= md->end_subject)
2658 ph10 426 {
2659 ph10 428 SCHECK_PARTIAL();
2660 ph10 510 MRRETURN(MATCH_NOMATCH);
2661 ph10 427 }
2662 nigel 77 GETCHARINC(c, eptr);
2663     if (c > 255)
2664     {
2665 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2666 nigel 77 }
2667     else
2668     {
2669 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2670 nigel 77 }
2671     }
2672     }
2673     else
2674     #endif
2675     /* Not UTF-8 mode */
2676     {
2677     for (i = 1; i <= min; i++)
2678     {
2679 ph10 427 if (eptr >= md->end_subject)
2680 ph10 426 {
2681 ph10 428 SCHECK_PARTIAL();
2682 ph10 510 MRRETURN(MATCH_NOMATCH);
2683 ph10 427 }
2684 nigel 77 c = *eptr++;
2685 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2686 nigel 77 }
2687     }
2688    
2689     /* If max == min we can continue with the main loop without the
2690     need to recurse. */
2691    
2692     if (min == max) continue;
2693    
2694     /* If minimizing, keep testing the rest of the expression and advancing
2695     the pointer while it matches the class. */
2696    
2697     if (minimize)
2698     {
2699     #ifdef SUPPORT_UTF8
2700     /* UTF-8 mode */
2701     if (utf8)
2702     {
2703     for (fi = min;; fi++)
2704     {
2705 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2706 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2707 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2708 ph10 427 if (eptr >= md->end_subject)
2709 ph10 426 {
2710 ph10 427 SCHECK_PARTIAL();
2711 ph10 510 MRRETURN(MATCH_NOMATCH);
2712 ph10 427 }
2713 nigel 77 GETCHARINC(c, eptr);
2714     if (c > 255)
2715     {
2716 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2717 nigel 77 }
2718     else
2719     {
2720 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2721 nigel 77 }
2722     }
2723     }
2724     else
2725     #endif
2726     /* Not UTF-8 mode */
2727     {
2728     for (fi = min;; fi++)
2729     {
2730 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2731 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2732 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2733 ph10 427 if (eptr >= md->end_subject)
2734 ph10 426 {
2735 ph10 427 SCHECK_PARTIAL();
2736 ph10 510 MRRETURN(MATCH_NOMATCH);
2737 ph10 427 }
2738 nigel 77 c = *eptr++;
2739 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2740 nigel 77 }
2741     }
2742     /* Control never gets here */
2743     }
2744    
2745     /* If maximizing, find the longest possible run, then work backwards. */
2746    
2747     else
2748     {
2749     pp = eptr;
2750    
2751     #ifdef SUPPORT_UTF8
2752     /* UTF-8 mode */
2753     if (utf8)
2754     {
2755     for (i = min; i < max; i++)
2756     {
2757     int len = 1;
2758 ph10 463 if (eptr >= md->end_subject)
2759 ph10 462 {
2760 ph10 463 SCHECK_PARTIAL();
2761 ph10 462 break;
2762 ph10 463 }
2763 nigel 77 GETCHARLEN(c, eptr, len);
2764     if (c > 255)
2765     {
2766     if (op == OP_CLASS) break;
2767     }
2768     else
2769     {
2770     if ((data[c/8] & (1 << (c&7))) == 0) break;
2771     }
2772     eptr += len;
2773     }
2774     for (;;)
2775     {
2776 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2777 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2778     if (eptr-- == pp) break; /* Stop if tried at original pos */
2779     BACKCHAR(eptr);
2780     }
2781     }
2782     else
2783     #endif
2784     /* Not UTF-8 mode */
2785     {
2786     for (i = min; i < max; i++)
2787     {
2788 ph10 463 if (eptr >= md->end_subject)
2789 ph10 462 {
2790 ph10 463 SCHECK_PARTIAL();
2791 ph10 462 break;
2792 ph10 463 }
2793 nigel 77 c = *eptr;
2794     if ((data[c/8] & (1 << (c&7))) == 0) break;
2795     eptr++;
2796     }
2797     while (eptr >= pp)
2798     {
2799 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2800 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801 nigel 77 eptr--;
2802     }
2803     }
2804    
2805 ph10 510 MRRETURN(MATCH_NOMATCH);
2806 nigel 77 }
2807     }
2808     /* Control never gets here */
2809    
2810    
2811     /* Match an extended character class. This opcode is encountered only
2812 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2813     mode, because Unicode properties are supported in non-UTF-8 mode. */
2814 nigel 77
2815     #ifdef SUPPORT_UTF8
2816     case OP_XCLASS:
2817     {
2818     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2819     ecode += GET(ecode, 1); /* Advance past the item */
2820    
2821     switch (*ecode)
2822     {
2823     case OP_CRSTAR:
2824     case OP_CRMINSTAR:
2825     case OP_CRPLUS:
2826     case OP_CRMINPLUS:
2827     case OP_CRQUERY:
2828     case OP_CRMINQUERY:
2829     c = *ecode++ - OP_CRSTAR;
2830     minimize = (c & 1) != 0;
2831     min = rep_min[c]; /* Pick up values from tables; */
2832     max = rep_max[c]; /* zero for max => infinity */
2833     if (max == 0) max = INT_MAX;
2834     break;
2835    
2836     case OP_CRRANGE:
2837     case OP_CRMINRANGE:
2838     minimize = (*ecode == OP_CRMINRANGE);
2839     min = GET2(ecode, 1);
2840     max = GET2(ecode, 3);
2841     if (max == 0) max = INT_MAX;
2842     ecode += 5;
2843     break;
2844    
2845     default: /* No repeat follows */
2846     min = max = 1;
2847     break;
2848     }
2849    
2850     /* First, ensure the minimum number of matches are present. */
2851    
2852     for (i = 1; i <= min; i++)
2853     {
2854 ph10 427 if (eptr >= md->end_subject)
2855 ph10 426 {
2856     SCHECK_PARTIAL();
2857 ph10 510 MRRETURN(MATCH_NOMATCH);
2858 ph10 427 }
2859 ph10 384 GETCHARINCTEST(c, eptr);
2860 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2861 nigel 77 }
2862    
2863     /* If max == min we can continue with the main loop without the
2864     need to recurse. */
2865    
2866     if (min == max) continue;
2867    
2868     /* If minimizing, keep testing the rest of the expression and advancing
2869     the pointer while it matches the class. */
2870    
2871     if (minimize)
2872     {
2873     for (fi = min;; fi++)
2874     {
2875 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2876 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2877 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2878 ph10 427 if (eptr >= md->end_subject)
2879 ph10 426 {
2880 ph10 427 SCHECK_PARTIAL();
2881 ph10 510 MRRETURN(MATCH_NOMATCH);
2882 ph10 427 }
2883 ph10 384 GETCHARINCTEST(c, eptr);
2884 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2885 nigel 77 }
2886     /* Control never gets here */
2887     }
2888    
2889     /* If maximizing, find the longest possible run, then work backwards. */
2890    
2891     else
2892     {
2893     pp = eptr;
2894     for (i = min; i < max; i++)
2895     {
2896     int len = 1;
2897 ph10 463 if (eptr >= md->end_subject)
2898 ph10 462 {
2899 ph10 463 SCHECK_PARTIAL();
2900 ph10 462 break;
2901 ph10 463 }
2902 ph10 384 GETCHARLENTEST(c, eptr, len);
2903 nigel 77 if (!_pcre_xclass(c, data)) break;
2904     eptr += len;
2905     }
2906     for(;;)
2907     {
2908 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2909 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910     if (eptr-- == pp) break; /* Stop if tried at original pos */
2911 ph10 214 if (utf8) BACKCHAR(eptr);
2912 nigel 77 }
2913 ph10 510 MRRETURN(MATCH_NOMATCH);
2914 nigel 77 }
2915    
2916     /* Control never gets here */
2917     }
2918     #endif /* End of XCLASS */
2919    
2920     /* Match a single character, casefully */
2921    
2922     case OP_CHAR:
2923     #ifdef SUPPORT_UTF8
2924     if (utf8)
2925     {
2926     length = 1;
2927     ecode++;
2928     GETCHARLEN(fc, ecode, length);
2929 ph10 443 if (length > md->end_subject - eptr)
2930 ph10 428 {
2931     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2932 ph10 510 MRRETURN(MATCH_NOMATCH);
2933 ph10 443 }
2934 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2935 nigel 77 }
2936     else
2937     #endif
2938    
2939     /* Non-UTF-8 mode */
2940     {
2941 ph10 443 if (md->end_subject - eptr < 1)
2942 ph10 428 {
2943     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2944 ph10 510 MRRETURN(MATCH_NOMATCH);
2945 ph10 443 }
2946 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2947 nigel 77 ecode += 2;
2948     }
2949     break;
2950    
2951     /* Match a single character, caselessly */
2952    
2953 ph10 602 case OP_CHARI:
2954 nigel 77 #ifdef SUPPORT_UTF8
2955     if (utf8)
2956     {
2957     length = 1;
2958     ecode++;
2959     GETCHARLEN(fc, ecode, length);
2960    
2961 ph10 443 if (length > md->end_subject - eptr)
2962 ph10 428 {
2963     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2964 ph10 510 MRRETURN(MATCH_NOMATCH);
2965 ph10 443 }
2966 nigel 77
2967     /* If the pattern character's value is < 128, we have only one byte, and
2968     can use the fast lookup table. */
2969    
2970     if (fc < 128)
2971     {
2972 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2973 nigel 77 }
2974    
2975     /* Otherwise we must pick up the subject character */
2976    
2977     else
2978     {
2979 nigel 93 unsigned int dc;
2980 nigel 77 GETCHARINC(dc, eptr);
2981     ecode += length;
2982    
2983     /* If we have Unicode property support, we can use it to test the other
2984 nigel 87 case of the character, if there is one. */
2985 nigel 77
2986     if (fc != dc)
2987     {
2988     #ifdef SUPPORT_UCP
2989 ph10 349 if (dc != UCD_OTHERCASE(fc))
2990 nigel 77 #endif
2991 ph10 510 MRRETURN(MATCH_NOMATCH);
2992 nigel 77 }
2993     }
2994     }
2995     else
2996     #endif /* SUPPORT_UTF8 */
2997    
2998     /* Non-UTF-8 mode */
2999     {
3000 ph10 443 if (md->end_subject - eptr < 1)
3001 ph10 428 {
3002 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3003 ph10 510 MRRETURN(MATCH_NOMATCH);
3004 ph10 443 }
3005 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3006 nigel 77 ecode += 2;
3007     }
3008     break;
3009    
3010 nigel 93 /* Match a single character repeatedly. */
3011 nigel 77
3012     case OP_EXACT:
3013 ph10 602 case OP_EXACTI:
3014 nigel 77 min = max = GET2(ecode, 1);
3015     ecode += 3;
3016     goto REPEATCHAR;
3017    
3018 nigel 93 case OP_POSUPTO:
3019 ph10 602 case OP_POSUPTOI:
3020 nigel 93 possessive = TRUE;
3021     /* Fall through */
3022    
3023 nigel 77 case OP_UPTO:
3024 ph10 602 case OP_UPTOI:
3025 nigel 77 case OP_MINUPTO:
3026 ph10 602 case OP_MINUPTOI:
3027 nigel 77 min = 0;
3028     max = GET2(ecode, 1);
3029 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3030 nigel 77 ecode += 3;
3031     goto REPEATCHAR;
3032    
3033 nigel 93 case OP_POSSTAR:
3034 ph10 602 case OP_POSSTARI:
3035 nigel 93 possessive = TRUE;
3036     min = 0;
3037     max = INT_MAX;
3038     ecode++;
3039     goto REPEATCHAR;
3040    
3041     case OP_POSPLUS:
3042 ph10 602 case OP_POSPLUSI:
3043 nigel 93 possessive = TRUE;
3044     min = 1;
3045     max = INT_MAX;
3046     ecode++;
3047     goto REPEATCHAR;
3048    
3049     case OP_POSQUERY:
3050 ph10 602 case OP_POSQUERYI:
3051 nigel 93 possessive = TRUE;
3052     min = 0;
3053     max = 1;
3054     ecode++;
3055     goto REPEATCHAR;
3056    
3057 nigel 77 case OP_STAR:
3058 ph10 602 case OP_STARI:
3059 nigel 77 case OP_MINSTAR:
3060 ph10 602 case OP_MINSTARI:
3061 nigel 77 case OP_PLUS:
3062 ph10 602 case OP_PLUSI:
3063 nigel 77 case OP_MINPLUS:
3064 ph10 602 case OP_MINPLUSI:
3065 nigel 77 case OP_QUERY:
3066 ph10 602 case OP_QUERYI:
3067 nigel 77 case OP_MINQUERY:
3068 ph10 602 case OP_MINQUERYI:
3069     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3070 nigel 77 minimize = (c & 1) != 0;
3071     min = rep_min[c]; /* Pick up values from tables; */
3072     max = rep_max[c]; /* zero for max => infinity */
3073     if (max == 0) max = INT_MAX;
3074    
3075 ph10 426 /* Common code for all repeated single-character matches. */
3076 nigel 77
3077     REPEATCHAR:
3078     #ifdef SUPPORT_UTF8
3079     if (utf8)
3080     {
3081     length = 1;
3082     charptr = ecode;
3083     GETCHARLEN(fc, ecode, length);
3084     ecode += length;
3085    
3086     /* Handle multibyte character matching specially here. There is
3087     support for caseless matching if UCP support is present. */
3088    
3089     if (length > 1)
3090     {
3091     #ifdef SUPPORT_UCP
3092 nigel 93 unsigned int othercase;
3093 ph10 602 if (op >= OP_STARI && /* Caseless */
3094 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3095 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3096 ph10 115 else oclength = 0;
3097 nigel 77 #endif /* SUPPORT_UCP */
3098    
3099     for (i = 1; i <= min; i++)
3100     {
3101 ph10 426 if (eptr <= md->end_subject - length &&
3102     memcmp(eptr, charptr, length) == 0) eptr += length;
3103 ph10 123 #ifdef SUPPORT_UCP
3104 ph10 426 else if (oclength > 0 &&
3105     eptr <= md->end_subject - oclength &&
3106     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3107     #endif /* SUPPORT_UCP */
3108 nigel 77 else
3109     {
3110 ph10 426 CHECK_PARTIAL();
3111 ph10 510 MRRETURN(MATCH_NOMATCH);
3112 nigel 77 }
3113     }
3114    
3115     if (min == max) continue;
3116    
3117     if (minimize)
3118     {
3119     for (fi = min;; fi++)
3120     {
3121 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3122 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3123 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3124 ph10 426 if (eptr <= md->end_subject - length &&
3125     memcmp(eptr, charptr, length) == 0) eptr += length;
3126 ph10 123 #ifdef SUPPORT_UCP
3127 ph10 426 else if (oclength > 0 &&
3128     eptr <= md->end_subject - oclength &&
3129     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3130     #endif /* SUPPORT_UCP */
3131 nigel 77 else
3132     {
3133 ph10 426 CHECK_PARTIAL();
3134 ph10 510 MRRETURN(MATCH_NOMATCH);
3135 nigel 77 }
3136     }
3137     /* Control never gets here */
3138     }
3139 nigel 93
3140     else /* Maximize */
3141 nigel 77 {
3142     pp = eptr;
3143     for (i = min; i < max; i++)
3144     {
3145 ph10 426 if (eptr <= md->end_subject - length &&
3146     memcmp(eptr, charptr, length) == 0) eptr += length;
3147 ph10 123 #ifdef SUPPORT_UCP
3148 ph10 426 else if (oclength > 0 &&
3149     eptr <= md->end_subject - oclength &&
3150     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3151     #endif /* SUPPORT_UCP */
3152 ph10 463 else
3153 ph10 462 {
3154 ph10 463 CHECK_PARTIAL();
3155 ph10 462 break;
3156 ph10 463 }
3157 nigel 77 }
3158 nigel 93
3159     if (possessive) continue;
3160 ph10 427
3161 ph10 120 for(;;)
3162 ph10 426 {
3163 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3164 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3165 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3166 ph10 115 #ifdef SUPPORT_UCP
3167 ph10 426 eptr--;
3168     BACKCHAR(eptr);
3169 ph10 123 #else /* without SUPPORT_UCP */
3170 ph10 426 eptr -= length;
3171 ph10 123 #endif /* SUPPORT_UCP */
3172 ph10 426 }
3173 nigel 77 }
3174     /* Control never gets here */
3175     }
3176    
3177     /* If the length of a UTF-8 character is 1, we fall through here, and
3178     obey the code as for non-UTF-8 characters below, though in this case the
3179     value of fc will always be < 128. */
3180     }
3181     else
3182     #endif /* SUPPORT_UTF8 */
3183    
3184     /* When not in UTF-8 mode, load a single-byte character. */
3185    
3186 ph10 426 fc = *ecode++;
3187 ph10 443
3188 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3189     may not be in UTF-8 mode. The code is duplicated for the caseless and
3190     caseful cases, for speed, since matching characters is likely to be quite
3191     common. First, ensure the minimum number of matches are present. If min =
3192     max, continue at the same level without recursing. Otherwise, if
3193     minimizing, keep trying the rest of the expression and advancing one
3194     matching character if failing, up to the maximum. Alternatively, if
3195     maximizing, find the maximum number of characters and work backwards. */
3196    
3197     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3198     max, eptr));
3199    
3200 ph10 602 if (op >= OP_STARI) /* Caseless */
3201 nigel 77 {
3202     fc = md->lcc[fc];
3203     for (i = 1; i <= min; i++)
3204 ph10 426 {
3205     if (eptr >= md->end_subject)
3206     {
3207     SCHECK_PARTIAL();
3208 ph10 510 MRRETURN(MATCH_NOMATCH);
3209 ph10 426 }
3210 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3211 ph10 426 }
3212 nigel 77 if (min == max) continue;
3213     if (minimize)
3214     {
3215     for (fi = min;; fi++)
3216     {
3217 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3218 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3219 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3220 ph10 426 if (eptr >= md->end_subject)
3221     {
3222 ph10 427 SCHECK_PARTIAL();
3223 ph10 510 MRRETURN(MATCH_NOMATCH);
3224 ph10 426 }
3225 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3226 nigel 77 }
3227     /* Control never gets here */
3228     }
3229 nigel 93 else /* Maximize */
3230 nigel 77 {
3231     pp = eptr;
3232     for (i = min; i < max; i++)
3233     {
3234 ph10 463 if (eptr >= md->end_subject)
3235 ph10 462 {
3236     SCHECK_PARTIAL();
3237     break;
3238 ph10 463 }
3239 ph10 462 if (fc != md->lcc[*eptr]) break;
3240 nigel 77 eptr++;
3241     }
3242 ph10 427
3243 nigel 93 if (possessive) continue;
3244 ph10 427
3245 nigel 77 while (eptr >= pp)
3246     {
3247 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3248 nigel 77 eptr--;
3249     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3250     }
3251 ph10 510 MRRETURN(MATCH_NOMATCH);
3252 nigel 77 }
3253     /* Control never gets here */
3254     }
3255    
3256     /* Caseful comparisons (includes all multi-byte characters) */
3257    
3258     else
3259     {
3260 ph10 427 for (i = 1; i <= min; i++)
3261 ph10 426 {
3262     if (eptr >= md->end_subject)
3263     {
3264     SCHECK_PARTIAL();
3265 ph10 510 MRRETURN(MATCH_NOMATCH);
3266 ph10 426 }
3267 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3268 ph10 427 }
3269 ph10 443
3270 nigel 77 if (min == max) continue;
3271 ph10 443
3272 nigel 77 if (minimize)
3273     {
3274     for (fi = min;; fi++)
3275     {
3276 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3277 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3278 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3279 ph10 426 if (eptr >= md->end_subject)
3280 ph10 427 {
3281 ph10 426 SCHECK_PARTIAL();
3282 ph10 510 MRRETURN(MATCH_NOMATCH);
3283 ph10 427 }
3284 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3285 nigel 77 }
3286     /* Control never gets here */
3287     }
3288 nigel 93 else /* Maximize */
3289 nigel 77 {
3290     pp = eptr;
3291     for (i = min; i < max; i++)
3292     {
3293 ph10 463 if (eptr >= md->end_subject)
3294 ph10 462 {
3295 ph10 463 SCHECK_PARTIAL();
3296 ph10 462 break;
3297 ph10 463 }
3298 ph10 462 if (fc != *eptr) break;
3299 nigel 77 eptr++;
3300     }
3301 nigel 93 if (possessive) continue;
3302 ph10 443
3303 nigel 77 while (eptr >= pp)
3304     {
3305 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3306 nigel 77 eptr--;
3307     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308     }
3309 ph10 510 MRRETURN(MATCH_NOMATCH);
3310 nigel 77 }
3311     }
3312     /* Control never gets here */
3313    
3314     /* Match a negated single one-byte character. The character we are
3315     checking can be multibyte. */
3316    
3317     case OP_NOT:
3318 ph10 602 case OP_NOTI:
3319 ph10 443 if (eptr >= md->end_subject)
3320 ph10 428 {
3321 ph10 443 SCHECK_PARTIAL();
3322 ph10 510 MRRETURN(MATCH_NOMATCH);
3323 ph10 443 }
3324 nigel 77 ecode++;
3325     GETCHARINCTEST(c, eptr);
3326 ph10 602 if (op == OP_NOTI) /* The caseless case */
3327 nigel 77 {
3328     #ifdef SUPPORT_UTF8
3329     if (c < 256)
3330     #endif
3331     c = md->lcc[c];
3332 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3333 nigel 77 }
3334 ph10 602 else /* Caseful */
3335 nigel 77 {
3336 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3337 nigel 77 }
3338     break;
3339    
3340     /* Match a negated single one-byte character repeatedly. This is almost a
3341     repeat of the code for a repeated single character, but I haven't found a
3342     nice way of commoning these up that doesn't require a test of the
3343     positive/negative option for each character match. Maybe that wouldn't add
3344     very much to the time taken, but character matching *is* what this is all
3345     about... */
3346    
3347     case OP_NOTEXACT:
3348 ph10 602 case OP_NOTEXACTI:
3349 nigel 77 min = max = GET2(ecode, 1);
3350     ecode += 3;
3351     goto REPEATNOTCHAR;
3352    
3353     case OP_NOTUPTO:
3354 ph10 602 case OP_NOTUPTOI:
3355 nigel 77 case OP_NOTMINUPTO:
3356 ph10 602 case OP_NOTMINUPTOI:
3357 nigel 77 min = 0;
3358     max = GET2(ecode, 1);
3359 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3360 nigel 77 ecode += 3;
3361     goto REPEATNOTCHAR;
3362    
3363 nigel 93 case OP_NOTPOSSTAR:
3364 ph10 602 case OP_NOTPOSSTARI:
3365 nigel 93 possessive = TRUE;
3366     min = 0;
3367     max = INT_MAX;
3368     ecode++;
3369     goto REPEATNOTCHAR;
3370    
3371     case OP_NOTPOSPLUS:
3372 ph10 602 case OP_NOTPOSPLUSI:
3373 nigel 93 possessive = TRUE;
3374     min = 1;
3375     max = INT_MAX;
3376     ecode++;
3377     goto REPEATNOTCHAR;
3378    
3379     case OP_NOTPOSQUERY:
3380 ph10 602 case OP_NOTPOSQUERYI:
3381 nigel 93 possessive = TRUE;
3382     min = 0;
3383     max = 1;
3384     ecode++;
3385     goto REPEATNOTCHAR;
3386    
3387     case OP_NOTPOSUPTO:
3388 ph10 602 case OP_NOTPOSUPTOI:
3389 nigel 93 possessive = TRUE;
3390     min = 0;
3391     max = GET2(ecode, 1);
3392     ecode += 3;
3393     goto REPEATNOTCHAR;
3394    
3395 nigel 77 case OP_NOTSTAR:
3396 ph10 602 case OP_NOTSTARI:
3397 nigel 77 case OP_NOTMINSTAR:
3398 ph10 602 case OP_NOTMINSTARI:
3399 nigel 77 case OP_NOTPLUS:
3400 ph10 602 case OP_NOTPLUSI:
3401 nigel 77 case OP_NOTMINPLUS:
3402 ph10 602 case OP_NOTMINPLUSI:
3403 nigel 77 case OP_NOTQUERY:
3404 ph10 602 case OP_NOTQUERYI:
3405 nigel 77 case OP_NOTMINQUERY:
3406 ph10 602 case OP_NOTMINQUERYI:
3407     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3408 nigel 77 minimize = (c & 1) != 0;
3409     min = rep_min[c]; /* Pick up values from tables; */
3410     max = rep_max[c]; /* zero for max => infinity */
3411     if (max == 0) max = INT_MAX;
3412    
3413 ph10 426 /* Common code for all repeated single-byte matches. */
3414 nigel 77
3415     REPEATNOTCHAR:
3416     fc = *ecode++;
3417    
3418     /* The code is duplicated for the caseless and caseful cases, for speed,
3419     since matching characters is likely to be quite common. First, ensure the
3420     minimum number of matches are present. If min = max, continue at the same
3421     level without recursing. Otherwise, if minimizing, keep trying the rest of
3422     the expression and advancing one matching character if failing, up to the
3423     maximum. Alternatively, if maximizing, find the maximum number of
3424     characters and work backwards. */
3425    
3426     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3427     max, eptr));
3428    
3429 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3430 nigel 77 {
3431     fc = md->lcc[fc];
3432    
3433     #ifdef SUPPORT_UTF8
3434     /* UTF-8 mode */
3435     if (utf8)
3436     {
3437 nigel 93 register unsigned int d;
3438 nigel 77 for (i = 1; i <= min; i++)
3439     {
3440 ph10 426 if (eptr >= md->end_subject)
3441     {
3442     SCHECK_PARTIAL();
3443 ph10 510 MRRETURN(MATCH_NOMATCH);
3444 ph10 427 }
3445 nigel 77 GETCHARINC(d, eptr);
3446     if (d < 256) d = md->lcc[d];
3447 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3448 nigel 77 }
3449     }
3450     else
3451     #endif
3452    
3453     /* Not UTF-8 mode */
3454     {
3455     for (i = 1; i <= min; i++)
3456 ph10 426 {
3457     if (eptr >= md->end_subject)
3458     {
3459     SCHECK_PARTIAL();
3460 ph10 510 MRRETURN(MATCH_NOMATCH);
3461 ph10 427 }
3462 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3463 ph10 427 }
3464 nigel 77 }
3465    
3466     if (min == max) continue;
3467    
3468     if (minimize)
3469     {
3470     #ifdef SUPPORT_UTF8
3471     /* UTF-8 mode */
3472     if (utf8)
3473     {
3474 nigel 93 register unsigned int d;
3475 nigel 77 for (fi = min;; fi++)
3476     {
3477 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3478 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3480 ph10 427 if (eptr >= md->end_subject)
3481 ph10 426 {
3482 ph10 427 SCHECK_PARTIAL();
3483 ph10 510 MRRETURN(MATCH_NOMATCH);
3484 ph10 427 }
3485 nigel 77 GETCHARINC(d, eptr);
3486     if (d < 256) d = md->lcc[d];
3487 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3488 nigel 77 }
3489     }
3490     else
3491     #endif
3492     /* Not UTF-8 mode */
3493     {
3494     for (fi = min;; fi++)
3495     {
3496 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3497 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3499 ph10 426 if (eptr >= md->end_subject)
3500     {
3501     SCHECK_PARTIAL();
3502 ph10 510 MRRETURN(MATCH_NOMATCH);
3503 ph10 426 }
3504 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3505 nigel 77 }
3506     }
3507     /* Control never gets here */
3508     }
3509    
3510     /* Maximize case */
3511    
3512     else
3513     {
3514     pp = eptr;
3515    
3516     #ifdef SUPPORT_UTF8
3517     /* UTF-8 mode */
3518     if (utf8)
3519     {
3520 nigel 93 register unsigned int d;
3521 nigel 77 for (i = min; i < max; i++)
3522     {
3523     int len = 1;
3524 ph10 463 if (eptr >= md->end_subject)
3525 ph10 462 {
3526 ph10 463 SCHECK_PARTIAL();
3527 ph10 462 break;
3528 ph10 463 }
3529 nigel 77 GETCHARLEN(d, eptr, len);
3530     if (d < 256) d = md->lcc[d];
3531     if (fc == d) break;
3532     eptr += len;
3533     }
3534 nigel 93 if (possessive) continue;
3535     for(;;)
3536 nigel 77 {
3537 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3538 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3539     if (eptr-- == pp) break; /* Stop if tried at original pos */
3540     BACKCHAR(eptr);
3541     }
3542     }
3543     else
3544     #endif
3545     /* Not UTF-8 mode */
3546     {
3547     for (i = min; i < max; i++)
3548     {
3549 ph10 463 if (eptr >= md->end_subject)
3550 ph10 462 {
3551     SCHECK_PARTIAL();
3552     break;
3553 ph10 463 }
3554 ph10 462 if (fc == md->lcc[*eptr]) break;
3555 nigel 77 eptr++;
3556     }
3557 nigel 93 if (possessive) continue;
3558 nigel 77 while (eptr >= pp)
3559     {
3560 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3561 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3562     eptr--;
3563     }
3564     }
3565    
3566 ph10 510 MRRETURN(MATCH_NOMATCH);
3567 nigel 77 }
3568     /* Control never gets here */
3569     }
3570    
3571     /* Caseful comparisons */
3572    
3573     else
3574     {
3575     #ifdef SUPPORT_UTF8
3576     /* UTF-8 mode */
3577     if (utf8)
3578     {
3579 nigel 93 register unsigned int d;
3580 nigel 77 for (i = 1; i <= min; i++)
3581     {
3582 ph10 426 if (eptr >= md->end_subject)
3583     {
3584     SCHECK_PARTIAL();
3585 ph10 510 MRRETURN(MATCH_NOMATCH);
3586 ph10 427 }
3587 nigel 77 GETCHARINC(d, eptr);
3588 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3589 nigel 77 }
3590     }
3591     else
3592     #endif
3593     /* Not UTF-8 mode */
3594     {
3595     for (i = 1; i <= min; i++)
3596 ph10 426 {
3597     if (eptr >= md->end_subject)
3598     {
3599     SCHECK_PARTIAL();
3600 ph10 510 MRRETURN(MATCH_NOMATCH);
3601 ph10 427 }
3602 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3603 ph10 427 }
3604 nigel 77 }
3605    
3606     if (min == max) continue;
3607    
3608     if (minimize)
3609     {
3610     #ifdef SUPPORT_UTF8
3611     /* UTF-8 mode */
3612     if (utf8)
3613     {
3614 nigel 93 register unsigned int d;
3615 nigel 77 for (fi = min;; fi++)
3616     {
3617 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3618 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3619 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3620 ph10 427 if (eptr >= md->end_subject)
3621 ph10 426 {
3622 ph10 427 SCHECK_PARTIAL();
3623 ph10 510 MRRETURN(MATCH_NOMATCH);
3624 ph10 427 }
3625 nigel 77 GETCHARINC(d, eptr);
3626 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3627 nigel 77 }
3628     }
3629     else
3630     #endif
3631     /* Not UTF-8 mode */
3632     {
3633     for (fi = min;; fi++)
3634     {
3635 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3636 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3637 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3638 ph10 426 if (eptr >= md->end_subject)
3639     {
3640     SCHECK_PARTIAL();
3641 ph10 510 MRRETURN(MATCH_NOMATCH);
3642 ph10 427 }
3643 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3644 nigel 77 }
3645     }
3646     /* Control never gets here */
3647     }
3648    
3649     /* Maximize case */
3650    
3651     else
3652     {
3653     pp = eptr;
3654    
3655     #ifdef SUPPORT_UTF8
3656     /* UTF-8 mode */
3657     if (utf8)
3658     {
3659 nigel 93 register unsigned int d;
3660 nigel 77 for (i = min; i < max; i++)
3661     {
3662     int len = 1;
3663 ph10 463 if (eptr >= md->end_subject)
3664 ph10 462 {
3665 ph10 463 SCHECK_PARTIAL();
3666 ph10 462 break;
3667 ph10 463 }
3668 nigel 77 GETCHARLEN(d, eptr, len);
3669     if (fc == d) break;
3670     eptr += len;
3671     }
3672 nigel 93 if (possessive) continue;
3673 nigel 77 for(;;)
3674     {
3675 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3676 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3677     if (eptr-- == pp) break; /* Stop if tried at original pos */
3678     BACKCHAR(eptr);
3679     }
3680     }
3681     else
3682     #endif
3683     /* Not UTF-8 mode */
3684     {
3685     for (i = min; i < max; i++)
3686     {
3687 ph10 463 if (eptr >= md->end_subject)
3688 ph10 462 {
3689 ph10 463 SCHECK_PARTIAL();
3690 ph10 462 break;
3691 ph10 463 }
3692 ph10 462 if (fc == *eptr) break;
3693 nigel 77 eptr++;
3694     }
3695 nigel 93 if (possessive) continue;
3696 nigel 77 while (eptr >= pp)
3697     {
3698 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3699 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3700     eptr--;
3701     }
3702     }
3703    
3704 ph10 510 MRRETURN(MATCH_NOMATCH);
3705 nigel 77 }
3706     }
3707     /* Control never gets here */
3708    
3709     /* Match a single character type repeatedly; several different opcodes
3710     share code. This is very similar to the code for single characters, but we
3711     repeat it in the interests of efficiency. */
3712    
3713     case OP_TYPEEXACT:
3714     min = max = GET2(ecode, 1);
3715     minimize = TRUE;
3716     ecode += 3;
3717     goto REPEATTYPE;
3718    
3719     case OP_TYPEUPTO:
3720     case OP_TYPEMINUPTO:
3721     min = 0;
3722     max = GET2(ecode, 1);
3723     minimize = *ecode == OP_TYPEMINUPTO;
3724     ecode += 3;
3725     goto REPEATTYPE;
3726    
3727 nigel 93 case OP_TYPEPOSSTAR:
3728     possessive = TRUE;
3729     min = 0;
3730     max = INT_MAX;
3731     ecode++;
3732     goto REPEATTYPE;
3733    
3734     case OP_TYPEPOSPLUS:
3735     possessive = TRUE;
3736     min = 1;
3737     max = INT_MAX;
3738     ecode++;
3739     goto REPEATTYPE;
3740    
3741     case OP_TYPEPOSQUERY:
3742     possessive = TRUE;
3743     min = 0;
3744     max = 1;
3745     ecode++;
3746     goto REPEATTYPE;
3747    
3748     case OP_TYPEPOSUPTO:
3749     possessive = TRUE;
3750     min = 0;
3751     max = GET2(ecode, 1);
3752     ecode += 3;
3753     goto REPEATTYPE;
3754    
3755 nigel 77 case OP_TYPESTAR:
3756     case OP_TYPEMINSTAR:
3757     case OP_TYPEPLUS:
3758     case OP_TYPEMINPLUS:
3759     case OP_TYPEQUERY:
3760     case OP_TYPEMINQUERY:
3761     c = *ecode++ - OP_TYPESTAR;
3762     minimize = (c & 1) != 0;
3763     min = rep_min[c]; /* Pick up values from tables; */
3764     max = rep_max[c]; /* zero for max => infinity */
3765     if (max == 0) max = INT_MAX;
3766    
3767     /* Common code for all repeated single character type matches. Note that
3768     in UTF-8 mode, '.' matches a character of any length, but for the other
3769     character types, the valid characters are all one byte long. */
3770    
3771     REPEATTYPE:
3772     ctype = *ecode++; /* Code for the character type */
3773    
3774     #ifdef SUPPORT_UCP
3775     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3776     {
3777     prop_fail_result = ctype == OP_NOTPROP;
3778     prop_type = *ecode++;
3779 nigel 87 prop_value = *ecode++;
3780 nigel 77 }
3781     else prop_type = -1;
3782     #endif
3783    
3784     /* First, ensure the minimum number of matches are present. Use inline
3785     code for maximizing the speed, and do the type test once at the start
3786 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3787 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3788     and single-bytes. */
3789    
3790     if (min > 0)
3791     {
3792     #ifdef SUPPORT_UCP
3793 nigel 87 if (prop_type >= 0)
3794 nigel 77 {
3795 nigel 87 switch(prop_type)
3796 nigel 77 {
3797 nigel 87 case PT_ANY:
3798 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3799 nigel 87 for (i = 1; i <= min; i++)
3800     {
3801 ph10 427 if (eptr >= md->end_subject)
3802 ph10 426 {
3803 ph10 427 SCHECK_PARTIAL();
3804 ph10 510 MRRETURN(MATCH_NOMATCH);
3805 ph10 427 }
3806 ph10 184 GETCHARINCTEST(c, eptr);
3807 nigel 87 }
3808     break;
3809    
3810     case PT_LAMP:
3811     for (i = 1; i <= min; i++)
3812     {
3813 ph10 427 if (eptr >= md->end_subject)
3814 ph10 426 {
3815 ph10 427 SCHECK_PARTIAL();
3816 ph10 510 MRRETURN(MATCH_NOMATCH);
3817 ph10 427 }
3818 ph10 184 GETCHARINCTEST(c, eptr);
3819 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3820 nigel 87 if ((prop_chartype == ucp_Lu ||
3821     prop_chartype == ucp_Ll ||
3822     prop_chartype == ucp_Lt) == prop_fail_result)
3823 ph10 510 MRRETURN(MATCH_NOMATCH);
3824 nigel 87 }
3825     break;
3826    
3827     case PT_GC:
3828     for (i = 1; i <= min; i++)
3829     {
3830 ph10 427 if (eptr >= md->end_subject)
3831 ph10 426 {
3832 ph10 427 SCHECK_PARTIAL();