/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 642 - (hide annotations) (download)
Thu Jul 28 18:59:40 2011 UTC (21 months, 2 weeks ago) by ph10
File MIME type: text/plain
File size: 194303 byte(s)
Avoid false positive for infinite recursion by not checking conditionals at 
compile time, but add tests at runtime that also catch infinite mutual 
recursion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if the requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 623 RM61, RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
336     #define RRETURN(ra)\
337     {\
338 ph10 527 heapframe *oldframe = frame;\
339     frame = oldframe->Xprevframe;\
340     (pcre_stack_free)(oldframe);\
341 nigel 77 if (frame != NULL)\
342     {\
343 ph10 164 rrc = ra;\
344     goto HEAP_RETURN;\
345 nigel 77 }\
346     return ra;\
347     }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
/* One frame of the private heap "stack" used by match() in the NO_RECURSE
build. The RMATCH macro obtains a new frame with pcre_stack_malloc, fills in
the argument fields, and links it to the current frame through Xprevframe; the
RRETURN macro releases the frame with pcre_stack_free and goes back to the
previous one. Inside match() each field is reached through a macro with the
same name minus the leading X. */

352     typedef struct heapframe {
/* The frame that "called" this one; match() sets it to NULL in the top-level
frame, and RRETURN uses that to decide between a real return and a jump. */
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
/* RMATCH sets these in a new frame from its own arguments and from the
current frame (Xrdepth is the current frame's value plus one); for the
top-level frame they are copied from the real arguments of match(). */
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
/* These three flags do double duty: match() also refers to them under the
alias names allow_zero (cur_is_word), cbegroup and condassert (condition), and
matched_once (prev_is_word). */
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
/* Xcodelink is also used under the alias name code_offset. */
391 ph10 403 int Xcodelink;
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Sized by REC_STACK_SAVE_MAX, the maximum number of offset ints that are
saved in the frame rather than in memory obtained from malloc. */
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb;
406    
407 ph10 164 /* Where to jump back to */
408 nigel 77
/* RMATCH stores its rw argument (one of the RMn numbers) in the calling
frame's Xwhere before jumping to HEAP_RECURSE. */
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
436     #define CHECK_PARTIAL()\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 ph10 427 }
443 ph10 426
444     #define SCHECK_PARTIAL()\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to in order to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 ph10 553 the alt that is at the start of the current branch. This makes it possible
780     to skip back past alternatives that precede the THEN within the current
781     branch. */
782 ph10 512
783 ph10 210 case OP_THEN:
784     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 ph10 604 eptrb, RM54);
786 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
788 ph10 510 MRRETURN(MATCH_THEN);
789    
790     case OP_THEN_ARG:
791 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 ph10 604 offset_top, md, eptrb, RM58);
793 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
795     md->mark = ecode + LINK_SIZE + 2;
796 ph10 212 RRETURN(MATCH_THEN);
797 ph10 211
798 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
799     unlimited repeat. If there is space in the offset vector, save the current
800     subject position in the working slot at the top of the vector. We mustn't
801     change the current values of the data slot, because they may be set from a
802     previous iteration of this group, and be referred to by a reference inside
803 ph10 625 the group. A failure to match might occur after the group has succeeded,
804 ph10 617 if something later on doesn't match. For this reason, we need to restore
805     the working value and also the values of the final offsets, in case they
806     were set by a previous iteration of the same bracket.
807 nigel 77
808 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
809     a non-capturing bracket. Don't worry about setting the flag for the error
810     case here; that is handled in the code for KET. */
811 nigel 77
812 nigel 93 case OP_CBRA:
813     case OP_SCBRA:
814     number = GET2(ecode, 1+LINK_SIZE);
815 nigel 77 offset = number << 1;
816 ph10 625
817 ph10 475 #ifdef PCRE_DEBUG
818 nigel 93 printf("start bracket %d\n", number);
819     printf("subject=");
820 nigel 77 pchars(eptr, 16, TRUE, md);
821     printf("\n");
822     #endif
823    
824     if (offset < md->offset_max)
825     {
826     save_offset1 = md->offset_vector[offset];
827     save_offset2 = md->offset_vector[offset+1];
828     save_offset3 = md->offset_vector[md->offset_end - number];
829     save_capture_last = md->capture_last;
830    
831     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 ph10 531 md->offset_vector[md->offset_end - number] =
833 ph10 530 (int)(eptr - md->start_subject);
834 nigel 77
835 ph10 604 for (;;)
836 nigel 77 {
837 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 ph10 604 eptrb, RM1);
840 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 ph10 550 if (rrc != MATCH_NOMATCH &&
842     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843     RRETURN(rrc);
844 nigel 77 md->capture_last = save_capture_last;
845     ecode += GET(ecode, 1);
846 ph10 625 if (*ecode != OP_ALT) break;
847 nigel 77 }
848    
849     DPRINTF(("bracket %d failed\n", number));
850     md->offset_vector[offset] = save_offset1;
851     md->offset_vector[offset+1] = save_offset2;
852     md->offset_vector[md->offset_end - number] = save_offset3;
853 ph10 625
854     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 ph10 618 MATCH_THEN. */
856 nigel 77
857 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 nigel 77 }
860    
861 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862     as a non-capturing bracket. */
863 nigel 77
864 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866    
867 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868 nigel 77
869 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
873     repeat. Loop for all the alternatives. When we get to the final alternative
874     within the brackets, we used to return the result of a recursive call to
875     match() whatever happened so it was possible to reduce stack usage by
876     turning this into a tail recursion, except in the case of a possibly empty
877     group. However, now that there is the possibility of (*THEN) occurring in
878 ph10 625 the final alternative, this optimization is no longer possible.
879    
880     MATCH_ONCE is returned when the end of an atomic group is successfully
881     reached, but subsequent matching fails. It passes back up the tree (causing
882     captured values to be reset) until the original atomic group level is
883 ph10 618 reached. This is tested by comparing md->once_target with the start of the
884     group. At this point, the return is converted into MATCH_NOMATCH so that
885     previous backup points can be taken. */
886 nigel 77
887 ph10 618 case OP_ONCE:
888 nigel 93 case OP_BRA:
889     case OP_SBRA:
890     DPRINTF(("start non-capturing bracket\n"));
891 ph10 618
892 nigel 91 for (;;)
893 nigel 77 {
894 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 ph10 625 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 ph10 604 RM2);
897 ph10 550 if (rrc != MATCH_NOMATCH &&
898     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 ph10 625 {
900 ph10 618 if (rrc == MATCH_ONCE)
901     {
902     const uschar *scode = ecode;
903     if (*scode != OP_ONCE) /* If not at start, find it */
904     {
905     while (*scode == OP_ALT) scode += GET(scode, 1);
906     scode -= GET(scode, 1);
907 ph10 625 }
908 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 ph10 625 }
910 ph10 550 RRETURN(rrc);
911 ph10 625 }
912 nigel 77 ecode += GET(ecode, 1);
913 ph10 625 if (*ecode != OP_ALT) break;
914 nigel 77 }
915 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916     RRETURN(MATCH_NOMATCH);
917    
918 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
920     handled similarly to the normal case above. However, the matching is
921     different. The end of these brackets will always be OP_KETRPOS, which
922     returns MATCH_KETRPOS without going further in the pattern. By this means
923     we can handle the group by iteration rather than recursion, thereby
924     reducing the amount of stack needed. */
925 ph10 625
926 ph10 604 case OP_CBRAPOS:
927     case OP_SCBRAPOS:
928     allow_zero = FALSE;
929 ph10 625
930 ph10 604 POSSESSIVE_CAPTURE:
931     number = GET2(ecode, 1+LINK_SIZE);
932     offset = number << 1;
933    
934     #ifdef PCRE_DEBUG
935     printf("start possessive bracket %d\n", number);
936     printf("subject=");
937     pchars(eptr, 16, TRUE, md);
938     printf("\n");
939     #endif
940    
941     if (offset < md->offset_max)
942     {
943     matched_once = FALSE;
944 ph10 625 code_offset = ecode - md->start_code;
945 ph10 604
946     save_offset1 = md->offset_vector[offset];
947     save_offset2 = md->offset_vector[offset+1];
948     save_offset3 = md->offset_vector[md->offset_end - number];
949     save_capture_last = md->capture_last;
950    
951     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952 ph10 625
953     /* Each time round the loop, save the current subject position for use
954     when the group matches. For MATCH_MATCH, the group has matched, so we
955     restart it with a new subject starting position, remembering that we had
956     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957     usual. If we haven't matched any alternatives in any iteration, check to
958     see if a previous iteration matched. If so, the group has matched;
959     continue from afterwards. Otherwise it has failed; restore the previous
960 ph10 604 capture values before returning NOMATCH. */
961 ph10 625
962 ph10 604 for (;;)
963     {
964     md->offset_vector[md->offset_end - number] =
965     (int)(eptr - md->start_subject);
966 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968     eptrb, RM63);
969     if (rrc == MATCH_KETRPOS)
970     {
971     offset_top = md->end_offset_top;
972     eptr = md->end_match_ptr;
973 ph10 625 ecode = md->start_code + code_offset;
974 ph10 604 save_capture_last = md->capture_last;
975 ph10 625 matched_once = TRUE;
976     continue;
977     }
978 ph10 604 if (rrc != MATCH_NOMATCH &&
979     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980     RRETURN(rrc);
981     md->capture_last = save_capture_last;
982     ecode += GET(ecode, 1);
983 ph10 625 if (*ecode != OP_ALT) break;
984 ph10 604 }
985 ph10 610
986 ph10 604 if (!matched_once)
987 ph10 625 {
988 ph10 604 md->offset_vector[offset] = save_offset1;
989     md->offset_vector[offset+1] = save_offset2;
990     md->offset_vector[md->offset_end - number] = save_offset3;
991     }
992 ph10 625
993 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 ph10 604 if (allow_zero || matched_once)
995 ph10 625 {
996 ph10 604 ecode += 1 + LINK_SIZE;
997     break;
998 ph10 625 }
999    
1000 ph10 604 RRETURN(MATCH_NOMATCH);
1001     }
1002 ph10 625
1003 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004     as a non-capturing bracket. */
1005    
1006     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008    
1009     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010    
1011     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013    
1014 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1016     without the capturing complication. It is written out separately for speed
1017     and cleanliness. */
1018    
1019     case OP_BRAPOS:
1020     case OP_SBRAPOS:
1021 ph10 625 allow_zero = FALSE;
1022    
1023 ph10 604 POSSESSIVE_NON_CAPTURE:
1024     matched_once = FALSE;
1025 ph10 625 code_offset = ecode - md->start_code;
1026 ph10 604
1027     for (;;)
1028     {
1029 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 ph10 609 eptrb, RM48);
1032 ph10 604 if (rrc == MATCH_KETRPOS)
1033     {
1034 ph10 610 offset_top = md->end_offset_top;
1035 ph10 604 eptr = md->end_match_ptr;
1036 ph10 625 ecode = md->start_code + code_offset;
1037     matched_once = TRUE;
1038     continue;
1039     }
1040 ph10 604 if (rrc != MATCH_NOMATCH &&
1041     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042     RRETURN(rrc);
1043     ecode += GET(ecode, 1);
1044 ph10 625 if (*ecode != OP_ALT) break;
1045 ph10 604 }
1046 ph10 625
1047     if (matched_once || allow_zero)
1048 ph10 604 {
1049     ecode += 1 + LINK_SIZE;
1050     break;
1051 ph10 625 }
1052 ph10 604 RRETURN(MATCH_NOMATCH);
1053    
1054     /* Control never reaches here. */
1055    
1056 nigel 77 /* Conditional group: compilation checked that there are no more than
1057     two branches. If the condition is false, skipping the first branch takes us
1058     past the end if there is only one branch, but that's OK because that is
1059 ph10 609 exactly what going to the ket would do. */
1060 nigel 77
1061     case OP_COND:
1062 nigel 93 case OP_SCOND:
1063 ph10 604 codelink = GET(ecode, 1);
1064 ph10 406
1065 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1066     inserted between OP_COND and an assertion condition. */
1067 ph10 392
1068 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069     {
1070     if (pcre_callout != NULL)
1071     {
1072     pcre_callout_block cb;
1073     cb.version = 1; /* Version 1 of the callout block */
1074     cb.callout_number = ecode[LINK_SIZE+2];
1075     cb.offset_vector = md->offset_vector;
1076     cb.subject = (PCRE_SPTR)md->start_subject;
1077 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078     cb.start_match = (int)(mstart - md->start_subject);
1079     cb.current_position = (int)(eptr - md->start_subject);
1080 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082     cb.capture_top = offset_top/2;
1083     cb.capture_last = md->capture_last;
1084     cb.callout_data = md->callout_data;
1085 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 ph10 381 if (rrc < 0) RRETURN(rrc);
1087     }
1088     ecode += _pcre_OP_lengths[OP_CALLOUT];
1089     }
1090 ph10 392
1091 ph10 399 condcode = ecode[LINK_SIZE+1];
1092 ph10 406
1093 ph10 381 /* Now see what the actual condition is */
1094 ph10 392
1095 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 nigel 77 {
1097 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1098     {
1099 ph10 461 condition = FALSE;
1100     ecode += GET(ecode, 1);
1101     }
1102 ph10 459 else
1103 ph10 461 {
1104 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106 ph10 461
1107 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1108     false, but the test was set up by name, scan the table to see if the
1109     name refers to any other numbers, and test them. The condition is true
1110     if any one is set. */
1111 ph10 461
1112 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113     {
1114     uschar *slotA = md->name_table;
1115     for (i = 0; i < md->name_count; i++)
1116 ph10 461 {
1117     if (GET2(slotA, 0) == recno) break;
1118 ph10 459 slotA += md->name_entry_size;
1119     }
1120 ph10 461
1121 ph10 459 /* Found a name for the number - there can be only one; duplicate
1122     names for different numbers are allowed, but not vice versa. First
1123     scan down for duplicates. */
1124 ph10 461
1125 ph10 459 if (i < md->name_count)
1126 ph10 461 {
1127 ph10 459 uschar *slotB = slotA;
1128     while (slotB > md->name_table)
1129     {
1130     slotB -= md->name_entry_size;
1131     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132     {
1133     condition = GET2(slotB, 0) == md->recursive->group_num;
1134 ph10 461 if (condition) break;
1135     }
1136 ph10 459 else break;
1137 ph10 461 }
1138    
1139 ph10 459 /* Scan up for duplicates */
1140 ph10 461
1141 ph10 459 if (!condition)
1142 ph10 461 {
1143 ph10 459 slotB = slotA;
1144     for (i++; i < md->name_count; i++)
1145     {
1146     slotB += md->name_entry_size;
1147     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148     {
1149     condition = GET2(slotB, 0) == md->recursive->group_num;
1150     if (condition) break;
1151 ph10 461 }
1152 ph10 459 else break;
1153 ph10 461 }
1154     }
1155 ph10 459 }
1156 ph10 461 }
1157    
1158 ph10 459 /* Choose branch according to the condition */
1159 ph10 461
1160 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1161     }
1162 ph10 461 }
1163 nigel 93
1164 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 nigel 93 {
1166 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168 ph10 461
1169 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1170 ph10 461 scan the table to see if the name refers to any other numbers, and test
1171     them. The condition is true if any one is set. This is tediously similar
1172     to the code above, but not close enough to try to amalgamate. */
1173    
1174 ph10 459 if (!condition && condcode == OP_NCREF)
1175     {
1176 ph10 461 int refno = offset >> 1;
1177 ph10 459 uschar *slotA = md->name_table;
1178 ph10 461
1179 ph10 459 for (i = 0; i < md->name_count; i++)
1180 ph10 461 {
1181     if (GET2(slotA, 0) == refno) break;
1182 ph10 459 slotA += md->name_entry_size;
1183     }
1184 ph10 461
1185     /* Found a name for the number - there can be only one; duplicate names
1186     for different numbers are allowed, but not vice versa. First scan down
1187 ph10 459 for duplicates. */
1188 ph10 461
1189 ph10 459 if (i < md->name_count)
1190 ph10 461 {
1191 ph10 459 uschar *slotB = slotA;
1192     while (slotB > md->name_table)
1193     {
1194     slotB -= md->name_entry_size;
1195     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196     {
1197     offset = GET2(slotB, 0) << 1;
1198 ph10 461 condition = offset < offset_top &&
1199 ph10 459 md->offset_vector[offset] >= 0;
1200 ph10 461 if (condition) break;
1201     }
1202 ph10 459 else break;
1203 ph10 461 }
1204    
1205 ph10 459 /* Scan up for duplicates */
1206 ph10 461
1207 ph10 459 if (!condition)
1208 ph10 461 {
1209 ph10 459 slotB = slotA;
1210     for (i++; i < md->name_count; i++)
1211     {
1212     slotB += md->name_entry_size;
1213     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214     {
1215     offset = GET2(slotB, 0) << 1;
1216 ph10 461 condition = offset < offset_top &&
1217 ph10 459 md->offset_vector[offset] >= 0;
1218 ph10 461 if (condition) break;
1219     }
1220 ph10 459 else break;
1221 ph10 461 }
1222     }
1223 ph10 459 }
1224 ph10 461 }
1225    
1226 ph10 459 /* Choose branch according to the condition */
1227    
1228 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1229 nigel 77 }
1230    
1231 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 nigel 93 {
1233     condition = FALSE;
1234     ecode += GET(ecode, 1);
1235     }
1236    
1237 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1238 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239     an assertion. */
1240 nigel 77
1241     else
1242     {
1243 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1244 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 nigel 77 if (rrc == MATCH_MATCH)
1246     {
1247 ph10 619 if (md->end_offset_top > offset_top)
1248     offset_top = md->end_offset_top; /* Captures may have happened */
1249 nigel 93 condition = TRUE;
1250     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252     }
1253 ph10 550 else if (rrc != MATCH_NOMATCH &&
1254     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 nigel 77 {
1256     RRETURN(rrc); /* Need braces because of following else */
1257     }
1258 nigel 93 else
1259     {
1260     condition = FALSE;
1261 ph10 399 ecode += codelink;
1262 nigel 93 }
1263     }
1264 nigel 91
1265 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1266 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1267 ph10 625 when there was unlimited repeat of a possibly empty group. However, that
1268     strategy no longer works because of the possibility of (*THEN) being
1269 ph10 609 encountered in the branch. A recursive call to match() is always required,
1270     unless the second alternative doesn't exist, in which case we can just
1271     plough on. */
1272 nigel 91
1273 nigel 93 if (condition || *ecode == OP_ALT)
1274     {
1275 ph10 625 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277 ph10 625 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278 ph10 609 rrc = MATCH_NOMATCH;
1279     RRETURN(rrc);
1280 nigel 77 }
1281 ph10 395 else /* Condition false & no alternative */
1282 nigel 93 {
1283     ecode += 1 + LINK_SIZE;
1284     }
1285     break;
1286 nigel 77
1287 ph10 461
1288 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289     to close any currently open capturing brackets. */
1290 ph10 461
1291 ph10 447 case OP_CLOSE:
1292 ph10 461 number = GET2(ecode, 1);
1293 ph10 447 offset = number << 1;
1294 ph10 461
1295 ph10 475 #ifdef PCRE_DEBUG
1296 ph10 447 printf("end bracket %d at *ACCEPT", number);
1297     printf("\n");
1298     #endif
1299 nigel 77
1300 ph10 447 md->capture_last = number;
1301     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302     {
1303     md->offset_vector[offset] =
1304     md->offset_vector[md->offset_end - number];
1305 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1307     }
1308     ecode += 3;
1309 ph10 461 break;
1310 ph10 447
1311    
1312 ph10 619 /* End of the pattern, either real or forced. */
1313 nigel 77
1314 ph10 619 case OP_END:
1315 ph10 210 case OP_ACCEPT:
1316 ph10 625 case OP_ASSERT_ACCEPT:
1317    
1318 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1319     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 ph10 613 is set and we have matched at the start of the subject. In both cases,
1321     backtracking will then try other alternatives, if any. */
1322 ph10 443
1323 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 ph10 618 md->recursive == NULL &&
1325 ph10 619 (md->notempty ||
1326     (md->notempty_atstart &&
1327     mstart == md->start_subject + md->start_offset)))
1328 ph10 510 MRRETURN(MATCH_NOMATCH);
1329 ph10 443
1330 ph10 442 /* Otherwise, we have a match. */
1331 ph10 625
1332 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1333     md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335 nigel 77
1336 ph10 512 /* For some reason, the macros don't work properly if an expression is
1337     given as the argument to MRRETURN when the heap is in use. */
1338    
1339     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340     MRRETURN(rrc);
1341    
1342 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1343     matching won't pass the KET for an assertion. If any one branch matches,
1344     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345     start of each branch to move the current point backwards, so the code at
1346 ph10 625 this level is identical to the lookahead case. When the assertion is part
1347     of a condition, we want to return immediately afterwards. The caller of
1348     this incarnation of the match() function will have set MATCH_CONDASSERT in
1349     md->match_function_type, and one of these opcodes will be the first opcode
1350     that is processed. We use a local variable that is preserved over calls to
1351 ph10 604 match() to remember this case. */
1352 nigel 77
1353     case OP_ASSERT:
1354     case OP_ASSERTBACK:
1355 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1356     {
1357     condassert = TRUE;
1358     md->match_function_type = 0;
1359     }
1360 ph10 625 else condassert = FALSE;
1361    
1362 nigel 77 do
1363     {
1364 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 ph10 500 {
1367     mstart = md->start_match_ptr; /* In case \K reset it */
1368 ph10 630 markptr = md->mark;
1369 ph10 500 break;
1370 ph10 501 }
1371 ph10 550 if (rrc != MATCH_NOMATCH &&
1372     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1373     RRETURN(rrc);
1374 nigel 77 ecode += GET(ecode, 1);
1375     }
1376     while (*ecode == OP_ALT);
1377 ph10 625
1378 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1379 nigel 77
1380     /* If checking an assertion for a condition, return MATCH_MATCH. */
1381    
1382 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1383 nigel 77
1384     /* Continue from after the assertion, updating the offsets high water
1385     mark, since extracts may have been taken during the assertion. */
1386    
1387     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1388     ecode += 1 + LINK_SIZE;
1389     offset_top = md->end_offset_top;
1390     continue;
1391    
1392 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1393 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1394 ph10 473 branches. */
1395 nigel 77
1396     case OP_ASSERT_NOT:
1397     case OP_ASSERTBACK_NOT:
1398 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1399     {
1400     condassert = TRUE;
1401     md->match_function_type = 0;
1402     }
1403 ph10 625 else condassert = FALSE;
1404 ph10 604
1405 nigel 77 do
1406     {
1407 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1408 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1409 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1410     {
1411     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1412 ph10 482 break;
1413     }
1414 ph10 550 if (rrc != MATCH_NOMATCH &&
1415     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1416     RRETURN(rrc);
1417 nigel 77 ecode += GET(ecode,1);
1418     }
1419     while (*ecode == OP_ALT);
1420    
1421 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1422 ph10 625
1423 nigel 77 ecode += 1 + LINK_SIZE;
1424     continue;
1425    
1426     /* Move the subject pointer back. This occurs only at the start of
1427     each branch of a lookbehind assertion. If we are too close to the start to
1428     move back, this match function fails. When working with UTF-8 we move
1429     back a number of characters, not bytes. */
1430    
1431     case OP_REVERSE:
1432     #ifdef SUPPORT_UTF8
1433     if (utf8)
1434     {
1435 nigel 93 i = GET(ecode, 1);
1436     while (i-- > 0)
1437 nigel 77 {
1438     eptr--;
1439 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1440 ph10 207 BACKCHAR(eptr);
1441 nigel 77 }
1442     }
1443     else
1444     #endif
1445    
1446     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1447    
1448     {
1449 nigel 93 eptr -= GET(ecode, 1);
1450 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1451 nigel 77 }
1452    
1453 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1454 nigel 77
1455 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1456 nigel 77 ecode += 1 + LINK_SIZE;
1457     break;
1458    
1459     /* The callout item calls an external function, if one is provided, passing
1460     details of the match so far. This is mainly for debugging, though the
1461     function is able to force a failure. */
1462    
1463     case OP_CALLOUT:
1464     if (pcre_callout != NULL)
1465     {
1466     pcre_callout_block cb;
1467     cb.version = 1; /* Version 1 of the callout block */
1468     cb.callout_number = ecode[1];
1469     cb.offset_vector = md->offset_vector;
1470 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1471 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1472     cb.start_match = (int)(mstart - md->start_subject);
1473     cb.current_position = (int)(eptr - md->start_subject);
1474 nigel 77 cb.pattern_position = GET(ecode, 2);
1475     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1476     cb.capture_top = offset_top/2;
1477     cb.capture_last = md->capture_last;
1478     cb.callout_data = md->callout_data;
1479 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1480 nigel 77 if (rrc < 0) RRETURN(rrc);
1481     }
1482     ecode += 2 + 2*LINK_SIZE;
1483     break;
1484    
1485     /* Recursion either matches the current regex, or some subexpression. The
1486     offset data is the offset to the starting bracket from the start of the
1487     whole pattern. (This is so that it works from duplicated subpatterns.)
1488 ph10 625
1489 ph10 618 The state of the capturing groups is preserved over recursion, and
1490 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1491 ph10 618 finished (offset_top records the completed total) so we just have to save
1492     all the potential data. There may be up to 65535 such values, which is too
1493     large to put on the stack, but using malloc for small numbers seems
1494     expensive. As a compromise, the stack is used when there are no more than
1495     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1496 nigel 77
1497     There are also other values that have to be saved. We use a chained
1498     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1499 ph10 625 for the original version of this logic. It has, however, been hacked around
1500 ph10 618 a lot, so he is not to blame for the current way it works. */
1501 nigel 77
1502     case OP_RECURSE:
1503     {
1504 ph10 642 recursion_info *ri;
1505     int recno;
1506    
1507 nigel 77 callpat = md->start_code + GET(ecode, 1);
1508 ph10 642 recno = (callpat == md->start_code)? 0 :
1509     GET2(callpat, 1 + LINK_SIZE);
1510    
1511     /* Check for repeating a recursion without advancing the subject pointer.
1512     This should catch convoluted mutual recursions. (Some simple cases are
1513     caught at compile time.) */
1514    
1515     for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1516     if (recno == ri->group_num && eptr == ri->subject_position)
1517     RRETURN(PCRE_ERROR_RECURSELOOP);
1518 nigel 77
1519     /* Add to "recursing stack" */
1520    
1521 ph10 642 new_recursive.group_num = recno;
1522     new_recursive.subject_position = eptr;
1523 nigel 77 new_recursive.prevrec = md->recursive;
1524     md->recursive = &new_recursive;
1525    
1526 ph10 618 /* Where to continue from afterwards */
1527 nigel 77
1528     ecode += 1 + LINK_SIZE;
1529    
1530 ph10 618 /* Now save the offset data */
1531 nigel 77
1532     new_recursive.saved_max = md->offset_end;
1533     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1534     new_recursive.offset_save = stacksave;
1535     else
1536     {
1537     new_recursive.offset_save =
1538     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1539     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1540     }
1541     memcpy(new_recursive.offset_save, md->offset_vector,
1542     new_recursive.saved_max * sizeof(int));
1543 ph10 625
1544 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1545 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1546 ph10 618 might be changed, so reset it before looping. */
1547 nigel 77
1548     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1549 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1550 nigel 77 do
1551     {
1552 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1553 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1554 ph10 604 md, eptrb, RM6);
1555 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1556     new_recursive.saved_max * sizeof(int));
1557 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1558 nigel 77 {
1559 nigel 87 DPRINTF(("Recursion matched\n"));
1560 nigel 77 md->recursive = new_recursive.prevrec;
1561     if (new_recursive.offset_save != stacksave)
1562     (pcre_free)(new_recursive.offset_save);
1563 ph10 618
1564     /* Set where we got to in the subject, and reset the start in case
1565 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1566     for Perl compatibility. */
1567    
1568 ph10 618 eptr = md->end_match_ptr;
1569     mstart = md->start_match_ptr;
1570     goto RECURSION_MATCHED; /* Exit loop; end processing */
1571 nigel 77 }
1572 ph10 550 else if (rrc != MATCH_NOMATCH &&
1573     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1574 nigel 87 {
1575     DPRINTF(("Recursion gave error %d\n", rrc));
1576 ph10 400 if (new_recursive.offset_save != stacksave)
1577     (pcre_free)(new_recursive.offset_save);
1578 nigel 87 RRETURN(rrc);
1579     }
1580 nigel 77
1581     md->recursive = &new_recursive;
1582     callpat += GET(callpat, 1);
1583     }
1584     while (*callpat == OP_ALT);
1585    
1586     DPRINTF(("Recursion didn't match\n"));
1587     md->recursive = new_recursive.prevrec;
1588     if (new_recursive.offset_save != stacksave)
1589     (pcre_free)(new_recursive.offset_save);
1590 ph10 510 MRRETURN(MATCH_NOMATCH);
1591 nigel 77 }
1592 ph10 625
1593 ph10 618 RECURSION_MATCHED:
1594     break;
1595 nigel 77
1596     /* An alternation is the end of a branch; scan along to find the end of the
1597     bracketed group and go to there. */
1598    
1599     case OP_ALT:
1600     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1601     break;
1602    
1603 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1604     indicating that it may occur zero times. It may repeat infinitely, or not
1605     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1606     with fixed upper repeat limits are compiled as a number of copies, with the
1607     optional ones preceded by BRAZERO or BRAMINZERO. */
1608 ph10 625
1609 nigel 77 case OP_BRAZERO:
1610 ph10 604 next = ecode + 1;
1611     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1612     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613     do next += GET(next, 1); while (*next == OP_ALT);
1614     ecode = next + 1 + LINK_SIZE;
1615 nigel 77 break;
1616 ph10 625
1617 nigel 77 case OP_BRAMINZERO:
1618 ph10 604 next = ecode + 1;
1619     do next += GET(next, 1); while (*next == OP_ALT);
1620     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1621     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1622     ecode++;
1623 nigel 77 break;
1624    
1625 ph10 335 case OP_SKIPZERO:
1626 ph10 604 next = ecode+1;
1627     do next += GET(next,1); while (*next == OP_ALT);
1628     ecode = next + 1 + LINK_SIZE;
1629 ph10 335 break;
1630 ph10 625
1631 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1632     here; just jump to the group, with allow_zero set TRUE. */
1633 ph10 625
1634 ph10 604 case OP_BRAPOSZERO:
1635 ph10 625 op = *(++ecode);
1636 ph10 604 allow_zero = TRUE;
1637     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1638     goto POSSESSIVE_NON_CAPTURE;
1639 ph10 335
1640 nigel 93 /* End of a group, repeated or non-repeating. */
1641 nigel 77
1642     case OP_KET:
1643     case OP_KETRMIN:
1644     case OP_KETRMAX:
1645 ph10 625 case OP_KETRPOS:
1646 nigel 91 prev = ecode - GET(ecode, 1);
1647 ph10 625
1648 nigel 93 /* If this was a group that remembered the subject start, in order to break
1649     infinite repeats of empty string matches, retrieve the subject start from
1650     the chain. Otherwise, set it NULL. */
1651 nigel 77
1652 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1653 nigel 93 {
1654     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1655     eptrb = eptrb->epb_prev; /* Backup to previous group */
1656     }
1657     else saved_eptr = NULL;
1658 nigel 77
1659 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1660     MATCH_MATCH, but record the current high water mark for use by positive
1661     assertions. We also need to record the match start in case it was changed
1662     by \K. */
1663 nigel 93
1664 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1665 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1666 nigel 91 {
1667     md->end_match_ptr = eptr; /* For ONCE */
1668     md->end_offset_top = offset_top;
1669 ph10 500 md->start_match_ptr = mstart;
1670 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1671 nigel 91 }
1672 nigel 77
1673 nigel 93 /* For capturing groups we have to check the group number back at the start
1674     and if necessary complete handling an extraction by setting the offsets and
1675 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1676     into group 0, so it won't be picked up here. Instead, we catch it when the
1677     OP_END is reached. Other recursion is handled here. We just have to record
1678     the current subject position and start match pointer and give a MATCH
1679     return. */
1680 nigel 77
1681 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1682     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1683 nigel 91 {
1684 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1685 nigel 91 offset = number << 1;
1686 ph10 461
1687 ph10 475 #ifdef PCRE_DEBUG
1688 nigel 91 printf("end bracket %d", number);
1689     printf("\n");
1690 nigel 77 #endif
1691    
1692 ph10 618 /* Handle a recursively called group. */
1693    
1694     if (md->recursive != NULL && md->recursive->group_num == number)
1695     {
1696     md->end_match_ptr = eptr;
1697     md->start_match_ptr = mstart;
1698     RRETURN(MATCH_MATCH);
1699     }
1700    
1701     /* Deal with capturing */
1702    
1703 nigel 93 md->capture_last = number;
1704     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1705 nigel 91 {
1706 ph10 625 /* If offset is greater than offset_top, it means that we are
1707     "skipping" a capturing group, and that group's offsets must be marked
1708     unset. In earlier versions of PCRE, all the offsets were unset at the
1709     start of matching, but this doesn't work because atomic groups and
1710 ph10 615 assertions can cause a value to be set that should later be unset.
1711     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1712 ph10 625 part of the atomic group, but this is not on the final matching path,
1713     so must be unset when 2 is set. (If there is no group 2, there is no
1714 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1715 ph10 625
1716 ph10 615 if (offset > offset_top)
1717     {
1718     register int *iptr = md->offset_vector + offset_top;
1719     register int *iend = md->offset_vector + offset;
1720     while (iptr < iend) *iptr++ = -1;
1721 ph10 625 }
1722    
1723 ph10 615 /* Now make the extraction */
1724    
1725 nigel 93 md->offset_vector[offset] =
1726     md->offset_vector[md->offset_end - number];
1727 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1728 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1729     }
1730 nigel 91 }
1731 nigel 77
1732 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1733     also happens for a repeating ket if no characters were matched in the
1734     group. This is the forcible breaking of infinite loops as implemented in
1735 ph10 625 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1736     processing the rest of the pattern at a lower level. If this results in a
1737     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1738     bypassing intermediate backup points, but resetting any captures that
1739 ph10 618 happened along the way. */
1740 nigel 77
1741 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1742     {
1743 ph10 618 if (*prev == OP_ONCE)
1744     {
1745     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1746     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1747     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1748 ph10 625 RRETURN(MATCH_ONCE);
1749     }
1750 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1751 nigel 91 break;
1752     }
1753 ph10 625
1754     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1755 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1756     at a time from the outer level, thus saving stack. */
1757 ph10 625
1758 ph10 604 if (*ecode == OP_KETRPOS)
1759 ph10 625 {
1760 ph10 604 md->end_match_ptr = eptr;
1761 ph10 625 md->end_offset_top = offset_top;
1762 ph10 604 RRETURN(MATCH_KETRPOS);
1763 ph10 625 }
1764 nigel 77
1765 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1766     the preceding bracket, in the appropriate order. In the second case, we can
1767     use tail recursion to avoid using another stack frame, unless we have an
1768 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1769     string. */
1770 nigel 77
1771 nigel 91 if (*ecode == OP_KETRMIN)
1772     {
1773 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1774 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1775 ph10 618 if (*prev == OP_ONCE)
1776     {
1777 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1778 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1779     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1780 ph10 625 RRETURN(MATCH_ONCE);
1781     }
1782 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1783 ph10 197 {
1784 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1785 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1786 ph10 197 RRETURN(rrc);
1787     }
1788 nigel 91 ecode = prev;
1789     goto TAIL_RECURSE;
1790 nigel 77 }
1791 nigel 91 else /* OP_KETRMAX */
1792     {
1793 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1794 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1795 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1796 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1797 ph10 618 if (*prev == OP_ONCE)
1798     {
1799 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1800 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1801     md->once_target = prev;
1802 ph10 625 RRETURN(MATCH_ONCE);
1803     }
1804 nigel 91 ecode += 1 + LINK_SIZE;
1805     goto TAIL_RECURSE;
1806     }
1807     /* Control never gets here */
1808 nigel 77
1809 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1810 nigel 77
1811     case OP_CIRC:
1812 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1813 ph10 625
1814 nigel 77 /* Start of subject assertion */
1815    
1816     case OP_SOD:
1817 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1818 nigel 77 ecode++;
1819     break;
1820 ph10 625
1821 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1822 nigel 77
1823 ph10 602 case OP_CIRCM:
1824     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1825     if (eptr != md->start_subject &&
1826     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1827     MRRETURN(MATCH_NOMATCH);
1828     ecode++;
1829     break;
1830    
1831 nigel 77 /* Start of match assertion */
1832    
1833     case OP_SOM:
1834 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1835 nigel 77 ecode++;
1836     break;
1837 ph10 172
1838 ph10 168 /* Reset the start of match point */
1839 ph10 172
1840 ph10 168 case OP_SET_SOM:
1841     mstart = eptr;
1842 ph10 172 ecode++;
1843     break;
1844 nigel 77
1845 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1846     unless noteol is set. */
1847 nigel 77
1848 ph10 602 case OP_DOLLM:
1849     if (eptr < md->end_subject)
1850     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1851     else
1852 nigel 77 {
1853 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1854 ph10 602 SCHECK_PARTIAL();
1855 nigel 77 }
1856 ph10 602 ecode++;
1857     break;
1858 ph10 579
1859 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
1860 ph10 602 subject unless noteol is set. */
1861    
1862     case OP_DOLL:
1863     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1864     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1865    
1866 nigel 91 /* ... else fall through for endonly */
1867 nigel 77
1868     /* End of subject assertion (\z) */
1869    
1870     case OP_EOD:
1871 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1872 ph10 553 SCHECK_PARTIAL();
1873 nigel 77 ecode++;
1874     break;
1875    
1876     /* End of subject or ending \n assertion (\Z) */
1877    
1878     case OP_EODN:
1879 ph10 553 ASSERT_NL_OR_EOS:
1880     if (eptr < md->end_subject &&
1881 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1882 ph10 510 MRRETURN(MATCH_NOMATCH);
1883 ph10 579
1884 ph10 553 /* Either at end of string or \n before end. */
1885 ph10 579
1886 ph10 553 SCHECK_PARTIAL();
1887 nigel 77 ecode++;
1888     break;
1889    
1890     /* Word boundary assertions */
1891    
1892     case OP_NOT_WORD_BOUNDARY:
1893     case OP_WORD_BOUNDARY:
1894     {
1895    
1896     /* Find out if the previous and current characters are "word" characters.
1897     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1898 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1899 ph10 435 partial matching. */
1900 nigel 77
1901     #ifdef SUPPORT_UTF8
1902     if (utf8)
1903     {
1904 ph10 518 /* Get status of previous character */
1905 ph10 527
1906 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1907     {
1908 ph10 409 USPTR lastptr = eptr - 1;
1909 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1910 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1911 nigel 77 GETCHAR(c, lastptr);
1912 ph10 527 #ifdef SUPPORT_UCP
1913 ph10 518 if (md->use_ucp)
1914     {
1915     if (c == '_') prev_is_word = TRUE; else
1916 ph10 527 {
1917 ph10 518 int cat = UCD_CATEGORY(c);
1918     prev_is_word = (cat == ucp_L || cat == ucp_N);
1919 ph10 527 }
1920     }
1921     else
1922     #endif
1923 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1924     }
1925 ph10 527
1926 ph10 518 /* Get status of next character */
1927 ph10 527
1928 ph10 443 if (eptr >= md->end_subject)
1929 nigel 77 {
1930 ph10 443 SCHECK_PARTIAL();
1931     cur_is_word = FALSE;
1932 ph10 428 }
1933     else
1934     {
1935 nigel 77 GETCHAR(c, eptr);
1936 ph10 527 #ifdef SUPPORT_UCP
1937 ph10 518 if (md->use_ucp)
1938     {
1939     if (c == '_') cur_is_word = TRUE; else
1940 ph10 527 {
1941 ph10 518 int cat = UCD_CATEGORY(c);
1942     cur_is_word = (cat == ucp_L || cat == ucp_N);
1943 ph10 527 }
1944     }
1945     else
1946     #endif
1947 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1948     }
1949     }
1950     else
1951     #endif
1952    
1953 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1954 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1955 nigel 77
1956     {
1957 ph10 518 /* Get status of previous character */
1958 ph10 527
1959 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1960     {
1961 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1962 ph10 527 #ifdef SUPPORT_UCP
1963 ph10 518 if (md->use_ucp)
1964     {
1965 ph10 527 c = eptr[-1];
1966 ph10 518 if (c == '_') prev_is_word = TRUE; else
1967 ph10 527 {
1968 ph10 518 int cat = UCD_CATEGORY(c);
1969     prev_is_word = (cat == ucp_L || cat == ucp_N);
1970 ph10 527 }
1971     }
1972     else
1973     #endif
1974 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1975     }
1976 ph10 527
1977 ph10 518 /* Get status of next character */
1978 ph10 527
1979 ph10 443 if (eptr >= md->end_subject)
1980 ph10 428 {
1981 ph10 443 SCHECK_PARTIAL();
1982     cur_is_word = FALSE;
1983 ph10 428 }
1984 ph10 527 else
1985     #ifdef SUPPORT_UCP
1986 ph10 518 if (md->use_ucp)
1987     {
1988 ph10 527 c = *eptr;
1989 ph10 518 if (c == '_') cur_is_word = TRUE; else
1990 ph10 527 {
1991 ph10 518 int cat = UCD_CATEGORY(c);
1992     cur_is_word = (cat == ucp_L || cat == ucp_N);
1993 ph10 527 }
1994     }
1995     else
1996     #endif
1997 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1998 nigel 77 }
1999    
2000     /* Now see if the situation is what we want */
2001    
2002     if ((*ecode++ == OP_WORD_BOUNDARY)?
2003     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2004 ph10 510 MRRETURN(MATCH_NOMATCH);
2005 nigel 77 }
2006     break;
2007    
2008     /* Match a single character type; inline for speed */
2009    
2010     case OP_ANY:
2011 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2012 ph10 345 /* Fall through */
2013    
2014 ph10 341 case OP_ALLANY:
2015 ph10 443 if (eptr++ >= md->end_subject)
2016 ph10 428 {
2017 ph10 443 SCHECK_PARTIAL();
2018 ph10 510 MRRETURN(MATCH_NOMATCH);
2019 ph10 443 }
2020 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2021 nigel 77 ecode++;
2022     break;
2023    
2024     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2025     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2026    
2027     case OP_ANYBYTE:
2028 ph10 443 if (eptr++ >= md->end_subject)
2029 ph10 428 {
2030 ph10 443 SCHECK_PARTIAL();
2031 ph10 510 MRRETURN(MATCH_NOMATCH);
2032 ph10 443 }
2033 nigel 77 ecode++;
2034     break;
2035    
2036     case OP_NOT_DIGIT:
2037 ph10 443 if (eptr >= md->end_subject)
2038 ph10 428 {
2039 ph10 443 SCHECK_PARTIAL();
2040 ph10 510 MRRETURN(MATCH_NOMATCH);
2041 ph10 443 }
2042 nigel 77 GETCHARINCTEST(c, eptr);
2043     if (
2044     #ifdef SUPPORT_UTF8
2045     c < 256 &&
2046     #endif
2047     (md->ctypes[c] & ctype_digit) != 0
2048     )
2049 ph10 510 MRRETURN(MATCH_NOMATCH);
2050 nigel 77 ecode++;
2051     break;
2052    
2053     case OP_DIGIT:
2054 ph10 443 if (eptr >= md->end_subject)
2055 ph10 428 {
2056 ph10 443 SCHECK_PARTIAL();
2057 ph10 510 MRRETURN(MATCH_NOMATCH);
2058 ph10 443 }
2059 nigel 77 GETCHARINCTEST(c, eptr);
2060     if (
2061     #ifdef SUPPORT_UTF8
2062     c >= 256 ||
2063     #endif
2064     (md->ctypes[c] & ctype_digit) == 0
2065     )
2066 ph10 510 MRRETURN(MATCH_NOMATCH);
2067 nigel 77 ecode++;
2068     break;
2069    
2070     case OP_NOT_WHITESPACE:
2071 ph10 443 if (eptr >= md->end_subject)
2072 ph10 428 {
2073 ph10 443 SCHECK_PARTIAL();
2074 ph10 510 MRRETURN(MATCH_NOMATCH);
2075 ph10 443 }
2076 nigel 77 GETCHARINCTEST(c, eptr);
2077     if (
2078     #ifdef SUPPORT_UTF8
2079     c < 256 &&
2080     #endif
2081     (md->ctypes[c] & ctype_space) != 0
2082     )
2083 ph10 510 MRRETURN(MATCH_NOMATCH);
2084 nigel 77 ecode++;
2085     break;
2086    
2087     case OP_WHITESPACE:
2088 ph10 443 if (eptr >= md->end_subject)
2089 ph10 428 {
2090 ph10 443 SCHECK_PARTIAL();
2091 ph10 510 MRRETURN(MATCH_NOMATCH);
2092 ph10 443 }
2093 nigel 77 GETCHARINCTEST(c, eptr);
2094     if (
2095     #ifdef SUPPORT_UTF8
2096     c >= 256 ||
2097     #endif
2098     (md->ctypes[c] & ctype_space) == 0
2099     )
2100 ph10 510 MRRETURN(MATCH_NOMATCH);
2101 nigel 77 ecode++;
2102     break;
2103    
2104     case OP_NOT_WORDCHAR:
2105 ph10 443 if (eptr >= md->end_subject)
2106 ph10 428 {
2107 ph10 443 SCHECK_PARTIAL();
2108 ph10 510 MRRETURN(MATCH_NOMATCH);
2109 ph10 443 }
2110 nigel 77 GETCHARINCTEST(c, eptr);
2111     if (
2112     #ifdef SUPPORT_UTF8
2113     c < 256 &&
2114     #endif
2115     (md->ctypes[c] & ctype_word) != 0
2116     )
2117 ph10 510 MRRETURN(MATCH_NOMATCH);
2118 nigel 77 ecode++;
2119     break;
2120    
2121     case OP_WORDCHAR:
2122 ph10 443 if (eptr >= md->end_subject)
2123 ph10 428 {
2124 ph10 443 SCHECK_PARTIAL();
2125 ph10 510 MRRETURN(MATCH_NOMATCH);
2126 ph10 443 }
2127 nigel 77 GETCHARINCTEST(c, eptr);
2128     if (
2129     #ifdef SUPPORT_UTF8
2130     c >= 256 ||
2131     #endif
2132     (md->ctypes[c] & ctype_word) == 0
2133     )
2134 ph10 510 MRRETURN(MATCH_NOMATCH);
2135 nigel 77 ecode++;
2136     break;
2137    
2138 nigel 93 case OP_ANYNL:
2139 ph10 443 if (eptr >= md->end_subject)
2140 ph10 428 {
2141 ph10 443 SCHECK_PARTIAL();
2142 ph10 510 MRRETURN(MATCH_NOMATCH);
2143 ph10 443 }
2144 nigel 93 GETCHARINCTEST(c, eptr);
2145     switch(c)
2146     {
2147 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2148 ph10 625
2149 nigel 93 case 0x000d:
2150     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2151     break;
2152 ph10 231
2153 nigel 93 case 0x000a:
2154 ph10 231 break;
2155    
2156 nigel 93 case 0x000b:
2157     case 0x000c:
2158     case 0x0085:
2159     case 0x2028:
2160     case 0x2029:
2161 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2162 nigel 93 break;
2163     }
2164     ecode++;
2165     break;
2166    
2167 ph10 178 case OP_NOT_HSPACE:
2168 ph10 443 if (eptr >= md->end_subject)
2169 ph10 428 {
2170 ph10 443 SCHECK_PARTIAL();
2171 ph10 510 MRRETURN(MATCH_NOMATCH);
2172 ph10 443 }
2173 ph10 178 GETCHARINCTEST(c, eptr);
2174     switch(c)
2175     {
2176     default: break;
2177     case 0x09: /* HT */
2178     case 0x20: /* SPACE */
2179     case 0xa0: /* NBSP */
2180     case 0x1680: /* OGHAM SPACE MARK */
2181     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2182     case 0x2000: /* EN QUAD */
2183     case 0x2001: /* EM QUAD */
2184     case 0x2002: /* EN SPACE */
2185     case 0x2003: /* EM SPACE */
2186     case 0x2004: /* THREE-PER-EM SPACE */
2187     case 0x2005: /* FOUR-PER-EM SPACE */
2188     case 0x2006: /* SIX-PER-EM SPACE */
2189     case 0x2007: /* FIGURE SPACE */
2190     case 0x2008: /* PUNCTUATION SPACE */
2191     case 0x2009: /* THIN SPACE */
2192     case 0x200A: /* HAIR SPACE */
2193     case 0x202f: /* NARROW NO-BREAK SPACE */
2194     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2195     case 0x3000: /* IDEOGRAPHIC SPACE */
2196 ph10 510 MRRETURN(MATCH_NOMATCH);
2197 ph10 178 }
2198     ecode++;
2199     break;
2200    
2201     case OP_HSPACE:
2202 ph10 443 if (eptr >= md->end_subject)
2203 ph10 428 {
2204 ph10 443 SCHECK_PARTIAL();
2205 ph10 510 MRRETURN(MATCH_NOMATCH);
2206 ph10 443 }
2207 ph10 178 GETCHARINCTEST(c, eptr);
2208     switch(c)
2209     {
2210 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2211 ph10 178 case 0x09: /* HT */
2212     case 0x20: /* SPACE */
2213     case 0xa0: /* NBSP */
2214     case 0x1680: /* OGHAM SPACE MARK */
2215     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2216     case 0x2000: /* EN QUAD */
2217     case 0x2001: /* EM QUAD */
2218     case 0x2002: /* EN SPACE */
2219     case 0x2003: /* EM SPACE */
2220     case 0x2004: /* THREE-PER-EM SPACE */
2221     case 0x2005: /* FOUR-PER-EM SPACE */
2222     case 0x2006: /* SIX-PER-EM SPACE */
2223     case 0x2007: /* FIGURE SPACE */
2224     case 0x2008: /* PUNCTUATION SPACE */
2225     case 0x2009: /* THIN SPACE */
2226     case 0x200A: /* HAIR SPACE */
2227     case 0x202f: /* NARROW NO-BREAK SPACE */
2228     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2229     case 0x3000: /* IDEOGRAPHIC SPACE */
2230     break;
2231     }
2232     ecode++;
2233     break;
2234    
2235     case OP_NOT_VSPACE:
2236 ph10 443 if (eptr >= md->end_subject)
2237 ph10 428 {
2238 ph10 443 SCHECK_PARTIAL();
2239 ph10 510 MRRETURN(MATCH_NOMATCH);
2240 ph10 443 }
2241 ph10 178 GETCHARINCTEST(c, eptr);
2242     switch(c)
2243     {
2244     default: break;
2245     case 0x0a: /* LF */
2246     case 0x0b: /* VT */
2247     case 0x0c: /* FF */
2248     case 0x0d: /* CR */
2249     case 0x85: /* NEL */
2250     case 0x2028: /* LINE SEPARATOR */
2251     case 0x2029: /* PARAGRAPH SEPARATOR */
2252 ph10 510 MRRETURN(MATCH_NOMATCH);
2253 ph10 178 }
2254     ecode++;
2255     break;
2256    
2257     case OP_VSPACE:
2258 ph10 443 if (eptr >= md->end_subject)
2259 ph10 428 {
2260 ph10 443 SCHECK_PARTIAL();
2261 ph10 510 MRRETURN(MATCH_NOMATCH);
2262 ph10 443 }
2263 ph10 178 GETCHARINCTEST(c, eptr);
2264     switch(c)
2265     {
2266 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2267 ph10 178 case 0x0a: /* LF */
2268     case 0x0b: /* VT */
2269     case 0x0c: /* FF */
2270     case 0x0d: /* CR */
2271     case 0x85: /* NEL */
2272     case 0x2028: /* LINE SEPARATOR */
2273     case 0x2029: /* PARAGRAPH SEPARATOR */
2274     break;
2275     }
2276     ecode++;
2277     break;
2278    
2279 nigel 77 #ifdef SUPPORT_UCP
2280     /* Check the next character by Unicode property. We will get here only
2281     if the support is in the binary; otherwise a compile-time error occurs. */
2282    
2283     case OP_PROP:
2284     case OP_NOTPROP:
2285 ph10 443 if (eptr >= md->end_subject)
2286 ph10 428 {
2287 ph10 443 SCHECK_PARTIAL();
2288 ph10 510 MRRETURN(MATCH_NOMATCH);
2289 ph10 443 }
2290 nigel 77 GETCHARINCTEST(c, eptr);
2291     {
2292 ph10 384 const ucd_record *prop = GET_UCD(c);
2293 nigel 77
2294 nigel 87 switch(ecode[1])
2295     {
2296     case PT_ANY:
2297 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2298 nigel 87 break;
2299 nigel 77
2300 nigel 87 case PT_LAMP:
2301 ph10 349 if ((prop->chartype == ucp_Lu ||
2302     prop->chartype == ucp_Ll ||
2303     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2304 ph10 510 MRRETURN(MATCH_NOMATCH);
2305 ph10 517 break;
2306 nigel 87
2307     case PT_GC:
2308 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2309 ph10 510 MRRETURN(MATCH_NOMATCH);
2310 nigel 87 break;
2311    
2312     case PT_PC:
2313 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2314 ph10 510 MRRETURN(MATCH_NOMATCH);
2315 nigel 87 break;
2316    
2317     case PT_SC:
2318 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2319 ph10 510 MRRETURN(MATCH_NOMATCH);
2320 nigel 87 break;
2321 ph10 527
2322 ph10 517 /* These are specials */
2323 ph10 527
2324 ph10 517 case PT_ALNUM:
2325     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2326     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2327     MRRETURN(MATCH_NOMATCH);
2328 ph10 527 break;
2329    
2330 ph10 517 case PT_SPACE: /* Perl space */
2331     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2332     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2333     == (op == OP_NOTPROP))
2334     MRRETURN(MATCH_NOMATCH);
2335 ph10 527 break;
2336    
2337 ph10 517 case PT_PXSPACE: /* POSIX space */
2338     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2339 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2340 ph10 517 c == CHAR_FF || c == CHAR_CR)
2341     == (op == OP_NOTPROP))
2342     MRRETURN(MATCH_NOMATCH);
2343 ph10 527 break;
2344 nigel 87
2345 ph10 527 case PT_WORD:
2346 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2347 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2348 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2349     MRRETURN(MATCH_NOMATCH);
2350 ph10 527 break;
2351    
2352 ph10 517 /* This should never occur */
2353    
2354 nigel 87 default:
2355     RRETURN(PCRE_ERROR_INTERNAL);
2356 nigel 77 }
2357 nigel 87
2358     ecode += 3;
2359 nigel 77 }
2360     break;
2361    
2362     /* Match an extended Unicode sequence. We will get here only if the support
2363     is in the binary; otherwise a compile-time error occurs. */
2364    
2365     case OP_EXTUNI:
2366 ph10 443 if (eptr >= md->end_subject)
2367 ph10 428 {
2368 ph10 443 SCHECK_PARTIAL();
2369 ph10 510 MRRETURN(MATCH_NOMATCH);
2370 ph10 443 }
2371 nigel 77 GETCHARINCTEST(c, eptr);
2372 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2373     while (eptr < md->end_subject)
2374 nigel 77 {
2375 ph10 623 int len = 1;
2376     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2377     if (UCD_CATEGORY(c) != ucp_M) break;
2378     eptr += len;
2379 nigel 77 }
2380     ecode++;
2381     break;
2382     #endif
2383    
2384    
2385     /* Match a back reference, possibly repeatedly. Look past the end of the
2386     item to see if there is repeat information following. The code is similar
2387     to that for character classes, but repeated for efficiency. Then obey
2388     similar code to character type repeats - written out again for speed.
2389     However, if the referenced string is the empty string, always treat
2390     it as matched, any number of times (otherwise there could be infinite
2391     loops). */
2392    
2393     case OP_REF:
2394 ph10 625 case OP_REFI:
2395     caseless = op == OP_REFI;
2396 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2397     ecode += 3;
2398 ph10 345
2399 ph10 595 /* If the reference is unset, there are two possibilities:
2400 ph10 345
2401 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2402     this ensures that every attempt at a match fails. We can't just fail
2403     here, because of the possibility of quantifiers with zero minima.
2404 ph10 345
2405 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2406     so that the back reference matches an empty string.
2407 ph10 345
2408 ph10 595 Otherwise, set the length to the length of what was matched by the
2409     referenced subpattern. */
2410 ph10 345
2411 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2412     length = (md->jscript_compat)? 0 : -1;
2413     else
2414     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2415 nigel 77
2416 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2417 nigel 77
2418 ph10 595 switch (*ecode)
2419     {
2420     case OP_CRSTAR:
2421     case OP_CRMINSTAR:
2422     case OP_CRPLUS:
2423     case OP_CRMINPLUS:
2424     case OP_CRQUERY:
2425     case OP_CRMINQUERY:
2426     c = *ecode++ - OP_CRSTAR;
2427     minimize = (c & 1) != 0;
2428     min = rep_min[c]; /* Pick up values from tables; */
2429     max = rep_max[c]; /* zero for max => infinity */
2430     if (max == 0) max = INT_MAX;
2431     break;
2432 nigel 77
2433 ph10 595 case OP_CRRANGE:
2434     case OP_CRMINRANGE:
2435     minimize = (*ecode == OP_CRMINRANGE);
2436     min = GET2(ecode, 1);
2437     max = GET2(ecode, 3);
2438     if (max == 0) max = INT_MAX;
2439     ecode += 5;
2440     break;
2441 nigel 77
2442 ph10 595 default: /* No repeat follows */
2443 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2444 ph10 595 {
2445     CHECK_PARTIAL();
2446     MRRETURN(MATCH_NOMATCH);
2447 nigel 77 }
2448 ph10 595 eptr += length;
2449     continue; /* With the main loop */
2450     }
2451 nigel 77
2452 ph10 595 /* Handle repeated back references. If the length of the reference is
2453     zero, just continue with the main loop. */
2454 ph10 443
2455 ph10 595 if (length == 0) continue;
2456 nigel 77
2457 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2458     the length of the reference string explicitly rather than passing the
2459     address of eptr, so that eptr can be a register variable. */
2460 nigel 77
2461 ph10 595 for (i = 1; i <= min; i++)
2462     {
2463 ph10 625 int slength;
2464 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2465 nigel 77 {
2466 ph10 595 CHECK_PARTIAL();
2467     MRRETURN(MATCH_NOMATCH);
2468 nigel 77 }
2469 ph10 595 eptr += slength;
2470     }
2471 nigel 77
2472 ph10 595 /* If min = max, continue at the same level without recursion.
2473     They are not both allowed to be zero. */
2474 nigel 77
2475 ph10 595 if (min == max) continue;
2476 nigel 77
2477 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2478 nigel 77
2479 ph10 595 if (minimize)
2480     {
2481     for (fi = min;; fi++)
2482 nigel 77 {
2483 ph10 625 int slength;
2484 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2485 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2486     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2487 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2488 nigel 77 {
2489 ph10 595 CHECK_PARTIAL();
2490     MRRETURN(MATCH_NOMATCH);
2491 nigel 77 }
2492 ph10 595 eptr += slength;
2493 nigel 77 }
2494 ph10 595 /* Control never gets here */
2495     }
2496 nigel 77
2497 ph10 595 /* If maximizing, find the longest string and work backwards */
2498 nigel 77
2499 ph10 595 else
2500     {
2501     pp = eptr;
2502     for (i = min; i < max; i++)
2503 nigel 77 {
2504 ph10 625 int slength;
2505 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2506 nigel 77 {
2507 ph10 595 CHECK_PARTIAL();
2508     break;
2509 nigel 77 }
2510 ph10 595 eptr += slength;
2511 nigel 77 }
2512 ph10 595 while (eptr >= pp)
2513     {
2514 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2515 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2516     eptr -= length;
2517     }
2518     MRRETURN(MATCH_NOMATCH);
2519 nigel 77 }
2520     /* Control never gets here */
2521    
2522     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2523     used when all the characters in the class have values in the range 0-255,
2524     and either the matching is caseful, or the characters are in the range
2525     0-127 when UTF-8 processing is enabled. The only difference between
2526     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2527     encountered.
2528    
2529     First, look past the end of the item to see if there is repeat information
2530     following. Then obey similar code to character type repeats - written out
2531     again for speed. */
2532    
2533     case OP_NCLASS:
2534     case OP_CLASS:
2535     {
2536     data = ecode + 1; /* Save for matching */
2537     ecode += 33; /* Advance past the item */
2538    
2539     switch (*ecode)
2540     {
2541     case OP_CRSTAR:
2542     case OP_CRMINSTAR:
2543     case OP_CRPLUS:
2544     case OP_CRMINPLUS:
2545     case OP_CRQUERY:
2546     case OP_CRMINQUERY:
2547     c = *ecode++ - OP_CRSTAR;
2548     minimize = (c & 1) != 0;
2549     min = rep_min[c]; /* Pick up values from tables; */
2550     max = rep_max[c]; /* zero for max => infinity */
2551     if (max == 0) max = INT_MAX;
2552     break;
2553    
2554     case OP_CRRANGE:
2555     case OP_CRMINRANGE:
2556     minimize = (*ecode == OP_CRMINRANGE);
2557     min = GET2(ecode, 1);
2558     max = GET2(ecode, 3);
2559     if (max == 0) max = INT_MAX;
2560     ecode += 5;
2561     break;
2562    
2563     default: /* No repeat follows */
2564     min = max = 1;
2565     break;
2566     }
2567    
2568     /* First, ensure the minimum number of matches are present. */
2569    
2570     #ifdef SUPPORT_UTF8
2571     /* UTF-8 mode */
2572     if (utf8)
2573     {
2574     for (i = 1; i <= min; i++)
2575     {
2576 ph10 427 if (eptr >= md->end_subject)
2577 ph10 426 {
2578 ph10 428 SCHECK_PARTIAL();
2579 ph10 510 MRRETURN(MATCH_NOMATCH);
2580 ph10 427 }
2581 nigel 77 GETCHARINC(c, eptr);
2582     if (c > 255)
2583     {
2584 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2585 nigel 77 }
2586     else
2587     {
2588 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2589 nigel 77 }
2590     }
2591     }
2592     else
2593     #endif
2594     /* Not UTF-8 mode */
2595     {
2596     for (i = 1; i <= min; i++)
2597     {
2598 ph10 427 if (eptr >= md->end_subject)
2599 ph10 426 {
2600 ph10 428 SCHECK_PARTIAL();
2601 ph10 510 MRRETURN(MATCH_NOMATCH);
2602 ph10 427 }
2603 nigel 77 c = *eptr++;
2604 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2605 nigel 77 }
2606     }
2607    
2608     /* If max == min we can continue with the main loop without the
2609     need to recurse. */
2610    
2611     if (min == max) continue;
2612    
2613     /* If minimizing, keep testing the rest of the expression and advancing
2614     the pointer while it matches the class. */
2615    
2616     if (minimize)
2617     {
2618     #ifdef SUPPORT_UTF8
2619     /* UTF-8 mode */
2620     if (utf8)
2621     {
2622     for (fi = min;; fi++)
2623     {
2624 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2625 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2626 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2627 ph10 427 if (eptr >= md->end_subject)
2628 ph10 426 {
2629 ph10 427 SCHECK_PARTIAL();
2630 ph10 510 MRRETURN(MATCH_NOMATCH);
2631 ph10 427 }
2632 nigel 77 GETCHARINC(c, eptr);
2633     if (c > 255)
2634     {
2635 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2636 nigel 77 }
2637     else
2638     {
2639 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2640 nigel 77 }
2641     }
2642     }
2643     else
2644     #endif
2645     /* Not UTF-8 mode */
2646     {
2647     for (fi = min;; fi++)
2648     {
2649 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2650 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2651 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2652 ph10 427 if (eptr >= md->end_subject)
2653 ph10 426 {
2654 ph10 427 SCHECK_PARTIAL();
2655 ph10 510 MRRETURN(MATCH_NOMATCH);
2656 ph10 427 }
2657 nigel 77 c = *eptr++;
2658 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2659 nigel 77 }
2660     }
2661     /* Control never gets here */
2662     }
2663    
2664     /* If maximizing, find the longest possible run, then work backwards. */
2665    
2666     else
2667     {
2668     pp = eptr;
2669    
2670     #ifdef SUPPORT_UTF8
2671     /* UTF-8 mode */
2672     if (utf8)
2673     {
2674     for (i = min; i < max; i++)
2675     {
2676     int len = 1;
2677 ph10 463 if (eptr >= md->end_subject)
2678 ph10 462 {
2679 ph10 463 SCHECK_PARTIAL();
2680 ph10 462 break;
2681 ph10 463 }
2682 nigel 77 GETCHARLEN(c, eptr, len);
2683     if (c > 255)
2684     {
2685     if (op == OP_CLASS) break;
2686     }
2687     else
2688     {
2689     if ((data[c/8] & (1 << (c&7))) == 0) break;
2690     }
2691     eptr += len;
2692     }
2693     for (;;)
2694     {
2695 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2696 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2697     if (eptr-- == pp) break; /* Stop if tried at original pos */
2698     BACKCHAR(eptr);
2699     }
2700     }
2701     else
2702     #endif
2703     /* Not UTF-8 mode */
2704     {
2705     for (i = min; i < max; i++)
2706     {
2707 ph10 463 if (eptr >= md->end_subject)
2708 ph10 462 {
2709 ph10 463 SCHECK_PARTIAL();
2710 ph10 462 break;
2711 ph10 463 }
2712 nigel 77 c = *eptr;
2713     if ((data[c/8] & (1 << (c&7))) == 0) break;
2714     eptr++;
2715     }
2716     while (eptr >= pp)
2717     {
2718 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2719 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2720 nigel 77 eptr--;
2721     }
2722     }
2723    
2724 ph10 510 MRRETURN(MATCH_NOMATCH);
2725 nigel 77 }
2726     }
2727     /* Control never gets here */
2728    
2729    
2730     /* Match an extended character class. This opcode is encountered only
2731 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2732     mode, because Unicode properties are supported in non-UTF-8 mode. */
2733 nigel 77
2734     #ifdef SUPPORT_UTF8
2735     case OP_XCLASS:
2736     {
2737     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2738     ecode += GET(ecode, 1); /* Advance past the item */
2739    
2740     switch (*ecode)
2741     {
2742     case OP_CRSTAR:
2743     case OP_CRMINSTAR:
2744     case OP_CRPLUS:
2745     case OP_CRMINPLUS:
2746     case OP_CRQUERY:
2747     case OP_CRMINQUERY:
2748     c = *ecode++ - OP_CRSTAR;
2749     minimize = (c & 1) != 0;
2750     min = rep_min[c]; /* Pick up values from tables; */
2751     max = rep_max[c]; /* zero for max => infinity */
2752     if (max == 0) max = INT_MAX;
2753     break;
2754    
2755     case OP_CRRANGE:
2756     case OP_CRMINRANGE:
2757     minimize = (*ecode == OP_CRMINRANGE);
2758     min = GET2(ecode, 1);
2759     max = GET2(ecode, 3);
2760     if (max == 0) max = INT_MAX;
2761     ecode += 5;
2762     break;
2763    
2764     default: /* No repeat follows */
2765     min = max = 1;
2766     break;
2767     }
2768    
2769     /* First, ensure the minimum number of matches are present. */
2770    
2771     for (i = 1; i <= min; i++)
2772     {
2773 ph10 427 if (eptr >= md->end_subject)
2774 ph10 426 {
2775     SCHECK_PARTIAL();
2776 ph10 510 MRRETURN(MATCH_NOMATCH);
2777 ph10 427 }
2778 ph10 384 GETCHARINCTEST(c, eptr);
2779 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2780 nigel 77 }
2781    
2782     /* If max == min we can continue with the main loop without the
2783     need to recurse. */
2784    
2785     if (min == max) continue;
2786    
2787     /* If minimizing, keep testing the rest of the expression and advancing
2788     the pointer while it matches the class. */
2789    
2790     if (minimize)
2791     {
2792     for (fi = min;; fi++)
2793     {
2794 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2795 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2796 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2797 ph10 427 if (eptr >= md->end_subject)
2798 ph10 426 {
2799 ph10 427 SCHECK_PARTIAL();
2800 ph10 510 MRRETURN(MATCH_NOMATCH);
2801 ph10 427 }
2802 ph10 384 GETCHARINCTEST(c, eptr);
2803 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2804 nigel 77 }
2805     /* Control never gets here */
2806     }
2807    
2808     /* If maximizing, find the longest possible run, then work backwards. */
2809    
2810     else
2811     {
2812     pp = eptr;
2813     for (i = min; i < max; i++)
2814     {
2815     int len = 1;
2816 ph10 463 if (eptr >= md->end_subject)
2817 ph10 462 {
2818 ph10 463 SCHECK_PARTIAL();
2819 ph10 462 break;
2820 ph10 463 }
2821 ph10 384 GETCHARLENTEST(c, eptr, len);
2822 nigel 77 if (!_pcre_xclass(c, data)) break;
2823     eptr += len;
2824     }
2825     for(;;)
2826     {
2827 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2828 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2829     if (eptr-- == pp) break; /* Stop if tried at original pos */
2830 ph10 214 if (utf8) BACKCHAR(eptr);
2831 nigel 77 }
2832 ph10 510 MRRETURN(MATCH_NOMATCH);
2833 nigel 77 }
2834    
2835     /* Control never gets here */
2836     }
2837     #endif /* End of XCLASS */
2838    
2839     /* Match a single character, casefully */
2840    
2841     case OP_CHAR:
2842     #ifdef SUPPORT_UTF8
2843     if (utf8)
2844     {
2845     length = 1;
2846     ecode++;
2847     GETCHARLEN(fc, ecode, length);
2848 ph10 443 if (length > md->end_subject - eptr)
2849 ph10 428 {
2850     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2851 ph10 510 MRRETURN(MATCH_NOMATCH);
2852 ph10 443 }
2853 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2854 nigel 77 }
2855     else
2856     #endif
2857    
2858     /* Non-UTF-8 mode */
2859     {
2860 ph10 443 if (md->end_subject - eptr < 1)
2861 ph10 428 {
2862     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2863 ph10 510 MRRETURN(MATCH_NOMATCH);
2864 ph10 443 }
2865 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2866 nigel 77 ecode += 2;
2867     }
2868     break;
2869    
2870     /* Match a single character, caselessly */
2871    
2872 ph10 602 case OP_CHARI:
2873 nigel 77 #ifdef SUPPORT_UTF8
2874     if (utf8)
2875     {
2876     length = 1;
2877     ecode++;
2878     GETCHARLEN(fc, ecode, length);
2879    
2880 ph10 443 if (length > md->end_subject - eptr)
2881 ph10 428 {
2882     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2883 ph10 510 MRRETURN(MATCH_NOMATCH);
2884 ph10 443 }
2885 nigel 77
2886     /* If the pattern character's value is < 128, we have only one byte, and
2887     can use the fast lookup table. */
2888    
2889     if (fc < 128)
2890     {
2891 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2892 nigel 77 }
2893    
2894     /* Otherwise we must pick up the subject character */
2895    
2896     else
2897     {
2898 nigel 93 unsigned int dc;
2899 nigel 77 GETCHARINC(dc, eptr);
2900     ecode += length;
2901    
2902     /* If we have Unicode property support, we can use it to test the other
2903 nigel 87 case of the character, if there is one. */
2904 nigel 77
2905     if (fc != dc)
2906     {
2907     #ifdef SUPPORT_UCP
2908 ph10 349 if (dc != UCD_OTHERCASE(fc))
2909 nigel 77 #endif
2910 ph10 510 MRRETURN(MATCH_NOMATCH);
2911 nigel 77 }
2912     }
2913     }
2914     else
2915     #endif /* SUPPORT_UTF8 */
2916    
2917     /* Non-UTF-8 mode */
2918     {
2919 ph10 443 if (md->end_subject - eptr < 1)
2920 ph10 428 {
2921 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2922 ph10 510 MRRETURN(MATCH_NOMATCH);
2923 ph10 443 }
2924 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2925 nigel 77 ecode += 2;
2926     }
2927     break;
2928    
2929 nigel 93 /* Match a single character repeatedly. */
2930 nigel 77
2931     case OP_EXACT:
2932 ph10 602 case OP_EXACTI:
2933 nigel 77 min = max = GET2(ecode, 1);
2934     ecode += 3;
2935     goto REPEATCHAR;
2936    
2937 nigel 93 case OP_POSUPTO:
2938 ph10 602 case OP_POSUPTOI:
2939 nigel 93 possessive = TRUE;
2940     /* Fall through */
2941    
2942 nigel 77 case OP_UPTO:
2943 ph10 602 case OP_UPTOI:
2944 nigel 77 case OP_MINUPTO:
2945 ph10 602 case OP_MINUPTOI:
2946 nigel 77 min = 0;
2947     max = GET2(ecode, 1);
2948 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2949 nigel 77 ecode += 3;
2950     goto REPEATCHAR;
2951    
2952 nigel 93 case OP_POSSTAR:
2953 ph10 602 case OP_POSSTARI:
2954 nigel 93 possessive = TRUE;
2955     min = 0;
2956     max = INT_MAX;
2957     ecode++;
2958     goto REPEATCHAR;
2959    
2960     case OP_POSPLUS:
2961 ph10 602 case OP_POSPLUSI:
2962 nigel 93 possessive = TRUE;
2963     min = 1;
2964     max = INT_MAX;
2965     ecode++;
2966     goto REPEATCHAR;
2967    
2968     case OP_POSQUERY:
2969 ph10 602 case OP_POSQUERYI:
2970 nigel 93 possessive = TRUE;
2971     min = 0;
2972     max = 1;
2973     ecode++;
2974     goto REPEATCHAR;
2975    
2976 nigel 77 case OP_STAR:
2977 ph10 602 case OP_STARI:
2978 nigel 77 case OP_MINSTAR:
2979 ph10 602 case OP_MINSTARI:
2980 nigel 77 case OP_PLUS:
2981 ph10 602 case OP_PLUSI:
2982 nigel 77 case OP_MINPLUS:
2983 ph10 602 case OP_MINPLUSI:
2984 nigel 77 case OP_QUERY:
2985 ph10 602 case OP_QUERYI:
2986 nigel 77 case OP_MINQUERY:
2987 ph10 602 case OP_MINQUERYI:
2988     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2989 nigel 77 minimize = (c & 1) != 0;
2990     min = rep_min[c]; /* Pick up values from tables; */
2991     max = rep_max[c]; /* zero for max => infinity */
2992     if (max == 0) max = INT_MAX;
2993    
2994 ph10 426 /* Common code for all repeated single-character matches. */
2995 nigel 77
2996     REPEATCHAR:
2997     #ifdef SUPPORT_UTF8
2998     if (utf8)
2999     {
3000     length = 1;
3001     charptr = ecode;
3002     GETCHARLEN(fc, ecode, length);
3003     ecode += length;
3004    
3005     /* Handle multibyte character matching specially here. There is
3006     support for caseless matching if UCP support is present. */
3007    
3008     if (length > 1)
3009     {
3010     #ifdef SUPPORT_UCP
3011 nigel 93 unsigned int othercase;
3012 ph10 602 if (op >= OP_STARI && /* Caseless */
3013 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3014 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3015 ph10 115 else oclength = 0;
3016 nigel 77 #endif /* SUPPORT_UCP */
3017    
3018     for (i = 1; i <= min; i++)
3019     {
3020 ph10 426 if (eptr <= md->end_subject - length &&
3021     memcmp(eptr, charptr, length) == 0) eptr += length;
3022 ph10 123 #ifdef SUPPORT_UCP
3023 ph10 426 else if (oclength > 0 &&
3024     eptr <= md->end_subject - oclength &&
3025     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3026     #endif /* SUPPORT_UCP */
3027 nigel 77 else
3028     {
3029 ph10 426 CHECK_PARTIAL();
3030 ph10 510 MRRETURN(MATCH_NOMATCH);
3031 nigel 77 }
3032     }
3033    
3034     if (min == max) continue;
3035    
3036     if (minimize)
3037     {
3038     for (fi = min;; fi++)
3039     {
3040 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3041 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3042 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3043 ph10 426 if (eptr <= md->end_subject - length &&
3044     memcmp(eptr, charptr, length) == 0) eptr += length;
3045 ph10 123 #ifdef SUPPORT_UCP
3046 ph10 426 else if (oclength > 0 &&
3047     eptr <= md->end_subject - oclength &&
3048     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3049     #endif /* SUPPORT_UCP */
3050 nigel 77 else
3051     {
3052 ph10 426 CHECK_PARTIAL();
3053 ph10 510 MRRETURN(MATCH_NOMATCH);
3054 nigel 77 }
3055     }
3056     /* Control never gets here */
3057     }
3058 nigel 93
3059     else /* Maximize */
3060 nigel 77 {
3061     pp = eptr;
3062     for (i = min; i < max; i++)
3063     {
3064 ph10 426 if (eptr <= md->end_subject - length &&
3065     memcmp(eptr, charptr, length) == 0) eptr += length;
3066 ph10 123 #ifdef SUPPORT_UCP
3067 ph10 426 else if (oclength > 0 &&
3068     eptr <= md->end_subject - oclength &&
3069     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3070     #endif /* SUPPORT_UCP */
3071 ph10 463 else
3072 ph10 462 {
3073 ph10 463 CHECK_PARTIAL();
3074 ph10 462 break;
3075 ph10 463 }
3076 nigel 77 }
3077 nigel 93
3078     if (possessive) continue;
3079 ph10 427
3080 ph10 120 for(;;)
3081 ph10 426 {
3082 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3083 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3084 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3085 ph10 115 #ifdef SUPPORT_UCP
3086 ph10 426 eptr--;
3087     BACKCHAR(eptr);
3088 ph10 123 #else /* without SUPPORT_UCP */
3089 ph10 426 eptr -= length;
3090 ph10 123 #endif /* SUPPORT_UCP */
3091 ph10 426 }
3092 nigel 77 }
3093     /* Control never gets here */
3094     }
3095    
3096     /* If the length of a UTF-8 character is 1, we fall through here, and
3097     obey the code as for non-UTF-8 characters below, though in this case the
3098     value of fc will always be < 128. */
3099     }
3100     else
3101     #endif /* SUPPORT_UTF8 */
3102    
3103     /* When not in UTF-8 mode, load a single-byte character. */
3104    
3105 ph10 426 fc = *ecode++;
3106 ph10 443
3107 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3108     may not be in UTF-8 mode. The code is duplicated for the caseless and
3109     caseful cases, for speed, since matching characters is likely to be quite
3110     common. First, ensure the minimum number of matches are present. If min =
3111     max, continue at the same level without recursing. Otherwise, if
3112     minimizing, keep trying the rest of the expression and advancing one
3113     matching character if failing, up to the maximum. Alternatively, if
3114     maximizing, find the maximum number of characters and work backwards. */
3115    
3116     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3117     max, eptr));
3118    
3119 ph10 602 if (op >= OP_STARI) /* Caseless */
3120 nigel 77 {
3121     fc = md->lcc[fc];
3122     for (i = 1; i <= min; i++)
3123 ph10 426 {
3124     if (eptr >= md->end_subject)
3125     {
3126     SCHECK_PARTIAL();
3127 ph10 510 MRRETURN(MATCH_NOMATCH);
3128 ph10 426 }
3129 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3130 ph10 426 }
3131 nigel 77 if (min == max) continue;
3132     if (minimize)
3133     {
3134     for (fi = min;; fi++)
3135     {
3136 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3137 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3138 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3139 ph10 426 if (eptr >= md->end_subject)
3140     {
3141 ph10 427 SCHECK_PARTIAL();
3142 ph10 510 MRRETURN(MATCH_NOMATCH);
3143 ph10 426 }
3144 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3145 nigel 77 }
3146     /* Control never gets here */
3147     }
3148 nigel 93 else /* Maximize */
3149 nigel 77 {
3150     pp = eptr;
3151     for (i = min; i < max; i++)
3152     {
3153 ph10 463 if (eptr >= md->end_subject)
3154 ph10 462 {
3155     SCHECK_PARTIAL();
3156     break;
3157 ph10 463 }
3158 ph10 462 if (fc != md->lcc[*eptr]) break;
3159 nigel 77 eptr++;
3160     }
3161 ph10 427
3162 nigel 93 if (possessive) continue;
3163 ph10 427
3164 nigel 77 while (eptr >= pp)
3165     {
3166 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3167 nigel 77 eptr--;
3168     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3169     }
3170 ph10 510 MRRETURN(MATCH_NOMATCH);
3171 nigel 77 }
3172     /* Control never gets here */
3173     }
3174    
3175     /* Caseful comparisons (includes all multi-byte characters) */
3176    
3177     else
3178     {
3179 ph10 427 for (i = 1; i <= min; i++)
3180 ph10 426 {
3181     if (eptr >= md->end_subject)
3182     {
3183     SCHECK_PARTIAL();
3184 ph10 510 MRRETURN(MATCH_NOMATCH);
3185 ph10 426 }
3186 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3187 ph10 427 }
3188 ph10 443
3189 nigel 77 if (min == max) continue;
3190 ph10 443
3191 nigel 77 if (minimize)
3192     {
3193     for (fi = min;; fi++)
3194     {
3195 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3196 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3197 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3198 ph10 426 if (eptr >= md->end_subject)
3199 ph10 427 {
3200 ph10 426 SCHECK_PARTIAL();
3201 ph10 510 MRRETURN(MATCH_NOMATCH);
3202 ph10 427 }
3203 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3204 nigel 77 }
3205     /* Control never gets here */
3206     }
3207 nigel 93 else /* Maximize */
3208 nigel 77 {
3209     pp = eptr;
3210     for (i = min; i < max; i++)
3211     {
3212 ph10 463 if (eptr >= md->end_subject)
3213 ph10 462 {
3214 ph10 463 SCHECK_PARTIAL();
3215 ph10 462 break;
3216 ph10 463 }
3217 ph10 462 if (fc != *eptr) break;
3218 nigel 77 eptr++;
3219     }
3220 nigel 93 if (possessive) continue;
3221 ph10 443
3222 nigel 77 while (eptr >= pp)
3223     {
3224 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3225 nigel 77 eptr--;
3226     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3227     }
3228 ph10 510 MRRETURN(MATCH_NOMATCH);
3229 nigel 77 }
3230     }
3231     /* Control never gets here */
3232    
3233     /* Match a negated single one-byte character. The character we are
3234     checking can be multibyte. */
3235    
3236     case OP_NOT:
3237 ph10 625 case OP_NOTI:
3238 ph10 443 if (eptr >= md->end_subject)
3239 ph10 428 {
3240 ph10 443 SCHECK_PARTIAL();
3241 ph10 510 MRRETURN(MATCH_NOMATCH);
3242 ph10 443 }
3243 nigel 77 ecode++;
3244     GETCHARINCTEST(c, eptr);
3245 ph10 602 if (op == OP_NOTI) /* The caseless case */
3246 nigel 77 {
3247     #ifdef SUPPORT_UTF8
3248     if (c < 256)
3249     #endif
3250     c = md->lcc[c];
3251 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3252 nigel 77 }
3253 ph10 602 else /* Caseful */
3254 nigel 77 {
3255 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3256 nigel 77 }
3257     break;
3258    
3259     /* Match a negated single one-byte character repeatedly. This is almost a
3260     repeat of the code for a repeated single character, but I haven't found a
3261     nice way of commoning these up that doesn't require a test of the
3262     positive/negative option for each character match. Maybe that wouldn't add
3263     very much to the time taken, but character matching *is* what this is all
3264     about... */
3265    
3266     case OP_NOTEXACT:
3267 ph10 602 case OP_NOTEXACTI:
3268 nigel 77 min = max = GET2(ecode, 1);
3269     ecode += 3;
3270     goto REPEATNOTCHAR;
3271    
3272     case OP_NOTUPTO:
3273 ph10 602 case OP_NOTUPTOI:
3274 nigel 77 case OP_NOTMINUPTO:
3275 ph10 602 case OP_NOTMINUPTOI:
3276 nigel 77 min = 0;
3277     max = GET2(ecode, 1);
3278 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3279 nigel 77 ecode += 3;
3280     goto REPEATNOTCHAR;
3281    
3282 nigel 93 case OP_NOTPOSSTAR:
3283 ph10 602 case OP_NOTPOSSTARI:
3284 nigel 93 possessive = TRUE;
3285     min = 0;
3286     max = INT_MAX;
3287     ecode++;
3288     goto REPEATNOTCHAR;
3289    
3290     case OP_NOTPOSPLUS:
3291 ph10 602 case OP_NOTPOSPLUSI:
3292 nigel 93 possessive = TRUE;
3293     min = 1;
3294     max = INT_MAX;
3295     ecode++;
3296     goto REPEATNOTCHAR;
3297    
3298     case OP_NOTPOSQUERY:
3299 ph10 602 case OP_NOTPOSQUERYI:
3300 nigel 93 possessive = TRUE;
3301     min = 0;
3302     max = 1;
3303     ecode++;
3304     goto REPEATNOTCHAR;
3305    
3306     case OP_NOTPOSUPTO:
3307 ph10 602 case OP_NOTPOSUPTOI:
3308 nigel 93 possessive = TRUE;
3309     min = 0;
3310     max = GET2(ecode, 1);
3311     ecode += 3;
3312     goto REPEATNOTCHAR;
3313    
3314 nigel 77 case OP_NOTSTAR:
3315 ph10 602 case OP_NOTSTARI:
3316 nigel 77 case OP_NOTMINSTAR:
3317 ph10 602 case OP_NOTMINSTARI:
3318 nigel 77 case OP_NOTPLUS:
3319 ph10 602 case OP_NOTPLUSI:
3320 nigel 77 case OP_NOTMINPLUS:
3321 ph10 602 case OP_NOTMINPLUSI:
3322 nigel 77 case OP_NOTQUERY:
3323 ph10 602 case OP_NOTQUERYI:
3324 nigel 77 case OP_NOTMINQUERY:
3325 ph10 602 case OP_NOTMINQUERYI:
3326     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3327 nigel 77 minimize = (c & 1) != 0;
3328     min = rep_min[c]; /* Pick up values from tables; */
3329     max = rep_max[c]; /* zero for max => infinity */
3330     if (max == 0) max = INT_MAX;
3331    
3332 ph10 426 /* Common code for all repeated single-byte matches. */
3333 nigel 77
3334     REPEATNOTCHAR:
3335     fc = *ecode++;
3336    
3337     /* The code is duplicated for the caseless and caseful cases, for speed,
3338     since matching characters is likely to be quite common. First, ensure the
3339     minimum number of matches are present. If min = max, continue at the same
3340     level without recursing. Otherwise, if minimizing, keep trying the rest of
3341     the expression and advancing one matching character if failing, up to the
3342     maximum. Alternatively, if maximizing, find the maximum number of
3343     characters and work backwards. */
3344    
3345     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3346     max, eptr));
3347    
3348 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3349 nigel 77 {
3350     fc = md->lcc[fc];
3351    
3352     #ifdef SUPPORT_UTF8
3353     /* UTF-8 mode */
3354     if (utf8)
3355     {
3356 nigel 93 register unsigned int d;
3357 nigel 77 for (i = 1; i <= min; i++)
3358     {
3359 ph10 426 if (eptr >= md->end_subject)
3360     {
3361     SCHECK_PARTIAL();
3362 ph10 510 MRRETURN(MATCH_NOMATCH);
3363 ph10 427 }
3364 nigel 77 GETCHARINC(d, eptr);
3365     if (d < 256) d = md->lcc[d];
3366 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3367 nigel 77 }
3368     }
3369     else
3370     #endif
3371    
3372     /* Not UTF-8 mode */
3373     {
3374     for (i = 1; i <= min; i++)
3375 ph10 426 {
3376     if (eptr >= md->end_subject)
3377     {
3378     SCHECK_PARTIAL();
3379 ph10 510 MRRETURN(MATCH_NOMATCH);
3380 ph10 427 }
3381 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3382 ph10 427 }
3383 nigel 77 }
3384    
3385     if (min == max) continue;
3386    
3387     if (minimize)
3388     {
3389     #ifdef SUPPORT_UTF8
3390     /* UTF-8 mode */
3391     if (utf8)
3392     {
3393 nigel 93 register unsigned int d;
3394 nigel 77 for (fi = min;; fi++)
3395     {
3396 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3397 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3398 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3399 ph10 427 if (eptr >= md->end_subject)
3400 ph10 426 {
3401 ph10 427 SCHECK_PARTIAL();
3402 ph10 510 MRRETURN(MATCH_NOMATCH);
3403 ph10 427 }
3404 nigel 77 GETCHARINC(d, eptr);
3405     if (d < 256) d = md->lcc[d];
3406 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3407 nigel 77 }
3408     }
3409     else
3410     #endif
3411     /* Not UTF-8 mode */
3412     {
3413     for (fi = min;; fi++)
3414     {
3415 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3416 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3417 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3418 ph10 426 if (eptr >= md->end_subject)
3419     {
3420     SCHECK_PARTIAL();
3421 ph10 510 MRRETURN(MATCH_NOMATCH);
3422 ph10 426 }
3423 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3424 nigel 77 }
3425     }
3426     /* Control never gets here */
3427     }
3428    
3429     /* Maximize case */
3430    
3431     else
3432     {
3433     pp = eptr;
3434    
3435     #ifdef SUPPORT_UTF8
3436     /* UTF-8 mode */
3437     if (utf8)
3438     {
3439 nigel 93 register unsigned int d;
3440 nigel 77 for (i = min; i < max; i++)
3441     {
3442     int len = 1;
3443 ph10 463 if (eptr >= md->end_subject)
3444 ph10 462 {
3445 ph10 463 SCHECK_PARTIAL();
3446 ph10 462 break;
3447 ph10 463 }
3448 nigel 77 GETCHARLEN(d, eptr, len);
3449     if (d < 256) d = md->lcc[d];
3450     if (fc == d) break;
3451     eptr += len;
3452     }
3453 nigel 93 if (possessive) continue;
3454     for(;;)
3455 nigel 77 {
3456 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3457 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3458     if (eptr-- == pp) break; /* Stop if tried at original pos */
3459     BACKCHAR(eptr);
3460     }
3461     }
3462     else
3463     #endif
3464     /* Not UTF-8 mode */
3465     {
3466     for (i = min; i < max; i++)
3467     {
3468 ph10 463 if (eptr >= md->end_subject)
3469 ph10 462 {
3470     SCHECK_PARTIAL();
3471     break;
3472 ph10 463 }
3473 ph10 462 if (fc == md->lcc[*eptr]) break;
3474 nigel 77 eptr++;
3475     }
3476 nigel 93 if (possessive) continue;
3477 nigel 77 while (eptr >= pp)
3478     {
3479 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3480 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3481     eptr--;
3482     }
3483     }
3484    
3485 ph10 510 MRRETURN(MATCH_NOMATCH);
3486 nigel 77 }
3487     /* Control never gets here */
3488     }
3489    
3490     /* Caseful comparisons */
3491    
3492     else
3493     {
3494     #ifdef SUPPORT_UTF8
3495     /* UTF-8 mode */
3496     if (utf8)
3497     {
3498 nigel 93 register unsigned int d;
3499 nigel 77 for (i = 1; i <= min; i++)
3500     {
3501 ph10 426 if (eptr >= md->end_subject)
3502     {
3503     SCHECK_PARTIAL();
3504 ph10 510 MRRETURN(MATCH_NOMATCH);
3505 ph10 427 }
3506 nigel 77 GETCHARINC(d, eptr);
3507 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3508 nigel 77 }
3509     }
3510     else
3511     #endif
3512     /* Not UTF-8 mode */
3513     {
3514     for (i = 1; i <= min; i++)
3515 ph10 426 {
3516     if (eptr >= md->end_subject)
3517     {
3518     SCHECK_PARTIAL();
3519 ph10 510 MRRETURN(MATCH_NOMATCH);
3520 ph10 427 }
3521 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3522 ph10 427 }
3523 nigel 77 }
3524    
3525     if (min == max) continue;
3526    
3527     if (minimize)
3528     {
3529     #ifdef SUPPORT_UTF8
3530     /* UTF-8 mode */
3531     if (utf8)
3532     {
3533 nigel 93 register unsigned int d;
3534 nigel 77 for (fi = min;; fi++)
3535     {
3536 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3537 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3538 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3539 ph10 427 if (eptr >= md->end_subject)
3540 ph10 426 {
3541 ph10 427 SCHECK_PARTIAL();
3542 ph10 510 MRRETURN(MATCH_NOMATCH);
3543 ph10 427 }
3544 nigel 77 GETCHARINC(d, eptr);
3545 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3546 nigel 77 }
3547     }
3548     else
3549     #endif
3550     /* Not UTF-8 mode */
3551     {
3552     for (fi = min;; fi++)
3553     {
3554 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3555 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3557 ph10 426 if (eptr >= md->end_subject)
3558     {
3559     SCHECK_PARTIAL();
3560 ph10 510 MRRETURN(MATCH_NOMATCH);
3561 ph10 427 }
3562 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3563 nigel 77 }
3564     }
3565     /* Control never gets here */
3566     }
3567    
3568     /* Maximize case */
3569    
3570     else
3571     {
3572     pp = eptr;
3573    
3574     #ifdef SUPPORT_UTF8
3575     /* UTF-8 mode */
3576     if (utf8)
3577     {
3578 nigel 93 register unsigned int d;
3579 nigel 77 for (i = min; i < max; i++)
3580     {
3581     int len = 1;
3582 ph10 463 if (eptr >= md->end_subject)
3583 ph10 462 {
3584 ph10 463 SCHECK_PARTIAL();
3585 ph10 462 break;
3586 ph10 463 }
3587 nigel 77 GETCHARLEN(d, eptr, len);
3588     if (fc == d) break;
3589     eptr += len;
3590     }
3591 nigel 93 if (possessive) continue;
3592 nigel 77 for(;;)
3593     {
3594 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3595 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3596     if (eptr-- == pp) break; /* Stop if tried at original pos */
3597     BACKCHAR(eptr);
3598     }
3599     }
3600     else
3601     #endif
3602     /* Not UTF-8 mode */
3603     {
3604     for (i = min; i < max; i++)
3605     {
3606 ph10 463 if (eptr >= md->end_subject)
3607 ph10 462 {
3608 ph10 463 SCHECK_PARTIAL();
3609 ph10 462 break;
3610 ph10 463 }
3611 ph10 462 if (fc == *eptr) break;
3612 nigel 77 eptr++;
3613     }
3614 nigel 93 if (possessive) continue;
3615 nigel 77 while (eptr >= pp)
3616     {
3617 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3618 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3619     eptr--;
3620     }
3621     }
3622    
3623 ph10 510 MRRETURN(MATCH_NOMATCH);
3624 nigel 77 }
3625     }
3626     /* Control never gets here */
3627    
3628     /* Match a single character type repeatedly; several different opcodes
3629     share code. This is very similar to the code for single characters, but we
3630     repeat it in the interests of efficiency. */
3631    
3632     case OP_TYPEEXACT:
3633     min = max = GET2(ecode, 1);
3634     minimize = TRUE;
3635     ecode += 3;
3636     goto REPEATTYPE;
3637    
3638     case OP_TYPEUPTO:
3639     case OP_TYPEMINUPTO:
3640     min = 0;
3641     max = GET2(ecode, 1);
3642     minimize = *ecode == OP_TYPEMINUPTO;
3643     ecode += 3;
3644     goto REPEATTYPE;
3645    
3646 nigel 93 case OP_TYPEPOSSTAR:
3647     possessive = TRUE;
3648     min = 0;
3649     max = INT_MAX;
3650     ecode++;
3651     goto REPEATTYPE;
3652    
3653     case OP_TYPEPOSPLUS:
3654     possessive = TRUE;
3655     min = 1;
3656     max = INT_MAX;
3657     ecode++;
3658     goto REPEATTYPE;
3659    
3660     case OP_TYPEPOSQUERY:
3661     possessive = TRUE;
3662     min = 0;
3663     max = 1;
3664     ecode++;
3665     goto REPEATTYPE;
3666    
3667     case OP_TYPEPOSUPTO:
3668     possessive = TRUE;
3669     min = 0;
3670     max = GET2(ecode, 1);
3671     ecode += 3;
3672     goto REPEATTYPE;
3673    
3674 nigel 77 case OP_TYPESTAR:
3675     case OP_TYPEMINSTAR:
3676     case OP_TYPEPLUS:
3677     case OP_TYPEMINPLUS:
3678     case OP_TYPEQUERY:
3679     case OP_TYPEMINQUERY:
3680     c = *ecode++ - OP_TYPESTAR;
3681     minimize = (c & 1) != 0;
3682     min = rep_min[c]; /* Pick up values from tables; */
3683     max = rep_max[c]; /* zero for max => infinity */
3684     if (max == 0) max = INT_MAX;
3685    
3686     /* Common code for all repeated single character type matches. Note that
3687     in UTF-8 mode, '.' matches a character of any length, but for the other
3688     character types, the valid characters are all one-byte long. */
3689    
3690     REPEATTYPE:
3691     ctype = *ecode++; /* Code for the character type */
3692    
3693     #ifdef SUPPORT_UCP
3694     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3695     {
3696     prop_fail_result = ctype == OP_NOTPROP;
3697     prop_type = *ecode++;
3698 nigel 87 prop_value = *ecode++;
3699 nigel 77 }
3700     else prop_type = -1;
3701     #endif
3702    
3703     /* First, ensure the minimum number of matches are present. Use inline
3704     code for maximizing the speed, and do the type test once at the start
3705 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3706 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3707     and single-bytes. */
3708    
3709     if (min > 0)
3710     {
3711     #ifdef SUPPORT_UCP
3712 nigel 87 if (prop_type >= 0)
3713 nigel 77 {
3714 nigel 87 switch(prop_type)
3715 nigel 77 {
3716 nigel 87 case PT_ANY:
3717 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3718 nigel 87 for (i = 1; i <= min; i++)
3719     {
3720 ph10 427 if (eptr >= md->end_subject)
3721 ph10 426 {
3722 ph10 427 SCHECK_PARTIAL();
3723 ph10 510 MRRETURN(MATCH_NOMATCH);
3724 ph10 427 }
3725 ph10 184 GETCHARINCTEST(c, eptr);
3726 nigel 87 }
3727     break;
3728    
3729     case PT_LAMP:
3730     for (i = 1; i <= min; i++)
3731     {
3732 ph10 625 int chartype;
3733 ph10 427 if (eptr >= md->end_subject)
3734 ph10 426 {
3735 ph10 427 SCHECK_PARTIAL();
3736 ph10 510 MRRETURN(MATCH_NOMATCH);
3737 ph10 427 }
3738 ph10 184 GETCHARINCTEST(c, eptr);
3739 ph10 623 chartype = UCD_CHARTYPE(c);
3740     if ((chartype == ucp_Lu ||
3741     chartype == ucp_Ll ||
3742     chartype == ucp_Lt) == prop_fail_result)
3743 ph10 510 MRRETURN(MATCH_NOMATCH);
3744 nigel 87 }
3745     break;
3746    
3747     case PT_GC:
3748     for (i = 1; i <= min; i++)
3749     {
3750 ph10 427 if (eptr >= md->end_subject)
3751 ph10 426 {
3752 ph10 427 SCHECK_PARTIAL();
3753 ph10 510 MRRETURN(MATCH_NOMATCH);
3754 ph10 427 }
3755 ph10 184 GETCHARINCTEST(c, eptr);
3756 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3757 ph10 510 MRRETURN(MATCH_NOMATCH);
3758 nigel 87 }
3759     break;
3760    
3761     case PT_PC:
3762     for (i = 1; i <= min; i++)
3763     {
3764 ph10 427 if (eptr >= md->end_subject)
3765 ph10 426 {
3766 ph10 427 SCHECK_PARTIAL();
3767 ph10 510 MRRETURN(MATCH_NOMATCH);
3768 ph10 427 }
3769 ph10 184 GETCHARINCTEST(c, eptr);
3770 ph10 623 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3771 ph10 510 MRRETURN(MATCH_NOMATCH);
3772 nigel 87 }
3773     break;
3774    
3775     case PT_SC:
3776     for (i = 1; i <= min; i++)
3777     {
3778 ph10 427 if (eptr >= md->end_subject)
3779 ph10 426 {
3780 ph10 427 SCHECK_PARTIAL();
3781 ph10 510 MRRETURN(MATCH_NOMATCH);
3782 ph10 427 }
3783 ph10 184 GETCHARINCTEST(c, eptr);
3784 ph10 623 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)