/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 630 - (hide annotations) (download)
Fri Jul 22 10:00:10 2011 UTC (2 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 193789 byte(s)
Make (*MARK) work in positive assertions.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 598 Copyright (c) 1997-2011 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 ph10 625 /* Values for setting in md->match_function_type to indicate two special types
61     of call to match(). We do it this way to save on using another stack variable,
62 ph10 604 as stack usage is to be discouraged. */
63 nigel 77
64 ph10 604 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65     #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
74 ph10 210 negative to avoid the external error codes. */
75    
76 ph10 511 #define MATCH_ACCEPT (-999)
77     #define MATCH_COMMIT (-998)
78 ph10 604 #define MATCH_KETRPOS (-997)
79 ph10 618 #define MATCH_ONCE (-996)
80     #define MATCH_PRUNE (-995)
81     #define MATCH_SKIP (-994)
82     #define MATCH_SKIP_ARG (-993)
83     #define MATCH_THEN (-992)
84 ph10 210
85 ph10 510 /* This is a convenience macro for code that occurs many times. */
86    
87     #define MRRETURN(ra) \
88     { \
89     md->mark = markptr; \
90     RRETURN(ra); \
91     }
92    
93 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
94     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95     because the offset vector is always a multiple of 3 long. */
96    
97     #define REC_STACK_SAVE_MAX 30
98    
99     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100    
101     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103    
104    
105    
106 ph10 475 #ifdef PCRE_DEBUG
107 nigel 77 /*************************************************
108     * Debugging function to print chars *
109     *************************************************/
110    
111     /* Print a sequence of chars in printable format, stopping at the end of the
112     subject if requested.
113    
114     Arguments:
115     p points to characters
116     length number to print
117     is_subject TRUE if printing from within md->start_subject
118     md pointer to matching data block, if is_subject is TRUE
119    
120     Returns: nothing
121     */
122    
123     static void
124     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125     {
126 nigel 93 unsigned int c;
127 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128     while (length-- > 0)
129     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130     }
131     #endif
132    
133    
134    
135     /*************************************************
136     * Match a back-reference *
137     *************************************************/
138    
139 ph10 595 /* Normally, if a back reference hasn't been set, the length that is passed is
140     negative, so the match always fails. However, in JavaScript compatibility mode,
141 ph10 625 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 ph10 595 subject bytes matched may be different to the number of reference bytes.
143 nigel 77
144     Arguments:
145     offset index into the offset vector
146 ph10 595 eptr pointer into the subject
147     length length of reference to be matched (number of bytes)
148 nigel 77 md points to match data block
149 ph10 602 caseless TRUE if caseless
150 nigel 77
151 ph10 595 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 nigel 77 */
153    
154 ph10 595 static int
155 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 ph10 602 BOOL caseless)
157 nigel 77 {
158 ph10 595 USPTR eptr_start = eptr;
159     register USPTR p = md->start_subject + md->offset_vector[offset];
160 nigel 77
161 ph10 475 #ifdef PCRE_DEBUG
162 nigel 77 if (eptr >= md->end_subject)
163     printf("matching subject <null>");
164     else
165     {
166     printf("matching subject ");
167     pchars(eptr, length, TRUE, md);
168     }
169     printf(" against backref ");
170     pchars(p, length, FALSE, md);
171     printf("\n");
172     #endif
173    
174 ph10 595 /* Always fail if reference not set (and not JavaScript compatible). */
175 nigel 77
176 ph10 595 if (length < 0) return -1;
177 nigel 77
178 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179     properly if Unicode properties are supported. Otherwise, we can check only
180     ASCII characters. */
181 nigel 77
182 ph10 602 if (caseless)
183 nigel 77 {
184 ph10 354 #ifdef SUPPORT_UTF8
185     #ifdef SUPPORT_UCP
186     if (md->utf8)
187     {
188 ph10 625 /* Match characters up to the end of the reference. NOTE: the number of
189 ph10 595 bytes matched may differ, because there are some characters whose upper and
190     lower case versions code as different numbers of bytes. For example, U+023A
191     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 ph10 625 the latter. It is important, therefore, to check the length along the
194 ph10 595 reference, not along the subject (earlier code did this wrong). */
195 ph10 625
196 ph10 595 USPTR endptr = p + length;
197     while (p < endptr)
198 ph10 354 {
199 ph10 358 int c, d;
200 ph10 597 if (eptr >= md->end_subject) return -1;
201 ph10 354 GETCHARINC(c, eptr);
202     GETCHARINC(d, p);
203 ph10 595 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 ph10 358 }
205     }
206 ph10 354 else
207     #endif
208     #endif
209    
210     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211     is no UCP support. */
212 ph10 597 {
213 ph10 625 if (eptr + length > md->end_subject) return -1;
214 ph10 597 while (length-- > 0)
215     { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 ph10 625 }
217 nigel 77 }
218 ph10 358
219 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
220     are in UTF-8 mode. */
221 ph10 358
222 nigel 77 else
223 ph10 625 {
224     if (eptr + length > md->end_subject) return -1;
225     while (length-- > 0) if (*p++ != *eptr++) return -1;
226 ph10 597 }
227 nigel 77
228 ph10 595 return eptr - eptr_start;
229 nigel 77 }
230    
231    
232    
233     /***************************************************************************
234     ****************************************************************************
235     RECURSION IN THE match() FUNCTION
236    
237 nigel 87 The match() function is highly recursive, though not every recursive call
238     increases the recursive depth. Nevertheless, some regular expressions can cause
239     it to recurse to a great depth. I was writing for Unix, so I just let it call
240     itself recursively. This uses the stack for saving everything that has to be
241     saved for a recursive call. On Unix, the stack can be large, and this works
242     fine.
243 nigel 77
244 nigel 87 It turns out that on some non-Unix-like systems there are problems with
245     programs that use a lot of stack. (This despite the fact that every last chip
246     has oodles of memory these days, and techniques for extending the stack have
247     been known for decades.) So....
248 nigel 77
249     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250     calls by keeping local variables that need to be preserved in blocks of memory
251 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
252 nigel 77 achieve this so that the actual code doesn't look very different to what it
253     always used to.
254 ph10 164
255 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
256 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
257     Switzer, the use of longjmp() has been abolished, at the cost of having to
258     provide a unique number for each call to RMATCH. There is no way of generating
259     a sequence of numbers at compile time in C. I have given them names, to make
260     them stand out more clearly.
261    
262     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
265     don't have indeterminate values; this has meant that the frame size can be
266 ph10 164 reduced because the result can be "passed back" by straight setting of the
267     variable instead of being passed in the frame.
268 nigel 77 ****************************************************************************
269     ***************************************************************************/
270    
271 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272     below must be updated in sync. */
273 nigel 77
274 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 ph10 527 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 ph10 623 RM61, RM62, RM63 };
281 ph10 164
282 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
283 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 ph10 501 actually used in this definition. */
285 nigel 77
286     #ifndef NO_RECURSE
287     #define REGISTER register
288 ph10 164
289 ph10 475 #ifdef PCRE_DEBUG
290 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 nigel 87 { \
292     printf("match() called in line %d\n", __LINE__); \
293 ph10 604 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 nigel 87 printf("to line %d\n", __LINE__); \
295     }
296     #define RRETURN(ra) \
297     { \
298     printf("match() returned %d from line %d ", ra, __LINE__); \
299     return ra; \
300     }
301     #else
302 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw) \
303     rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 nigel 77 #define RRETURN(ra) return ra
305 nigel 87 #endif
306    
307 nigel 77 #else
308    
309    
310 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
311     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312     argument of match(), which never changes. */
313 nigel 77
314     #define REGISTER
315    
316 ph10 604 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 nigel 77 {\
318 ph10 563 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 ph10 534 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 ph10 164 frame->Xwhere = rw; \
321     newframe->Xeptr = ra;\
322     newframe->Xecode = rb;\
323 ph10 168 newframe->Xmstart = mstart;\
324 ph10 501 newframe->Xmarkptr = markptr;\
325 ph10 164 newframe->Xoffset_top = rc;\
326 ph10 602 newframe->Xeptrb = re;\
327 ph10 164 newframe->Xrdepth = frame->Xrdepth + 1;\
328     newframe->Xprevframe = frame;\
329     frame = newframe;\
330     DPRINTF(("restarting from line %d\n", __LINE__));\
331     goto HEAP_RECURSE;\
332     L_##rw:\
333     DPRINTF(("jumped back to line %d\n", __LINE__));\
334 nigel 77 }
335    
/* Heap-frame version of RRETURN: release the current frame and either resume
the calling frame (by setting rrc and jumping to HEAP_RETURN) or, when there is
no calling frame, return from match() itself.

The macro must tolerate frame == NULL, because match() invokes it when the
allocation of the very first frame fails; in that case there is nothing to pop
or free and the value is simply returned. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348    
349    
350     /* Structure for remembering the local variables in a private frame */
351    
352     typedef struct heapframe {
353     struct heapframe *Xprevframe;
354    
355     /* Function arguments that may change */
356    
357 ph10 409 USPTR Xeptr;
358 nigel 77 const uschar *Xecode;
359 ph10 409 USPTR Xmstart;
360 ph10 501 USPTR Xmarkptr;
361 nigel 77 int Xoffset_top;
362     eptrblock *Xeptrb;
363 nigel 91 unsigned int Xrdepth;
364 nigel 77
365     /* Function local variables */
366    
367 ph10 409 USPTR Xcallpat;
368 ph10 406 #ifdef SUPPORT_UTF8
369 ph10 409 USPTR Xcharptr;
370 ph10 406 #endif
371 ph10 409 USPTR Xdata;
372     USPTR Xnext;
373     USPTR Xpp;
374     USPTR Xprev;
375     USPTR Xsaved_eptr;
376 nigel 77
377     recursion_info Xnew_recursive;
378    
379     BOOL Xcur_is_word;
380     BOOL Xcondition;
381     BOOL Xprev_is_word;
382    
383     #ifdef SUPPORT_UCP
384     int Xprop_type;
385 nigel 87 int Xprop_value;
386 nigel 77 int Xprop_fail_result;
387 ph10 123 int Xoclength;
388     uschar Xocchars[8];
389 nigel 77 #endif
390    
391 ph10 403 int Xcodelink;
392 nigel 77 int Xctype;
393 nigel 93 unsigned int Xfc;
394 nigel 77 int Xfi;
395     int Xlength;
396     int Xmax;
397     int Xmin;
398     int Xnumber;
399     int Xoffset;
400     int Xop;
401     int Xsave_capture_last;
402     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403     int Xstacksave[REC_STACK_SAVE_MAX];
404    
405     eptrblock Xnewptrb;
406    
407 ph10 164 /* Where to jump back to */
408 nigel 77
409 ph10 164 int Xwhere;
410 ph10 165
411 nigel 77 } heapframe;
412    
413     #endif
414    
415    
416     /***************************************************************************
417     ***************************************************************************/
418    
419    
420    
421     /*************************************************
422     * Match from current position *
423     *************************************************/
424    
425 nigel 93 /* This function is called recursively in many circumstances. Whenever it
426 nigel 77 returns a negative (error) response, the outer incarnation must also return the
427 ph10 426 same response. */
428 nigel 77
429 ph10 426 /* These macros pack up tests that are used for partial matching, and which
430     appear several times in the code. We set the "hit end" flag if the pointer is
431     at the end of the subject and also past the start of the subject (i.e.
432 ph10 427 something has been matched). For hard partial matching, we then return
433     immediately. The second one is used when we already know we are past the end of
434     the subject. */
435 ph10 426
436     #define CHECK_PARTIAL()\
437 ph10 553 if (md->partial != 0 && eptr >= md->end_subject && \
438     eptr > md->start_used_ptr) \
439     { \
440     md->hitend = TRUE; \
441     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 ph10 427 }
443 ph10 426
444     #define SCHECK_PARTIAL()\
445 ph10 553 if (md->partial != 0 && eptr > md->start_used_ptr) \
446     { \
447     md->hitend = TRUE; \
448     if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 ph10 427 }
450 ph10 426
451 ph10 427
452 ph10 426 /* Performance note: It might be tempting to extract commonly used fields from
453     the md structure (e.g. utf8, end_subject) into individual variables to improve
454 nigel 77 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455     made performance worse.
456    
457     Arguments:
458 nigel 93 eptr pointer to current character in subject
459     ecode pointer to current position in compiled code
460 ph10 168 mstart pointer to the current match start position (can be modified
461 ph10 172 by encountering \K)
462 ph10 501 markptr pointer to the most recent MARK name, or NULL
463 nigel 77 offset_top current top pointer
464     md pointer to "static" info for the match
465     eptrb pointer to chain of blocks containing eptr at start of
466     brackets - for testing for empty matches
467 nigel 87 rdepth the recursion depth
468 nigel 77
469     Returns: MATCH_MATCH if matched ) these values are >= 0
470     MATCH_NOMATCH if failed to match )
471 ph10 510 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 nigel 77 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 nigel 87 (e.g. stopped by repeated call or recursion limit)
474 nigel 77 */
475    
476     static int
477 ph10 510 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 ph10 625 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 ph10 604 unsigned int rdepth)
480 nigel 77 {
481     /* These variables do not need to be preserved over recursion in this function,
482 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
483     "register" because they are used a lot in loops. */
484 nigel 77
485 nigel 91 register int rrc; /* Returns from recursive calls */
486     register int i; /* Used for loops not involving calls to RMATCH() */
487 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489 nigel 77
490 nigel 93 BOOL minimize, possessive; /* Quantifier options */
491 ph10 602 BOOL caseless;
492 ph10 403 int condcode;
493 nigel 93
494 nigel 77 /* When recursion is not being used, all "local" variables that have to be
495     preserved over calls to RMATCH() are part of a "frame" which is obtained from
496     heap storage. Set up the top-level frame here; others are obtained from the
497     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498    
499     #ifdef NO_RECURSE
500 ph10 563 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 ph10 531 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 nigel 77 frame->Xprevframe = NULL; /* Marks the top level */
503    
504     /* Copy in the original argument variables */
505    
506     frame->Xeptr = eptr;
507     frame->Xecode = ecode;
508 ph10 168 frame->Xmstart = mstart;
509 ph10 501 frame->Xmarkptr = markptr;
510 nigel 77 frame->Xoffset_top = offset_top;
511     frame->Xeptrb = eptrb;
512 nigel 87 frame->Xrdepth = rdepth;
513 nigel 77
514     /* This is where control jumps back to to effect "recursion" */
515    
516     HEAP_RECURSE:
517    
518     /* Macros make the argument variables come from the current frame */
519    
520     #define eptr frame->Xeptr
521     #define ecode frame->Xecode
522 ph10 168 #define mstart frame->Xmstart
523 ph10 501 #define markptr frame->Xmarkptr
524 nigel 77 #define offset_top frame->Xoffset_top
525     #define eptrb frame->Xeptrb
526 nigel 87 #define rdepth frame->Xrdepth
527 nigel 77
528     /* Ditto for the local variables */
529    
530     #ifdef SUPPORT_UTF8
531     #define charptr frame->Xcharptr
532     #endif
533     #define callpat frame->Xcallpat
534 ph10 403 #define codelink frame->Xcodelink
535 nigel 77 #define data frame->Xdata
536     #define next frame->Xnext
537     #define pp frame->Xpp
538     #define prev frame->Xprev
539     #define saved_eptr frame->Xsaved_eptr
540    
541     #define new_recursive frame->Xnew_recursive
542    
543     #define cur_is_word frame->Xcur_is_word
544     #define condition frame->Xcondition
545     #define prev_is_word frame->Xprev_is_word
546    
547     #ifdef SUPPORT_UCP
548     #define prop_type frame->Xprop_type
549 nigel 87 #define prop_value frame->Xprop_value
550 nigel 77 #define prop_fail_result frame->Xprop_fail_result
551 ph10 115 #define oclength frame->Xoclength
552     #define occhars frame->Xocchars
553 nigel 77 #endif
554    
555     #define ctype frame->Xctype
556     #define fc frame->Xfc
557     #define fi frame->Xfi
558     #define length frame->Xlength
559     #define max frame->Xmax
560     #define min frame->Xmin
561     #define number frame->Xnumber
562     #define offset frame->Xoffset
563     #define op frame->Xop
564     #define save_capture_last frame->Xsave_capture_last
565     #define save_offset1 frame->Xsave_offset1
566     #define save_offset2 frame->Xsave_offset2
567     #define save_offset3 frame->Xsave_offset3
568     #define stacksave frame->Xstacksave
569    
570     #define newptrb frame->Xnewptrb
571    
572     /* When recursion is being used, local variables are allocated on the stack and
573     get preserved during recursion in the normal way. In this environment, fi and
574     i, and fc and c, can be the same variables. */
575    
576 nigel 93 #else /* NO_RECURSE not defined */
577 nigel 77 #define fi i
578     #define fc c
579    
580 ph10 604 /* Many of the following variables are used only in small blocks of the code.
581     My normal style of coding would have declared them within each of those blocks.
582     However, in order to accommodate the version of this code that uses an external
583     "stack" implemented on the heap, it is easier to declare them all here, so the
584     declarations can be cut out in a block. The only declarations within blocks
585     below are for variables that do not have to be preserved over a recursive call
586     to RMATCH(). */
587 nigel 77
588 ph10 625 #ifdef SUPPORT_UTF8
589     const uschar *charptr;
590     #endif
591     const uschar *callpat;
592     const uschar *data;
593     const uschar *next;
594     USPTR pp;
595     const uschar *prev;
596     USPTR saved_eptr;
597    
598     recursion_info new_recursive;
599    
600     BOOL cur_is_word;
601 nigel 87 BOOL condition;
602 nigel 77 BOOL prev_is_word;
603    
604     #ifdef SUPPORT_UCP
605     int prop_type;
606 nigel 87 int prop_value;
607 nigel 77 int prop_fail_result;
608 ph10 115 int oclength;
609     uschar occhars[8];
610 nigel 77 #endif
611    
612 ph10 399 int codelink;
613 nigel 77 int ctype;
614     int length;
615     int max;
616     int min;
617     int number;
618     int offset;
619     int op;
620     int save_capture_last;
621     int save_offset1, save_offset2, save_offset3;
622     int stacksave[REC_STACK_SAVE_MAX];
623    
624     eptrblock newptrb;
625 nigel 93 #endif /* NO_RECURSE */
626 nigel 77
627 ph10 625 /* To save space on the stack and in the heap frame, I have doubled up on some
628     of the local variables that are used only in localised parts of the code, but
629     still need to be preserved over recursive calls of match(). These macros define
630 ph10 604 the alternative names that are used. */
631    
632     #define allow_zero cur_is_word
633     #define cbegroup condition
634     #define code_offset codelink
635     #define condassert condition
636     #define matched_once prev_is_word
637    
638 nigel 77 /* These statements are here to stop the compiler complaining about uninitialized
639     variables. */
640    
641     #ifdef SUPPORT_UCP
642 nigel 87 prop_value = 0;
643 nigel 77 prop_fail_result = 0;
644     #endif
645    
646 nigel 93
647 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
648     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649     used. Thanks to Ian Taylor for noticing this possibility and sending the
650     original patch. */
651    
652     TAIL_RECURSE:
653    
654 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
655     are specified by the macro RMATCH and RRETURN is used to return. When
656     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 ph10 475 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 nigel 87 defined). However, RMATCH isn't like a function call because it's quite a
659     complicated macro. It has to be used in one particular way. This shouldn't,
660     however, impact performance when true recursion is being used. */
661 nigel 77
662 ph10 164 #ifdef SUPPORT_UTF8
663     utf8 = md->utf8; /* Local copy of the flag */
664     #else
665     utf8 = FALSE;
666     #endif
667    
668 nigel 87 /* First check that we haven't called match() too many times, or that we
669     haven't exceeded the recursive call limit. */
670    
671 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673 nigel 77
674 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
675 ph10 625 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676     done this way to save having to use another function argument, which would take
677 ph10 604 up space on the stack. See also MATCH_CONDASSERT below.
678 nigel 77
679 ph10 604 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680     such remembered pointers, to be checked when we hit the closing ket, in order
681     to break infinite loops that match no characters. When match() is called in
682     other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683     NOT be used with tail recursion, because the memory block that is used is on
684     the stack, so a new one may be required for each match(). */
685    
686     if (md->match_function_type == MATCH_CBEGROUP)
687 nigel 77 {
688 ph10 197 newptrb.epb_saved_eptr = eptr;
689     newptrb.epb_prev = eptrb;
690     eptrb = &newptrb;
691 ph10 604 md->match_function_type = 0;
692 nigel 77 }
693    
694 nigel 93 /* Now start processing the opcodes. */
695 nigel 77
696     for (;;)
697     {
698 nigel 93 minimize = possessive = FALSE;
699 nigel 77 op = *ecode;
700 ph10 625
701 nigel 93 switch(op)
702     {
703 ph10 510 case OP_MARK:
704     markptr = ecode + 2;
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 ph10 604 eptrb, RM55);
707 ph10 512
708     /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709     argument, and we must check whether that argument matches this MARK's
710     argument. It is passed back in md->start_match_ptr (an overloading of that
711     variable). If it does match, we reset that variable to the current subject
712     position and return MATCH_SKIP. Otherwise, pass back the return code
713 ph10 510 unaltered. */
714 ph10 512
715     if (rrc == MATCH_SKIP_ARG &&
716 ph10 510 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717     {
718     md->start_match_ptr = eptr;
719     RRETURN(MATCH_SKIP);
720     }
721    
722 ph10 512 if (md->mark == NULL) md->mark = markptr;
723 ph10 510 RRETURN(rrc);
724    
725 ph10 210 case OP_FAIL:
726 ph10 510 MRRETURN(MATCH_NOMATCH);
727 ph10 211
728 ph10 551 /* COMMIT overrides PRUNE, SKIP, and THEN */
729 ph10 553
730 ph10 510 case OP_COMMIT:
731     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 ph10 604 eptrb, RM52);
733 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 ph10 553 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735     rrc != MATCH_THEN)
736 ph10 551 RRETURN(rrc);
737 ph10 510 MRRETURN(MATCH_COMMIT);
738    
739 ph10 551 /* PRUNE overrides THEN */
740 ph10 553
741 ph10 210 case OP_PRUNE:
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 ph10 604 eptrb, RM51);
744 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 ph10 510 MRRETURN(MATCH_PRUNE);
746 ph10 211
747 ph10 510 case OP_PRUNE_ARG:
748     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 ph10 604 eptrb, RM56);
750 ph10 551 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 ph10 510 md->mark = ecode + 2;
752     RRETURN(MATCH_PRUNE);
753 ph10 211
754 ph10 551 /* SKIP overrides PRUNE and THEN */
755 ph10 553
756 ph10 210 case OP_SKIP:
757     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 ph10 604 eptrb, RM53);
759 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 ph10 551 RRETURN(rrc);
761 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
762 ph10 510 MRRETURN(MATCH_SKIP);
763 ph10 211
764 ph10 510 case OP_SKIP_ARG:
765     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 ph10 604 eptrb, RM57);
767 ph10 553 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 ph10 551 RRETURN(rrc);
769 ph10 512
770     /* Pass back the current skip name by overloading md->start_match_ptr and
771     returning the special MATCH_SKIP_ARG return code. This will either be
772     caught by a matching MARK, or get to the top, where it is treated the same
773 ph10 510 as PRUNE. */
774 ph10 512
775 ph10 510 md->start_match_ptr = ecode + 2;
776 ph10 512 RRETURN(MATCH_SKIP_ARG);
777 ph10 553
778 ph10 550 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 ph10 553 the alt that is at the start of the current branch. This makes it possible
780     to skip back past alternatives that precede the THEN within the current
781     branch. */
782 ph10 512
783 ph10 210 case OP_THEN:
784     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 ph10 604 eptrb, RM54);
786 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
788 ph10 510 MRRETURN(MATCH_THEN);
789    
790     case OP_THEN_ARG:
791 ph10 553 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 ph10 604 offset_top, md, eptrb, RM58);
793 ph10 510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 ph10 550 md->start_match_ptr = ecode - GET(ecode, 1);
795     md->mark = ecode + LINK_SIZE + 2;
796 ph10 212 RRETURN(MATCH_THEN);
797 ph10 211
798 ph10 604 /* Handle a capturing bracket, other than those that are possessive with an
799     unlimited repeat. If there is space in the offset vector, save the current
800     subject position in the working slot at the top of the vector. We mustn't
801     change the current values of the data slot, because they may be set from a
802     previous iteration of this group, and be referred to by a reference inside
803 ph10 625 the group. A failure to match might occur after the group has succeeded,
804 ph10 617 if something later on doesn't match. For this reason, we need to restore
805     the working value and also the values of the final offsets, in case they
806     were set by a previous iteration of the same bracket.
807 nigel 77
808 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
809     a non-capturing bracket. Don't worry about setting the flag for the error
810     case here; that is handled in the code for KET. */
811 nigel 77
812 nigel 93 case OP_CBRA:
813     case OP_SCBRA:
814     number = GET2(ecode, 1+LINK_SIZE);
815 nigel 77 offset = number << 1;
816 ph10 625
817 ph10 475 #ifdef PCRE_DEBUG
818 nigel 93 printf("start bracket %d\n", number);
819     printf("subject=");
820 nigel 77 pchars(eptr, 16, TRUE, md);
821     printf("\n");
822     #endif
823    
824     if (offset < md->offset_max)
825     {
826     save_offset1 = md->offset_vector[offset];
827     save_offset2 = md->offset_vector[offset+1];
828     save_offset3 = md->offset_vector[md->offset_end - number];
829     save_capture_last = md->capture_last;
830    
831     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 ph10 531 md->offset_vector[md->offset_end - number] =
833 ph10 530 (int)(eptr - md->start_subject);
834 nigel 77
835 ph10 604 for (;;)
836 nigel 77 {
837 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 ph10 604 eptrb, RM1);
840 ph10 618 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 ph10 550 if (rrc != MATCH_NOMATCH &&
842     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843     RRETURN(rrc);
844 nigel 77 md->capture_last = save_capture_last;
845     ecode += GET(ecode, 1);
846 ph10 625 if (*ecode != OP_ALT) break;
847 nigel 77 }
848    
849     DPRINTF(("bracket %d failed\n", number));
850     md->offset_vector[offset] = save_offset1;
851     md->offset_vector[offset+1] = save_offset2;
852     md->offset_vector[md->offset_end - number] = save_offset3;
853 ph10 625
854     /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 ph10 618 MATCH_THEN. */
856 nigel 77
857 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 ph10 618 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 nigel 77 }
860    
861 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862     as a non-capturing bracket. */
863 nigel 77
864 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866    
867 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868 nigel 77
869 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871    
872 ph10 618 /* Non-capturing or atomic group, except for possessive with unlimited
873     repeat. Loop for all the alternatives. When we get to the final alternative
874     within the brackets, we used to return the result of a recursive call to
875     match() whatever happened so it was possible to reduce stack usage by
876     turning this into a tail recursion, except in the case of a possibly empty
877     group. However, now that there is the possibility of (*THEN) occurring in
878 ph10 625 the final alternative, this optimization is no longer possible.
879    
880     MATCH_ONCE is returned when the end of an atomic group is successfully
881     reached, but subsequent matching fails. It passes back up the tree (causing
882     captured values to be reset) until the original atomic group level is
883 ph10 618 reached. This is tested by comparing md->once_target with the start of the
884     group. At this point, the return is converted into MATCH_NOMATCH so that
885     previous backup points can be taken. */
886 nigel 77
887 ph10 618 case OP_ONCE:
888 nigel 93 case OP_BRA:
889     case OP_SBRA:
890     DPRINTF(("start non-capturing bracket\n"));
891 ph10 618
892 nigel 91 for (;;)
893 nigel 77 {
894 ph10 618 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 ph10 625 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 ph10 604 RM2);
897 ph10 550 if (rrc != MATCH_NOMATCH &&
898     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 ph10 625 {
900 ph10 618 if (rrc == MATCH_ONCE)
901     {
902     const uschar *scode = ecode;
903     if (*scode != OP_ONCE) /* If not at start, find it */
904     {
905     while (*scode == OP_ALT) scode += GET(scode, 1);
906     scode -= GET(scode, 1);
907 ph10 625 }
908 ph10 618 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 ph10 625 }
910 ph10 550 RRETURN(rrc);
911 ph10 625 }
912 nigel 77 ecode += GET(ecode, 1);
913 ph10 625 if (*ecode != OP_ALT) break;
914 nigel 77 }
915 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916     RRETURN(MATCH_NOMATCH);
917    
918 ph10 625 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 ph10 604 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
920     handled similarly to the normal case above. However, the matching is
921     different. The end of these brackets will always be OP_KETRPOS, which
922     returns MATCH_KETRPOS without going further in the pattern. By this means
923     we can handle the group by iteration rather than recursion, thereby
924     reducing the amount of stack needed. */
925 ph10 625
926 ph10 604 case OP_CBRAPOS:
927     case OP_SCBRAPOS:
928     allow_zero = FALSE;
929 ph10 625
930 ph10 604 POSSESSIVE_CAPTURE:
931     number = GET2(ecode, 1+LINK_SIZE);
932     offset = number << 1;
933    
934     #ifdef PCRE_DEBUG
935     printf("start possessive bracket %d\n", number);
936     printf("subject=");
937     pchars(eptr, 16, TRUE, md);
938     printf("\n");
939     #endif
940    
941     if (offset < md->offset_max)
942     {
943     matched_once = FALSE;
944 ph10 625 code_offset = ecode - md->start_code;
945 ph10 604
946     save_offset1 = md->offset_vector[offset];
947     save_offset2 = md->offset_vector[offset+1];
948     save_offset3 = md->offset_vector[md->offset_end - number];
949     save_capture_last = md->capture_last;
950    
951     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952 ph10 625
953     /* Each time round the loop, save the current subject position for use
954     when the group matches. For MATCH_MATCH, the group has matched, so we
955     restart it with a new subject starting position, remembering that we had
956     at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957     usual. If we haven't matched any alternatives in any iteration, check to
958     see if a previous iteration matched. If so, the group has matched;
959     continue from afterwards. Otherwise it has failed; restore the previous
960 ph10 604 capture values before returning NOMATCH. */
961 ph10 625
962 ph10 604 for (;;)
963     {
964     md->offset_vector[md->offset_end - number] =
965     (int)(eptr - md->start_subject);
966 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968     eptrb, RM63);
969     if (rrc == MATCH_KETRPOS)
970     {
971     offset_top = md->end_offset_top;
972     eptr = md->end_match_ptr;
973 ph10 625 ecode = md->start_code + code_offset;
974 ph10 604 save_capture_last = md->capture_last;
975 ph10 625 matched_once = TRUE;
976     continue;
977     }
978 ph10 604 if (rrc != MATCH_NOMATCH &&
979     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980     RRETURN(rrc);
981     md->capture_last = save_capture_last;
982     ecode += GET(ecode, 1);
983 ph10 625 if (*ecode != OP_ALT) break;
984 ph10 604 }
985 ph10 610
986 ph10 604 if (!matched_once)
987 ph10 625 {
988 ph10 604 md->offset_vector[offset] = save_offset1;
989     md->offset_vector[offset+1] = save_offset2;
990     md->offset_vector[md->offset_end - number] = save_offset3;
991     }
992 ph10 625
993 ph10 609 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 ph10 604 if (allow_zero || matched_once)
995 ph10 625 {
996 ph10 604 ecode += 1 + LINK_SIZE;
997     break;
998 ph10 625 }
999    
1000 ph10 604 RRETURN(MATCH_NOMATCH);
1001     }
1002 ph10 625
1003 ph10 604 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004     as a non-capturing bracket. */
1005    
1006     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008    
1009     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010    
1011     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013    
1014 ph10 625 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 ph10 604 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1016     without the capturing complication. It is written out separately for speed
1017     and cleanliness. */
1018    
1019     case OP_BRAPOS:
1020     case OP_SBRAPOS:
1021 ph10 625 allow_zero = FALSE;
1022    
1023 ph10 604 POSSESSIVE_NON_CAPTURE:
1024     matched_once = FALSE;
1025 ph10 625 code_offset = ecode - md->start_code;
1026 ph10 604
1027     for (;;)
1028     {
1029 ph10 625 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 ph10 604 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 ph10 609 eptrb, RM48);
1032 ph10 604 if (rrc == MATCH_KETRPOS)
1033     {
1034 ph10 610 offset_top = md->end_offset_top;
1035 ph10 604 eptr = md->end_match_ptr;
1036 ph10 625 ecode = md->start_code + code_offset;
1037     matched_once = TRUE;
1038     continue;
1039     }
1040 ph10 604 if (rrc != MATCH_NOMATCH &&
1041     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042     RRETURN(rrc);
1043     ecode += GET(ecode, 1);
1044 ph10 625 if (*ecode != OP_ALT) break;
1045 ph10 604 }
1046 ph10 625
1047     if (matched_once || allow_zero)
1048 ph10 604 {
1049     ecode += 1 + LINK_SIZE;
1050     break;
1051 ph10 625 }
1052 ph10 604 RRETURN(MATCH_NOMATCH);
1053    
1054     /* Control never reaches here. */
1055    
1056 nigel 77 /* Conditional group: compilation checked that there are no more than
1057     two branches. If the condition is false, skipping the first branch takes us
1058     past the end if there is only one branch, but that's OK because that is
1059 ph10 609 exactly what going to the ket would do. */
1060 nigel 77
1061     case OP_COND:
1062 nigel 93 case OP_SCOND:
1063 ph10 604 codelink = GET(ecode, 1);
1064 ph10 406
1065 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
1066     inserted between OP_COND and an assertion condition. */
1067 ph10 392
1068 ph10 381 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069     {
1070     if (pcre_callout != NULL)
1071     {
1072     pcre_callout_block cb;
1073     cb.version = 1; /* Version 1 of the callout block */
1074     cb.callout_number = ecode[LINK_SIZE+2];
1075     cb.offset_vector = md->offset_vector;
1076     cb.subject = (PCRE_SPTR)md->start_subject;
1077 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078     cb.start_match = (int)(mstart - md->start_subject);
1079     cb.current_position = (int)(eptr - md->start_subject);
1080 ph10 381 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082     cb.capture_top = offset_top/2;
1083     cb.capture_last = md->capture_last;
1084     cb.callout_data = md->callout_data;
1085 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 ph10 381 if (rrc < 0) RRETURN(rrc);
1087     }
1088     ecode += _pcre_OP_lengths[OP_CALLOUT];
1089     }
1090 ph10 392
1091 ph10 399 condcode = ecode[LINK_SIZE+1];
1092 ph10 406
1093 ph10 381 /* Now see what the actual condition is */
1094 ph10 392
1095 ph10 459 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 nigel 77 {
1097 ph10 459 if (md->recursive == NULL) /* Not recursing => FALSE */
1098     {
1099 ph10 461 condition = FALSE;
1100     ecode += GET(ecode, 1);
1101     }
1102 ph10 459 else
1103 ph10 461 {
1104 ph10 459 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105     condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106 ph10 461
1107 ph10 459 /* If the test is for recursion into a specific subpattern, and it is
1108     false, but the test was set up by name, scan the table to see if the
1109     name refers to any other numbers, and test them. The condition is true
1110     if any one is set. */
1111 ph10 461
1112 ph10 459 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113     {
1114     uschar *slotA = md->name_table;
1115     for (i = 0; i < md->name_count; i++)
1116 ph10 461 {
1117     if (GET2(slotA, 0) == recno) break;
1118 ph10 459 slotA += md->name_entry_size;
1119     }
1120 ph10 461
1121 ph10 459 /* Found a name for the number - there can be only one; duplicate
1122     names for different numbers are allowed, but not vice versa. First
1123     scan down for duplicates. */
1124 ph10 461
1125 ph10 459 if (i < md->name_count)
1126 ph10 461 {
1127 ph10 459 uschar *slotB = slotA;
1128     while (slotB > md->name_table)
1129     {
1130     slotB -= md->name_entry_size;
1131     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132     {
1133     condition = GET2(slotB, 0) == md->recursive->group_num;
1134 ph10 461 if (condition) break;
1135     }
1136 ph10 459 else break;
1137 ph10 461 }
1138    
1139 ph10 459 /* Scan up for duplicates */
1140 ph10 461
1141 ph10 459 if (!condition)
1142 ph10 461 {
1143 ph10 459 slotB = slotA;
1144     for (i++; i < md->name_count; i++)
1145     {
1146     slotB += md->name_entry_size;
1147     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148     {
1149     condition = GET2(slotB, 0) == md->recursive->group_num;
1150     if (condition) break;
1151 ph10 461 }
1152 ph10 459 else break;
1153 ph10 461 }
1154     }
1155 ph10 459 }
1156 ph10 461 }
1157    
1158 ph10 459 /* Choose branch according to the condition */
1159 ph10 461
1160 ph10 459 ecode += condition? 3 : GET(ecode, 1);
1161     }
1162 ph10 461 }
1163 nigel 93
1164 ph10 459 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 nigel 93 {
1166 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168 ph10 461
1169 ph10 459 /* If the numbered capture is unset, but the reference was by name,
1170 ph10 461 scan the table to see if the name refers to any other numbers, and test
1171     them. The condition is true if any one is set. This is tediously similar
1172     to the code above, but not close enough to try to amalgamate. */
1173    
1174 ph10 459 if (!condition && condcode == OP_NCREF)
1175     {
1176 ph10 461 int refno = offset >> 1;
1177 ph10 459 uschar *slotA = md->name_table;
1178 ph10 461
1179 ph10 459 for (i = 0; i < md->name_count; i++)
1180 ph10 461 {
1181     if (GET2(slotA, 0) == refno) break;
1182 ph10 459 slotA += md->name_entry_size;
1183     }
1184 ph10 461
1185     /* Found a name for the number - there can be only one; duplicate names
1186     for different numbers are allowed, but not vice versa. First scan down
1187 ph10 459 for duplicates. */
1188 ph10 461
1189 ph10 459 if (i < md->name_count)
1190 ph10 461 {
1191 ph10 459 uschar *slotB = slotA;
1192     while (slotB > md->name_table)
1193     {
1194     slotB -= md->name_entry_size;
1195     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196     {
1197     offset = GET2(slotB, 0) << 1;
1198 ph10 461 condition = offset < offset_top &&
1199 ph10 459 md->offset_vector[offset] >= 0;
1200 ph10 461 if (condition) break;
1201     }
1202 ph10 459 else break;
1203 ph10 461 }
1204    
1205 ph10 459 /* Scan up for duplicates */
1206 ph10 461
1207 ph10 459 if (!condition)
1208 ph10 461 {
1209 ph10 459 slotB = slotA;
1210     for (i++; i < md->name_count; i++)
1211     {
1212     slotB += md->name_entry_size;
1213     if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214     {
1215     offset = GET2(slotB, 0) << 1;
1216 ph10 461 condition = offset < offset_top &&
1217 ph10 459 md->offset_vector[offset] >= 0;
1218 ph10 461 if (condition) break;
1219     }
1220 ph10 459 else break;
1221 ph10 461 }
1222     }
1223 ph10 459 }
1224 ph10 461 }
1225    
1226 ph10 459 /* Choose branch according to the condition */
1227    
1228 nigel 93 ecode += condition? 3 : GET(ecode, 1);
1229 nigel 77 }
1230    
1231 ph10 399 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 nigel 93 {
1233     condition = FALSE;
1234     ecode += GET(ecode, 1);
1235     }
1236    
1237 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
1238 ph10 604 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239     an assertion. */
1240 nigel 77
1241     else
1242     {
1243 ph10 625 md->match_function_type = MATCH_CONDASSERT;
1244 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 nigel 77 if (rrc == MATCH_MATCH)
1246     {
1247 ph10 619 if (md->end_offset_top > offset_top)
1248     offset_top = md->end_offset_top; /* Captures may have happened */
1249 nigel 93 condition = TRUE;
1250     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252     }
1253 ph10 550 else if (rrc != MATCH_NOMATCH &&
1254     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 nigel 77 {
1256     RRETURN(rrc); /* Need braces because of following else */
1257     }
1258 nigel 93 else
1259     {
1260     condition = FALSE;
1261 ph10 399 ecode += codelink;
1262 nigel 93 }
1263     }
1264 nigel 91
1265 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
1266 ph10 609 we used to use tail recursion to avoid using another stack frame, except
1267 ph10 625 when there was unlimited repeat of a possibly empty group. However, that
1268     strategy no longer works because of the possibility of (*THEN) being
1269 ph10 609 encountered in the branch. A recursive call to match() is always required,
1270     unless the second alternative doesn't exist, in which case we can just
1271     plough on. */
1272 nigel 91
1273 nigel 93 if (condition || *ecode == OP_ALT)
1274     {
1275 ph10 625 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276 ph10 609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277 ph10 625 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278 ph10 609 rrc = MATCH_NOMATCH;
1279     RRETURN(rrc);
1280 nigel 77 }
1281 ph10 395 else /* Condition false & no alternative */
1282 nigel 93 {
1283     ecode += 1 + LINK_SIZE;
1284     }
1285     break;
1286 nigel 77
1287 ph10 461
1288 ph10 447 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289     to close any currently open capturing brackets. */
1290 ph10 461
1291 ph10 447 case OP_CLOSE:
1292 ph10 461 number = GET2(ecode, 1);
1293 ph10 447 offset = number << 1;
1294 ph10 461
1295 ph10 475 #ifdef PCRE_DEBUG
1296 ph10 447 printf("end bracket %d at *ACCEPT", number);
1297     printf("\n");
1298     #endif
1299 nigel 77
1300 ph10 447 md->capture_last = number;
1301     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302     {
1303     md->offset_vector[offset] =
1304     md->offset_vector[md->offset_end - number];
1305 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 ph10 447 if (offset_top <= offset) offset_top = offset + 2;
1307     }
1308     ecode += 3;
1309 ph10 461 break;
1310 ph10 447
1311    
1312 ph10 619 /* End of the pattern, either real or forced. */
1313 nigel 77
1314 ph10 619 case OP_END:
1315 ph10 210 case OP_ACCEPT:
1316 ph10 625 case OP_ASSERT_ACCEPT:
1317    
1318 ph10 619 /* If we have matched an empty string, fail if not in an assertion and not
1319     in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 ph10 613 is set and we have matched at the start of the subject. In both cases,
1321     backtracking will then try other alternatives, if any. */
1322 ph10 443
1323 ph10 619 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 ph10 618 md->recursive == NULL &&
1325 ph10 619 (md->notempty ||
1326     (md->notempty_atstart &&
1327     mstart == md->start_subject + md->start_offset)))
1328 ph10 510 MRRETURN(MATCH_NOMATCH);
1329 ph10 443
1330 ph10 442 /* Otherwise, we have a match. */
1331 ph10 625
1332 ph10 168 md->end_match_ptr = eptr; /* Record where we ended */
1333     md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335 nigel 77
1336 ph10 512 /* For some reason, the macros don't work properly if an expression is
1337     given as the argument to MRRETURN when the heap is in use. */
1338    
1339     rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340     MRRETURN(rrc);
1341    
1342 nigel 77 /* Assertion brackets. Check the alternative branches in turn - the
1343     matching won't pass the KET for an assertion. If any one branch matches,
1344     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345     start of each branch to move the current point backwards, so the code at
1346 ph10 625 this level is identical to the lookahead case. When the assertion is part
1347     of a condition, we want to return immediately afterwards. The caller of
1348     this incarnation of the match() function will have set MATCH_CONDASSERT in
1349     md->match_function_type, and one of these opcodes will be the first opcode
1350     that is processed. We use a local variable that is preserved over calls to
1351 ph10 604 match() to remember this case. */
1352 nigel 77
1353     case OP_ASSERT:
1354     case OP_ASSERTBACK:
1355 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1356     {
1357     condassert = TRUE;
1358     md->match_function_type = 0;
1359     }
1360 ph10 625 else condassert = FALSE;
1361    
1362 nigel 77 do
1363     {
1364 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 ph10 500 {
1367     mstart = md->start_match_ptr; /* In case \K reset it */
1368 ph10 630 markptr = md->mark;
1369 ph10 500 break;
1370 ph10 501 }
1371 ph10 550 if (rrc != MATCH_NOMATCH &&
1372     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1373     RRETURN(rrc);
1374 nigel 77 ecode += GET(ecode, 1);
1375     }
1376     while (*ecode == OP_ALT);
1377 ph10 625
1378 ph10 510 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1379 nigel 77
1380     /* If checking an assertion for a condition, return MATCH_MATCH. */
1381    
1382 ph10 604 if (condassert) RRETURN(MATCH_MATCH);
1383 nigel 77
1384     /* Continue from after the assertion, updating the offsets high water
1385     mark, since extracts may have been taken during the assertion. */
1386    
1387     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1388     ecode += 1 + LINK_SIZE;
1389     offset_top = md->end_offset_top;
1390     continue;
1391    
1392 ph10 473 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1393 ph10 482 PRUNE, or COMMIT means we must assume failure without checking subsequent
1394 ph10 473 branches. */
1395 nigel 77
1396     case OP_ASSERT_NOT:
1397     case OP_ASSERTBACK_NOT:
1398 ph10 604 if (md->match_function_type == MATCH_CONDASSERT)
1399     {
1400     condassert = TRUE;
1401     md->match_function_type = 0;
1402     }
1403 ph10 625 else condassert = FALSE;
1404 ph10 604
1405 nigel 77 do
1406     {
1407 ph10 604 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1408 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1409 ph10 473 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1410     {
1411     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1412 ph10 482 break;
1413     }
1414 ph10 550 if (rrc != MATCH_NOMATCH &&
1415     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1416     RRETURN(rrc);
1417 nigel 77 ecode += GET(ecode,1);
1418     }
1419     while (*ecode == OP_ALT);
1420    
1421 ph10 604 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1422 ph10 625
1423 nigel 77 ecode += 1 + LINK_SIZE;
1424     continue;
1425    
1426     /* Move the subject pointer back. This occurs only at the start of
1427     each branch of a lookbehind assertion. If we are too close to the start to
1428     move back, this match function fails. When working with UTF-8 we move
1429     back a number of characters, not bytes. */
1430    
1431     case OP_REVERSE:
1432     #ifdef SUPPORT_UTF8
1433     if (utf8)
1434     {
1435 nigel 93 i = GET(ecode, 1);
1436     while (i-- > 0)
1437 nigel 77 {
1438     eptr--;
1439 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1440 ph10 207 BACKCHAR(eptr);
1441 nigel 77 }
1442     }
1443     else
1444     #endif
1445    
1446     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1447    
1448     {
1449 nigel 93 eptr -= GET(ecode, 1);
1450 ph10 510 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1451 nigel 77 }
1452    
1453 ph10 435 /* Save the earliest consulted character, then skip to next op code */
1454 nigel 77
1455 ph10 435 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1456 nigel 77 ecode += 1 + LINK_SIZE;
1457     break;
1458    
1459     /* The callout item calls an external function, if one is provided, passing
1460     details of the match so far. This is mainly for debugging, though the
1461     function is able to force a failure. */
1462    
1463     case OP_CALLOUT:
1464     if (pcre_callout != NULL)
1465     {
1466     pcre_callout_block cb;
1467     cb.version = 1; /* Version 1 of the callout block */
1468     cb.callout_number = ecode[1];
1469     cb.offset_vector = md->offset_vector;
1470 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1471 ph10 530 cb.subject_length = (int)(md->end_subject - md->start_subject);
1472     cb.start_match = (int)(mstart - md->start_subject);
1473     cb.current_position = (int)(eptr - md->start_subject);
1474 nigel 77 cb.pattern_position = GET(ecode, 2);
1475     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1476     cb.capture_top = offset_top/2;
1477     cb.capture_last = md->capture_last;
1478     cb.callout_data = md->callout_data;
1479 ph10 510 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1480 nigel 77 if (rrc < 0) RRETURN(rrc);
1481     }
1482     ecode += 2 + 2*LINK_SIZE;
1483     break;
1484    
1485     /* Recursion either matches the current regex, or some subexpression. The
1486     offset data is the offset to the starting bracket from the start of the
1487     whole pattern. (This is so that it works from duplicated subpatterns.)
1488 ph10 625
1489 ph10 618 The state of the capturing groups is preserved over recursion, and
1490 ph10 625 re-instated afterwards. We don't know how many are started and not yet
1491 ph10 618 finished (offset_top records the completed total) so we just have to save
1492     all the potential data. There may be up to 65535 such values, which is too
1493     large to put on the stack, but using malloc for small numbers seems
1494     expensive. As a compromise, the stack is used when there are no more than
1495     REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1496 nigel 77
1497     There are also other values that have to be saved. We use a chained
1498     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1499 ph10 625 for the original version of this logic. It has, however, been hacked around
1500 ph10 618 a lot, so he is not to blame for the current way it works. */
1501 nigel 77
1502     case OP_RECURSE:
1503     {
1504     callpat = md->start_code + GET(ecode, 1);
1505 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1506     GET2(callpat, 1 + LINK_SIZE);
1507 nigel 77
1508     /* Add to "recursing stack" */
1509    
1510     new_recursive.prevrec = md->recursive;
1511     md->recursive = &new_recursive;
1512    
1513 ph10 618 /* Where to continue from afterwards */
1514 nigel 77
1515     ecode += 1 + LINK_SIZE;
1516    
1517 ph10 618 /* Now save the offset data */
1518 nigel 77
1519     new_recursive.saved_max = md->offset_end;
1520     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1521     new_recursive.offset_save = stacksave;
1522     else
1523     {
1524     new_recursive.offset_save =
1525     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1526     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1527     }
1528     memcpy(new_recursive.offset_save, md->offset_vector,
1529     new_recursive.saved_max * sizeof(int));
1530 ph10 625
1531 ph10 618 /* OK, now we can do the recursion. After processing each alternative,
1532 ph10 625 restore the offset data. If there were nested recursions, md->recursive
1533 ph10 618 might be changed, so reset it before looping. */
1534 nigel 77
1535     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1536 ph10 604 cbegroup = (*callpat >= OP_SBRA);
1537 nigel 77 do
1538     {
1539 ph10 604 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1540 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1541 ph10 604 md, eptrb, RM6);
1542 ph10 618 memcpy(md->offset_vector, new_recursive.offset_save,
1543     new_recursive.saved_max * sizeof(int));
1544 ph10 511 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1545 nigel 77 {
1546 nigel 87 DPRINTF(("Recursion matched\n"));
1547 nigel 77 md->recursive = new_recursive.prevrec;
1548     if (new_recursive.offset_save != stacksave)
1549     (pcre_free)(new_recursive.offset_save);
1550 ph10 618
1551     /* Set where we got to in the subject, and reset the start in case
1552 ph10 625 it was changed by \K. This *is* propagated back out of a recursion,
1553     for Perl compatibility. */
1554    
1555 ph10 618 eptr = md->end_match_ptr;
1556     mstart = md->start_match_ptr;
1557     goto RECURSION_MATCHED; /* Exit loop; end processing */
1558 nigel 77 }
1559 ph10 550 else if (rrc != MATCH_NOMATCH &&
1560     (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1561 nigel 87 {
1562     DPRINTF(("Recursion gave error %d\n", rrc));
1563 ph10 400 if (new_recursive.offset_save != stacksave)
1564     (pcre_free)(new_recursive.offset_save);
1565 nigel 87 RRETURN(rrc);
1566     }
1567 nigel 77
1568     md->recursive = &new_recursive;
1569     callpat += GET(callpat, 1);
1570     }
1571     while (*callpat == OP_ALT);
1572    
1573     DPRINTF(("Recursion didn't match\n"));
1574     md->recursive = new_recursive.prevrec;
1575     if (new_recursive.offset_save != stacksave)
1576     (pcre_free)(new_recursive.offset_save);
1577 ph10 510 MRRETURN(MATCH_NOMATCH);
1578 nigel 77 }
1579 ph10 625
1580 ph10 618 RECURSION_MATCHED:
1581     break;
1582 nigel 77
1583     /* An alternation is the end of a branch; scan along to find the end of the
1584     bracketed group and go to there. */
1585    
1586     case OP_ALT:
1587     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1588     break;
1589    
1590 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1591     indicating that it may occur zero times. It may repeat infinitely, or not
1592     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1593     with fixed upper repeat limits are compiled as a number of copies, with the
1594     optional ones preceded by BRAZERO or BRAMINZERO. */
1595 ph10 625
1596 nigel 77 case OP_BRAZERO:
1597 ph10 604 next = ecode + 1;
1598     RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1599     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1600     do next += GET(next, 1); while (*next == OP_ALT);
1601     ecode = next + 1 + LINK_SIZE;
1602 nigel 77 break;
1603 ph10 625
1604 nigel 77 case OP_BRAMINZERO:
1605 ph10 604 next = ecode + 1;
1606     do next += GET(next, 1); while (*next == OP_ALT);
1607     RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1608     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1609     ecode++;
1610 nigel 77 break;
1611    
1612 ph10 335 case OP_SKIPZERO:
1613 ph10 604 next = ecode+1;
1614     do next += GET(next,1); while (*next == OP_ALT);
1615     ecode = next + 1 + LINK_SIZE;
1616 ph10 335 break;
1617 ph10 625
1618 ph10 604 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1619     here; just jump to the group, with allow_zero set TRUE. */
1620 ph10 625
1621 ph10 604 case OP_BRAPOSZERO:
1622 ph10 625 op = *(++ecode);
1623 ph10 604 allow_zero = TRUE;
1624     if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1625     goto POSSESSIVE_NON_CAPTURE;
1626 ph10 335
1627 nigel 93 /* End of a group, repeated or non-repeating. */
1628 nigel 77
1629     case OP_KET:
1630     case OP_KETRMIN:
1631     case OP_KETRMAX:
1632 ph10 625 case OP_KETRPOS:
1633 nigel 91 prev = ecode - GET(ecode, 1);
1634 ph10 625
1635 nigel 93 /* If this was a group that remembered the subject start, in order to break
1636     infinite repeats of empty string matches, retrieve the subject start from
1637     the chain. Otherwise, set it NULL. */
1638 nigel 77
1639 ph10 618 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1640 nigel 93 {
1641     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1642     eptrb = eptrb->epb_prev; /* Backup to previous group */
1643     }
1644     else saved_eptr = NULL;
1645 nigel 77
1646 ph10 618 /* If we are at the end of an assertion group, stop matching and return
1647     MATCH_MATCH, but record the current high water mark for use by positive
1648     assertions. We also need to record the match start in case it was changed
1649     by \K. */
1650 nigel 93
1651 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1652 ph10 618 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1653 nigel 91 {
1654     md->end_match_ptr = eptr; /* For ONCE */
1655     md->end_offset_top = offset_top;
1656 ph10 500 md->start_match_ptr = mstart;
1657 ph10 630 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1658 nigel 91 }
1659 nigel 77
1660 nigel 93 /* For capturing groups we have to check the group number back at the start
1661     and if necessary complete handling an extraction by setting the offsets and
1662 ph10 618 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1663     into group 0, so it won't be picked up here. Instead, we catch it when the
1664     OP_END is reached. Other recursion is handled here. We just have to record
1665     the current subject position and start match pointer and give a MATCH
1666     return. */
1667 nigel 77
1668 ph10 604 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1669     *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1670 nigel 91 {
1671 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1672 nigel 91 offset = number << 1;
1673 ph10 461
1674 ph10 475 #ifdef PCRE_DEBUG
1675 nigel 91 printf("end bracket %d", number);
1676     printf("\n");
1677 nigel 77 #endif
1678    
1679 ph10 618 /* Handle a recursively called group. */
1680    
1681     if (md->recursive != NULL && md->recursive->group_num == number)
1682     {
1683     md->end_match_ptr = eptr;
1684     md->start_match_ptr = mstart;
1685     RRETURN(MATCH_MATCH);
1686     }
1687    
1688     /* Deal with capturing */
1689    
1690 nigel 93 md->capture_last = number;
1691     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1692 nigel 91 {
1693 ph10 625 /* If offset is greater than offset_top, it means that we are
1694     "skipping" a capturing group, and that group's offsets must be marked
1695     unset. In earlier versions of PCRE, all the offsets were unset at the
1696     start of matching, but this doesn't work because atomic groups and
1697 ph10 615 assertions can cause a value to be set that should later be unset.
1698     Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1699 ph10 625 part of the atomic group, but this is not on the final matching path,
1700     so must be unset when 2 is set. (If there is no group 2, there is no
1701 ph10 615 problem, because offset_top will then be 2, indicating no capture.) */
1702 ph10 625
1703 ph10 615 if (offset > offset_top)
1704     {
1705     register int *iptr = md->offset_vector + offset_top;
1706     register int *iend = md->offset_vector + offset;
1707     while (iptr < iend) *iptr++ = -1;
1708 ph10 625 }
1709    
1710 ph10 615 /* Now make the extraction */
1711    
1712 nigel 93 md->offset_vector[offset] =
1713     md->offset_vector[md->offset_end - number];
1714 ph10 530 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1715 nigel 93 if (offset_top <= offset) offset_top = offset + 2;
1716     }
1717 nigel 91 }
1718 nigel 77
1719 ph10 618 /* For an ordinary non-repeating ket, just continue at this level. This
1720     also happens for a repeating ket if no characters were matched in the
1721     group. This is the forcible breaking of infinite loops as implemented in
1722 ph10 625 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1723     processing the rest of the pattern at a lower level. If this results in a
1724     NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1725     bypassing intermediate backup points, but resetting any captures that
1726 ph10 618 happened along the way. */
1727 nigel 77
1728 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1729     {
1730 ph10 618 if (*prev == OP_ONCE)
1731     {
1732     RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1733     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1734     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1735 ph10 625 RRETURN(MATCH_ONCE);
1736     }
1737 ph10 618 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1738 nigel 91 break;
1739     }
1740 ph10 625
1741     /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1742 ph10 604 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1743     at a time from the outer level, thus saving stack. */
1744 ph10 625
1745 ph10 604 if (*ecode == OP_KETRPOS)
1746 ph10 625 {
1747 ph10 604 md->end_match_ptr = eptr;
1748 ph10 625 md->end_offset_top = offset_top;
1749 ph10 604 RRETURN(MATCH_KETRPOS);
1750 ph10 625 }
1751 nigel 77
1752 ph10 604 /* The normal repeating kets try the rest of the pattern or restart from
1753     the preceding bracket, in the appropriate order. In the second case, we can
1754     use tail recursion to avoid using another stack frame, unless we have an
1755 ph10 618 atomic group or an unlimited repeat of a group that can match an empty
1756     string. */
1757 nigel 77
1758 nigel 91 if (*ecode == OP_KETRMIN)
1759     {
1760 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1761 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1762 ph10 618 if (*prev == OP_ONCE)
1763     {
1764 ph10 623 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1765 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1766     md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1767 ph10 625 RRETURN(MATCH_ONCE);
1768     }
1769 ph10 604 if (*prev >= OP_SBRA) /* Could match an empty string */
1770 ph10 197 {
1771 ph10 625 md->match_function_type = MATCH_CBEGROUP;
1772 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1773 ph10 197 RRETURN(rrc);
1774     }
1775 nigel 91 ecode = prev;
1776     goto TAIL_RECURSE;
1777 nigel 77 }
1778 nigel 91 else /* OP_KETRMAX */
1779     {
1780 ph10 625 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1781 ph10 604 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1782 ph10 618 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1783 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 ph10 618 if (*prev == OP_ONCE)
1785     {
1786 ph10 623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1787 ph10 618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1788     md->once_target = prev;
1789 ph10 625 RRETURN(MATCH_ONCE);
1790     }
1791 nigel 91 ecode += 1 + LINK_SIZE;
1792     goto TAIL_RECURSE;
1793     }
1794     /* Control never gets here */
1795 nigel 77
1796 ph10 602 /* Not multiline mode: start of subject assertion, unless notbol. */
1797 nigel 77
1798     case OP_CIRC:
1799 ph10 510 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1800 ph10 625
1801 nigel 77 /* Start of subject assertion */
1802    
1803     case OP_SOD:
1804 ph10 510 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1805 nigel 77 ecode++;
1806     break;
1807 ph10 625
1808 ph10 602 /* Multiline mode: start of subject unless notbol, or after any newline. */
1809 nigel 77
1810 ph10 602 case OP_CIRCM:
1811     if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1812     if (eptr != md->start_subject &&
1813     (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1814     MRRETURN(MATCH_NOMATCH);
1815     ecode++;
1816     break;
1817    
1818 nigel 77 /* Start of match assertion */
1819    
1820     case OP_SOM:
1821 ph10 510 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1822 nigel 77 ecode++;
1823     break;
1824 ph10 172
1825 ph10 168 /* Reset the start of match point */
1826 ph10 172
1827 ph10 168 case OP_SET_SOM:
1828     mstart = eptr;
1829 ph10 172 ecode++;
1830     break;
1831 nigel 77
1832 ph10 602 /* Multiline mode: assert before any newline, or before end of subject
1833     unless noteol is set. */
1834 nigel 77
1835 ph10 602 case OP_DOLLM:
1836     if (eptr < md->end_subject)
1837     { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1838     else
1839 nigel 77 {
1840 ph10 510 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1841 ph10 602 SCHECK_PARTIAL();
1842 nigel 77 }
1843 ph10 602 ecode++;
1844     break;
1845 ph10 579
1846 ph10 625 /* Not multiline mode: assert before a terminating newline or before end of
1847 ph10 602 subject unless noteol is set. */
1848    
1849     case OP_DOLL:
1850     if (md->noteol) MRRETURN(MATCH_NOMATCH);
1851     if (!md->endonly) goto ASSERT_NL_OR_EOS;
1852    
1853 nigel 91 /* ... else fall through for endonly */
1854 nigel 77
1855     /* End of subject assertion (\z) */
1856    
1857     case OP_EOD:
1858 ph10 510 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1859 ph10 553 SCHECK_PARTIAL();
1860 nigel 77 ecode++;
1861     break;
1862    
1863     /* End of subject or ending \n assertion (\Z) */
1864    
1865     case OP_EODN:
1866 ph10 553 ASSERT_NL_OR_EOS:
1867     if (eptr < md->end_subject &&
1868 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1869 ph10 510 MRRETURN(MATCH_NOMATCH);
1870 ph10 579
1871 ph10 553 /* Either at end of string or \n before end. */
1872 ph10 579
1873 ph10 553 SCHECK_PARTIAL();
1874 nigel 77 ecode++;
1875     break;
1876    
1877     /* Word boundary assertions */
1878    
1879     case OP_NOT_WORD_BOUNDARY:
1880     case OP_WORD_BOUNDARY:
1881     {
1882    
1883     /* Find out if the previous and current characters are "word" characters.
1884     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1885 ph10 443 be "non-word" characters. Remember the earliest consulted character for
1886 ph10 435 partial matching. */
1887 nigel 77
1888     #ifdef SUPPORT_UTF8
1889     if (utf8)
1890     {
1891 ph10 518 /* Get status of previous character */
1892 ph10 527
1893 nigel 77 if (eptr == md->start_subject) prev_is_word = FALSE; else
1894     {
1895 ph10 409 USPTR lastptr = eptr - 1;
1896 nigel 77 while((*lastptr & 0xc0) == 0x80) lastptr--;
1897 ph10 443 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1898 nigel 77 GETCHAR(c, lastptr);
1899 ph10 527 #ifdef SUPPORT_UCP
1900 ph10 518 if (md->use_ucp)
1901     {
1902     if (c == '_') prev_is_word = TRUE; else
1903 ph10 527 {
1904 ph10 518 int cat = UCD_CATEGORY(c);
1905     prev_is_word = (cat == ucp_L || cat == ucp_N);
1906 ph10 527 }
1907     }
1908     else
1909     #endif
1910 nigel 77 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1911     }
1912 ph10 527
1913 ph10 518 /* Get status of next character */
1914 ph10 527
1915 ph10 443 if (eptr >= md->end_subject)
1916 nigel 77 {
1917 ph10 443 SCHECK_PARTIAL();
1918     cur_is_word = FALSE;
1919 ph10 428 }
1920     else
1921     {
1922 nigel 77 GETCHAR(c, eptr);
1923 ph10 527 #ifdef SUPPORT_UCP
1924 ph10 518 if (md->use_ucp)
1925     {
1926     if (c == '_') cur_is_word = TRUE; else
1927 ph10 527 {
1928 ph10 518 int cat = UCD_CATEGORY(c);
1929     cur_is_word = (cat == ucp_L || cat == ucp_N);
1930 ph10 527 }
1931     }
1932     else
1933     #endif
1934 nigel 77 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1935     }
1936     }
1937     else
1938     #endif
1939    
1940 ph10 527 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1941 ph10 518 consistency with the behaviour of \w we do use it in this case. */
1942 nigel 77
1943     {
1944 ph10 518 /* Get status of previous character */
1945 ph10 527
1946 ph10 435 if (eptr == md->start_subject) prev_is_word = FALSE; else
1947     {
1948 ph10 443 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1949 ph10 527 #ifdef SUPPORT_UCP
1950 ph10 518 if (md->use_ucp)
1951     {
1952 ph10 527 c = eptr[-1];
1953 ph10 518 if (c == '_') prev_is_word = TRUE; else
1954 ph10 527 {
1955 ph10 518 int cat = UCD_CATEGORY(c);
1956     prev_is_word = (cat == ucp_L || cat == ucp_N);
1957 ph10 527 }
1958     }
1959     else
1960     #endif
1961 ph10 435 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1962     }
1963 ph10 527
1964 ph10 518 /* Get status of next character */
1965 ph10 527
1966 ph10 443 if (eptr >= md->end_subject)
1967 ph10 428 {
1968 ph10 443 SCHECK_PARTIAL();
1969     cur_is_word = FALSE;
1970 ph10 428 }
1971 ph10 527 else
1972     #ifdef SUPPORT_UCP
1973 ph10 518 if (md->use_ucp)
1974     {
1975 ph10 527 c = *eptr;
1976 ph10 518 if (c == '_') cur_is_word = TRUE; else
1977 ph10 527 {
1978 ph10 518 int cat = UCD_CATEGORY(c);
1979     cur_is_word = (cat == ucp_L || cat == ucp_N);
1980 ph10 527 }
1981     }
1982     else
1983     #endif
1984 ph10 518 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1985 nigel 77 }
1986    
1987     /* Now see if the situation is what we want */
1988    
1989     if ((*ecode++ == OP_WORD_BOUNDARY)?
1990     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1991 ph10 510 MRRETURN(MATCH_NOMATCH);
1992 nigel 77 }
1993     break;
1994    
1995     /* Match a single character type; inline for speed */
1996    
1997     case OP_ANY:
1998 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1999 ph10 345 /* Fall through */
2000    
2001 ph10 341 case OP_ALLANY:
2002 ph10 443 if (eptr++ >= md->end_subject)
2003 ph10 428 {
2004 ph10 443 SCHECK_PARTIAL();
2005 ph10 510 MRRETURN(MATCH_NOMATCH);
2006 ph10 443 }
2007 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2008 nigel 77 ecode++;
2009     break;
2010    
2011     /* Match a single byte, even in UTF-8 mode. This opcode really does match
2012     any byte, even newline, independent of the setting of PCRE_DOTALL. */
2013    
2014     case OP_ANYBYTE:
2015 ph10 443 if (eptr++ >= md->end_subject)
2016 ph10 428 {
2017 ph10 443 SCHECK_PARTIAL();
2018 ph10 510 MRRETURN(MATCH_NOMATCH);
2019 ph10 443 }
2020 nigel 77 ecode++;
2021     break;
2022    
2023     case OP_NOT_DIGIT:
2024 ph10 443 if (eptr >= md->end_subject)
2025 ph10 428 {
2026 ph10 443 SCHECK_PARTIAL();
2027 ph10 510 MRRETURN(MATCH_NOMATCH);
2028 ph10 443 }
2029 nigel 77 GETCHARINCTEST(c, eptr);
2030     if (
2031     #ifdef SUPPORT_UTF8
2032     c < 256 &&
2033     #endif
2034     (md->ctypes[c] & ctype_digit) != 0
2035     )
2036 ph10 510 MRRETURN(MATCH_NOMATCH);
2037 nigel 77 ecode++;
2038     break;
2039    
2040     case OP_DIGIT:
2041 ph10 443 if (eptr >= md->end_subject)
2042 ph10 428 {
2043 ph10 443 SCHECK_PARTIAL();
2044 ph10 510 MRRETURN(MATCH_NOMATCH);
2045 ph10 443 }
2046 nigel 77 GETCHARINCTEST(c, eptr);
2047     if (
2048     #ifdef SUPPORT_UTF8
2049     c >= 256 ||
2050     #endif
2051     (md->ctypes[c] & ctype_digit) == 0
2052     )
2053 ph10 510 MRRETURN(MATCH_NOMATCH);
2054 nigel 77 ecode++;
2055     break;
2056    
2057     case OP_NOT_WHITESPACE:
2058 ph10 443 if (eptr >= md->end_subject)
2059 ph10 428 {
2060 ph10 443 SCHECK_PARTIAL();
2061 ph10 510 MRRETURN(MATCH_NOMATCH);
2062 ph10 443 }
2063 nigel 77 GETCHARINCTEST(c, eptr);
2064     if (
2065     #ifdef SUPPORT_UTF8
2066     c < 256 &&
2067     #endif
2068     (md->ctypes[c] & ctype_space) != 0
2069     )
2070 ph10 510 MRRETURN(MATCH_NOMATCH);
2071 nigel 77 ecode++;
2072     break;
2073    
2074     case OP_WHITESPACE:
2075 ph10 443 if (eptr >= md->end_subject)
2076 ph10 428 {
2077 ph10 443 SCHECK_PARTIAL();
2078 ph10 510 MRRETURN(MATCH_NOMATCH);
2079 ph10 443 }
2080 nigel 77 GETCHARINCTEST(c, eptr);
2081     if (
2082     #ifdef SUPPORT_UTF8
2083     c >= 256 ||
2084     #endif
2085     (md->ctypes[c] & ctype_space) == 0
2086     )
2087 ph10 510 MRRETURN(MATCH_NOMATCH);
2088 nigel 77 ecode++;
2089     break;
2090    
2091     case OP_NOT_WORDCHAR:
2092 ph10 443 if (eptr >= md->end_subject)
2093 ph10 428 {
2094 ph10 443 SCHECK_PARTIAL();
2095 ph10 510 MRRETURN(MATCH_NOMATCH);
2096 ph10 443 }
2097 nigel 77 GETCHARINCTEST(c, eptr);
2098     if (
2099     #ifdef SUPPORT_UTF8
2100     c < 256 &&
2101     #endif
2102     (md->ctypes[c] & ctype_word) != 0
2103     )
2104 ph10 510 MRRETURN(MATCH_NOMATCH);
2105 nigel 77 ecode++;
2106     break;
2107    
2108     case OP_WORDCHAR:
2109 ph10 443 if (eptr >= md->end_subject)
2110 ph10 428 {
2111 ph10 443 SCHECK_PARTIAL();
2112 ph10 510 MRRETURN(MATCH_NOMATCH);
2113 ph10 443 }
2114 nigel 77 GETCHARINCTEST(c, eptr);
2115     if (
2116     #ifdef SUPPORT_UTF8
2117     c >= 256 ||
2118     #endif
2119     (md->ctypes[c] & ctype_word) == 0
2120     )
2121 ph10 510 MRRETURN(MATCH_NOMATCH);
2122 nigel 77 ecode++;
2123     break;
2124    
2125 nigel 93 case OP_ANYNL:
2126 ph10 443 if (eptr >= md->end_subject)
2127 ph10 428 {
2128 ph10 443 SCHECK_PARTIAL();
2129 ph10 510 MRRETURN(MATCH_NOMATCH);
2130 ph10 443 }
2131 nigel 93 GETCHARINCTEST(c, eptr);
2132     switch(c)
2133     {
2134 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2135 ph10 625
2136 nigel 93 case 0x000d:
2137     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2138     break;
2139 ph10 231
2140 nigel 93 case 0x000a:
2141 ph10 231 break;
2142    
2143 nigel 93 case 0x000b:
2144     case 0x000c:
2145     case 0x0085:
2146     case 0x2028:
2147     case 0x2029:
2148 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2149 nigel 93 break;
2150     }
2151     ecode++;
2152     break;
2153    
2154 ph10 178 case OP_NOT_HSPACE:
2155 ph10 443 if (eptr >= md->end_subject)
2156 ph10 428 {
2157 ph10 443 SCHECK_PARTIAL();
2158 ph10 510 MRRETURN(MATCH_NOMATCH);
2159 ph10 443 }
2160 ph10 178 GETCHARINCTEST(c, eptr);
2161     switch(c)
2162     {
2163     default: break;
2164     case 0x09: /* HT */
2165     case 0x20: /* SPACE */
2166     case 0xa0: /* NBSP */
2167     case 0x1680: /* OGHAM SPACE MARK */
2168     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2169     case 0x2000: /* EN QUAD */
2170     case 0x2001: /* EM QUAD */
2171     case 0x2002: /* EN SPACE */
2172     case 0x2003: /* EM SPACE */
2173     case 0x2004: /* THREE-PER-EM SPACE */
2174     case 0x2005: /* FOUR-PER-EM SPACE */
2175     case 0x2006: /* SIX-PER-EM SPACE */
2176     case 0x2007: /* FIGURE SPACE */
2177     case 0x2008: /* PUNCTUATION SPACE */
2178     case 0x2009: /* THIN SPACE */
2179     case 0x200A: /* HAIR SPACE */
2180     case 0x202f: /* NARROW NO-BREAK SPACE */
2181     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2182     case 0x3000: /* IDEOGRAPHIC SPACE */
2183 ph10 510 MRRETURN(MATCH_NOMATCH);
2184 ph10 178 }
2185     ecode++;
2186     break;
2187    
2188     case OP_HSPACE:
2189 ph10 443 if (eptr >= md->end_subject)
2190 ph10 428 {
2191 ph10 443 SCHECK_PARTIAL();
2192 ph10 510 MRRETURN(MATCH_NOMATCH);
2193 ph10 443 }
2194 ph10 178 GETCHARINCTEST(c, eptr);
2195     switch(c)
2196     {
2197 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2198 ph10 178 case 0x09: /* HT */
2199     case 0x20: /* SPACE */
2200     case 0xa0: /* NBSP */
2201     case 0x1680: /* OGHAM SPACE MARK */
2202     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2203     case 0x2000: /* EN QUAD */
2204     case 0x2001: /* EM QUAD */
2205     case 0x2002: /* EN SPACE */
2206     case 0x2003: /* EM SPACE */
2207     case 0x2004: /* THREE-PER-EM SPACE */
2208     case 0x2005: /* FOUR-PER-EM SPACE */
2209     case 0x2006: /* SIX-PER-EM SPACE */
2210     case 0x2007: /* FIGURE SPACE */
2211     case 0x2008: /* PUNCTUATION SPACE */
2212     case 0x2009: /* THIN SPACE */
2213     case 0x200A: /* HAIR SPACE */
2214     case 0x202f: /* NARROW NO-BREAK SPACE */
2215     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2216     case 0x3000: /* IDEOGRAPHIC SPACE */
2217     break;
2218     }
2219     ecode++;
2220     break;
2221    
2222     case OP_NOT_VSPACE:
2223 ph10 443 if (eptr >= md->end_subject)
2224 ph10 428 {
2225 ph10 443 SCHECK_PARTIAL();
2226 ph10 510 MRRETURN(MATCH_NOMATCH);
2227 ph10 443 }
2228 ph10 178 GETCHARINCTEST(c, eptr);
2229     switch(c)
2230     {
2231     default: break;
2232     case 0x0a: /* LF */
2233     case 0x0b: /* VT */
2234     case 0x0c: /* FF */
2235     case 0x0d: /* CR */
2236     case 0x85: /* NEL */
2237     case 0x2028: /* LINE SEPARATOR */
2238     case 0x2029: /* PARAGRAPH SEPARATOR */
2239 ph10 510 MRRETURN(MATCH_NOMATCH);
2240 ph10 178 }
2241     ecode++;
2242     break;
2243    
2244     case OP_VSPACE:
2245 ph10 443 if (eptr >= md->end_subject)
2246 ph10 428 {
2247 ph10 443 SCHECK_PARTIAL();
2248 ph10 510 MRRETURN(MATCH_NOMATCH);
2249 ph10 443 }
2250 ph10 178 GETCHARINCTEST(c, eptr);
2251     switch(c)
2252     {
2253 ph10 510 default: MRRETURN(MATCH_NOMATCH);
2254 ph10 178 case 0x0a: /* LF */
2255     case 0x0b: /* VT */
2256     case 0x0c: /* FF */
2257     case 0x0d: /* CR */
2258     case 0x85: /* NEL */
2259     case 0x2028: /* LINE SEPARATOR */
2260     case 0x2029: /* PARAGRAPH SEPARATOR */
2261     break;
2262     }
2263     ecode++;
2264     break;
2265    
2266 nigel 77 #ifdef SUPPORT_UCP
2267     /* Check the next character by Unicode property. We will get here only
2268     if the support is in the binary; otherwise a compile-time error occurs. */
2269    
2270     case OP_PROP:
2271     case OP_NOTPROP:
2272 ph10 443 if (eptr >= md->end_subject)
2273 ph10 428 {
2274 ph10 443 SCHECK_PARTIAL();
2275 ph10 510 MRRETURN(MATCH_NOMATCH);
2276 ph10 443 }
2277 nigel 77 GETCHARINCTEST(c, eptr);
2278     {
2279 ph10 384 const ucd_record *prop = GET_UCD(c);
2280 nigel 77
2281 nigel 87 switch(ecode[1])
2282     {
2283     case PT_ANY:
2284 ph10 510 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2285 nigel 87 break;
2286 nigel 77
2287 nigel 87 case PT_LAMP:
2288 ph10 349 if ((prop->chartype == ucp_Lu ||
2289     prop->chartype == ucp_Ll ||
2290     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2291 ph10 510 MRRETURN(MATCH_NOMATCH);
2292 ph10 517 break;
2293 nigel 87
2294     case PT_GC:
2295 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2296 ph10 510 MRRETURN(MATCH_NOMATCH);
2297 nigel 87 break;
2298    
2299     case PT_PC:
2300 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2301 ph10 510 MRRETURN(MATCH_NOMATCH);
2302 nigel 87 break;
2303    
2304     case PT_SC:
2305 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
2306 ph10 510 MRRETURN(MATCH_NOMATCH);
2307 nigel 87 break;
2308 ph10 527
2309 ph10 517 /* These are specials */
2310 ph10 527
2311 ph10 517 case PT_ALNUM:
2312     if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2313     _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2314     MRRETURN(MATCH_NOMATCH);
2315 ph10 527 break;
2316    
2317 ph10 517 case PT_SPACE: /* Perl space */
2318     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2319     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2320     == (op == OP_NOTPROP))
2321     MRRETURN(MATCH_NOMATCH);
2322 ph10 527 break;
2323    
2324 ph10 517 case PT_PXSPACE: /* POSIX space */
2325     if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2326 ph10 527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2327 ph10 517 c == CHAR_FF || c == CHAR_CR)
2328     == (op == OP_NOTPROP))
2329     MRRETURN(MATCH_NOMATCH);
2330 ph10 527 break;
2331 nigel 87
2332 ph10 527 case PT_WORD:
2333 ph10 517 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2334 ph10 527 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2335 ph10 517 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2336     MRRETURN(MATCH_NOMATCH);
2337 ph10 527 break;
2338    
2339 ph10 517 /* This should never occur */
2340    
2341 nigel 87 default:
2342     RRETURN(PCRE_ERROR_INTERNAL);
2343 nigel 77 }
2344 nigel 87
2345     ecode += 3;
2346 nigel 77 }
2347     break;
2348    
2349     /* Match an extended Unicode sequence. We will get here only if the support
2350     is in the binary; otherwise a compile-time error occurs. */
2351    
2352     case OP_EXTUNI:
2353 ph10 443 if (eptr >= md->end_subject)
2354 ph10 428 {
2355 ph10 443 SCHECK_PARTIAL();
2356 ph10 510 MRRETURN(MATCH_NOMATCH);
2357 ph10 443 }
2358 nigel 77 GETCHARINCTEST(c, eptr);
2359 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2360     while (eptr < md->end_subject)
2361 nigel 77 {
2362 ph10 623 int len = 1;
2363     if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2364     if (UCD_CATEGORY(c) != ucp_M) break;
2365     eptr += len;
2366 nigel 77 }
2367     ecode++;
2368     break;
2369     #endif
2370    
2371    
2372     /* Match a back reference, possibly repeatedly. Look past the end of the
2373     item to see if there is repeat information following. The code is similar
2374     to that for character classes, but repeated for efficiency. Then obey
2375     similar code to character type repeats - written out again for speed.
2376     However, if the referenced string is the empty string, always treat
2377     it as matched, any number of times (otherwise there could be infinite
2378     loops). */
2379    
2380     case OP_REF:
2381 ph10 625 case OP_REFI:
2382     caseless = op == OP_REFI;
2383 ph10 595 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2384     ecode += 3;
2385 ph10 345
2386 ph10 595 /* If the reference is unset, there are two possibilities:
2387 ph10 345
2388 ph10 595 (a) In the default, Perl-compatible state, set the length negative;
2389     this ensures that every attempt at a match fails. We can't just fail
2390     here, because of the possibility of quantifiers with zero minima.
2391 ph10 345
2392 ph10 595 (b) If the JavaScript compatibility flag is set, set the length to zero
2393     so that the back reference matches an empty string.
2394 ph10 345
2395 ph10 595 Otherwise, set the length to the length of what was matched by the
2396     referenced subpattern. */
2397 ph10 345
2398 ph10 595 if (offset >= offset_top || md->offset_vector[offset] < 0)
2399     length = (md->jscript_compat)? 0 : -1;
2400     else
2401     length = md->offset_vector[offset+1] - md->offset_vector[offset];
2402 nigel 77
2403 ph10 595 /* Set up for repetition, or handle the non-repeated case */
2404 nigel 77
2405 ph10 595 switch (*ecode)
2406     {
2407     case OP_CRSTAR:
2408     case OP_CRMINSTAR:
2409     case OP_CRPLUS:
2410     case OP_CRMINPLUS:
2411     case OP_CRQUERY:
2412     case OP_CRMINQUERY:
2413     c = *ecode++ - OP_CRSTAR;
2414     minimize = (c & 1) != 0;
2415     min = rep_min[c]; /* Pick up values from tables; */
2416     max = rep_max[c]; /* zero for max => infinity */
2417     if (max == 0) max = INT_MAX;
2418     break;
2419 nigel 77
2420 ph10 595 case OP_CRRANGE:
2421     case OP_CRMINRANGE:
2422     minimize = (*ecode == OP_CRMINRANGE);
2423     min = GET2(ecode, 1);
2424     max = GET2(ecode, 3);
2425     if (max == 0) max = INT_MAX;
2426     ecode += 5;
2427     break;
2428 nigel 77
2429 ph10 595 default: /* No repeat follows */
2430 ph10 602 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2431 ph10 595 {
2432     CHECK_PARTIAL();
2433     MRRETURN(MATCH_NOMATCH);
2434 nigel 77 }
2435 ph10 595 eptr += length;
2436     continue; /* With the main loop */
2437     }
2438 nigel 77
2439 ph10 595 /* Handle repeated back references. If the length of the reference is
2440     zero, just continue with the main loop. */
2441 ph10 443
2442 ph10 595 if (length == 0) continue;
2443 nigel 77
2444 ph10 595 /* First, ensure the minimum number of matches are present. We get back
2445     the length of the reference string explicitly rather than passing the
2446     address of eptr, so that eptr can be a register variable. */
2447 nigel 77
2448 ph10 595 for (i = 1; i <= min; i++)
2449     {
2450 ph10 625 int slength;
2451 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2452 nigel 77 {
2453 ph10 595 CHECK_PARTIAL();
2454     MRRETURN(MATCH_NOMATCH);
2455 nigel 77 }
2456 ph10 595 eptr += slength;
2457     }
2458 nigel 77
2459 ph10 595 /* If min = max, continue at the same level without recursion.
2460     They are not both allowed to be zero. */
2461 nigel 77
2462 ph10 595 if (min == max) continue;
2463 nigel 77
2464 ph10 595 /* If minimizing, keep trying and advancing the pointer */
2465 nigel 77
2466 ph10 595 if (minimize)
2467     {
2468     for (fi = min;; fi++)
2469 nigel 77 {
2470 ph10 625 int slength;
2471 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2472 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2473     if (fi >= max) MRRETURN(MATCH_NOMATCH);
2474 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2475 nigel 77 {
2476 ph10 595 CHECK_PARTIAL();
2477     MRRETURN(MATCH_NOMATCH);
2478 nigel 77 }
2479 ph10 595 eptr += slength;
2480 nigel 77 }
2481 ph10 595 /* Control never gets here */
2482     }
2483 nigel 77
2484 ph10 595 /* If maximizing, find the longest string and work backwards */
2485 nigel 77
2486 ph10 595 else
2487     {
2488     pp = eptr;
2489     for (i = min; i < max; i++)
2490 nigel 77 {
2491 ph10 625 int slength;
2492 ph10 602 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2493 nigel 77 {
2494 ph10 595 CHECK_PARTIAL();
2495     break;
2496 nigel 77 }
2497 ph10 595 eptr += slength;
2498 nigel 77 }
2499 ph10 595 while (eptr >= pp)
2500     {
2501 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2502 ph10 595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2503     eptr -= length;
2504     }
2505     MRRETURN(MATCH_NOMATCH);
2506 nigel 77 }
2507     /* Control never gets here */
2508    
2509     /* Match a bit-mapped character class, possibly repeatedly. This op code is
2510     used when all the characters in the class have values in the range 0-255,
2511     and either the matching is caseful, or the characters are in the range
2512     0-127 when UTF-8 processing is enabled. The only difference between
2513     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2514     encountered.
2515    
2516     First, look past the end of the item to see if there is repeat information
2517     following. Then obey similar code to character type repeats - written out
2518     again for speed. */
2519    
2520     case OP_NCLASS:
2521     case OP_CLASS:
2522     {
2523     data = ecode + 1; /* Save for matching */
2524     ecode += 33; /* Advance past the item */
2525    
2526     switch (*ecode)
2527     {
2528     case OP_CRSTAR:
2529     case OP_CRMINSTAR:
2530     case OP_CRPLUS:
2531     case OP_CRMINPLUS:
2532     case OP_CRQUERY:
2533     case OP_CRMINQUERY:
2534     c = *ecode++ - OP_CRSTAR;
2535     minimize = (c & 1) != 0;
2536     min = rep_min[c]; /* Pick up values from tables; */
2537     max = rep_max[c]; /* zero for max => infinity */
2538     if (max == 0) max = INT_MAX;
2539     break;
2540    
2541     case OP_CRRANGE:
2542     case OP_CRMINRANGE:
2543     minimize = (*ecode == OP_CRMINRANGE);
2544     min = GET2(ecode, 1);
2545     max = GET2(ecode, 3);
2546     if (max == 0) max = INT_MAX;
2547     ecode += 5;
2548     break;
2549    
2550     default: /* No repeat follows */
2551     min = max = 1;
2552     break;
2553     }
2554    
2555     /* First, ensure the minimum number of matches are present. */
2556    
2557     #ifdef SUPPORT_UTF8
2558     /* UTF-8 mode */
2559     if (utf8)
2560     {
2561     for (i = 1; i <= min; i++)
2562     {
2563 ph10 427 if (eptr >= md->end_subject)
2564 ph10 426 {
2565 ph10 428 SCHECK_PARTIAL();
2566 ph10 510 MRRETURN(MATCH_NOMATCH);
2567 ph10 427 }
2568 nigel 77 GETCHARINC(c, eptr);
2569     if (c > 255)
2570     {
2571 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2572 nigel 77 }
2573     else
2574     {
2575 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2576 nigel 77 }
2577     }
2578     }
2579     else
2580     #endif
2581     /* Not UTF-8 mode */
2582     {
2583     for (i = 1; i <= min; i++)
2584     {
2585 ph10 427 if (eptr >= md->end_subject)
2586 ph10 426 {
2587 ph10 428 SCHECK_PARTIAL();
2588 ph10 510 MRRETURN(MATCH_NOMATCH);
2589 ph10 427 }
2590 nigel 77 c = *eptr++;
2591 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2592 nigel 77 }
2593     }
2594    
2595     /* If max == min we can continue with the main loop without the
2596     need to recurse. */
2597    
2598     if (min == max) continue;
2599    
2600     /* If minimizing, keep testing the rest of the expression and advancing
2601     the pointer while it matches the class. */
2602    
2603     if (minimize)
2604     {
2605     #ifdef SUPPORT_UTF8
2606     /* UTF-8 mode */
2607     if (utf8)
2608     {
2609     for (fi = min;; fi++)
2610     {
2611 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2612 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2613 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2614 ph10 427 if (eptr >= md->end_subject)
2615 ph10 426 {
2616 ph10 427 SCHECK_PARTIAL();
2617 ph10 510 MRRETURN(MATCH_NOMATCH);
2618 ph10 427 }
2619 nigel 77 GETCHARINC(c, eptr);
2620     if (c > 255)
2621     {
2622 ph10 510 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2623 nigel 77 }
2624     else
2625     {
2626 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2627 nigel 77 }
2628     }
2629     }
2630     else
2631     #endif
2632     /* Not UTF-8 mode */
2633     {
2634     for (fi = min;; fi++)
2635     {
2636 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2637 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2639 ph10 427 if (eptr >= md->end_subject)
2640 ph10 426 {
2641 ph10 427 SCHECK_PARTIAL();
2642 ph10 510 MRRETURN(MATCH_NOMATCH);
2643 ph10 427 }
2644 nigel 77 c = *eptr++;
2645 ph10 510 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2646 nigel 77 }
2647     }
2648     /* Control never gets here */
2649     }
2650    
2651     /* If maximizing, find the longest possible run, then work backwards. */
2652    
2653     else
2654     {
2655     pp = eptr;
2656    
2657     #ifdef SUPPORT_UTF8
2658     /* UTF-8 mode */
2659     if (utf8)
2660     {
2661     for (i = min; i < max; i++)
2662     {
2663     int len = 1;
2664 ph10 463 if (eptr >= md->end_subject)
2665 ph10 462 {
2666 ph10 463 SCHECK_PARTIAL();
2667 ph10 462 break;
2668 ph10 463 }
2669 nigel 77 GETCHARLEN(c, eptr, len);
2670     if (c > 255)
2671     {
2672     if (op == OP_CLASS) break;
2673     }
2674     else
2675     {
2676     if ((data[c/8] & (1 << (c&7))) == 0) break;
2677     }
2678     eptr += len;
2679     }
2680     for (;;)
2681     {
2682 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2683 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684     if (eptr-- == pp) break; /* Stop if tried at original pos */
2685     BACKCHAR(eptr);
2686     }
2687     }
2688     else
2689     #endif
2690     /* Not UTF-8 mode */
2691     {
2692     for (i = min; i < max; i++)
2693     {
2694 ph10 463 if (eptr >= md->end_subject)
2695 ph10 462 {
2696 ph10 463 SCHECK_PARTIAL();
2697 ph10 462 break;
2698 ph10 463 }
2699 nigel 77 c = *eptr;
2700     if ((data[c/8] & (1 << (c&7))) == 0) break;
2701     eptr++;
2702     }
2703     while (eptr >= pp)
2704     {
2705 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2706 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2707 nigel 77 eptr--;
2708     }
2709     }
2710    
2711 ph10 510 MRRETURN(MATCH_NOMATCH);
2712 nigel 77 }
2713     }
2714     /* Control never gets here */
2715    
2716    
2717     /* Match an extended character class. This opcode is encountered only
2718 ph10 384 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2719     mode, because Unicode properties are supported in non-UTF-8 mode. */
2720 nigel 77
2721     #ifdef SUPPORT_UTF8
2722     case OP_XCLASS:
2723     {
2724     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2725     ecode += GET(ecode, 1); /* Advance past the item */
2726    
2727     switch (*ecode)
2728     {
2729     case OP_CRSTAR:
2730     case OP_CRMINSTAR:
2731     case OP_CRPLUS:
2732     case OP_CRMINPLUS:
2733     case OP_CRQUERY:
2734     case OP_CRMINQUERY:
2735     c = *ecode++ - OP_CRSTAR;
2736     minimize = (c & 1) != 0;
2737     min = rep_min[c]; /* Pick up values from tables; */
2738     max = rep_max[c]; /* zero for max => infinity */
2739     if (max == 0) max = INT_MAX;
2740     break;
2741    
2742     case OP_CRRANGE:
2743     case OP_CRMINRANGE:
2744     minimize = (*ecode == OP_CRMINRANGE);
2745     min = GET2(ecode, 1);
2746     max = GET2(ecode, 3);
2747     if (max == 0) max = INT_MAX;
2748     ecode += 5;
2749     break;
2750    
2751     default: /* No repeat follows */
2752     min = max = 1;
2753     break;
2754     }
2755    
2756     /* First, ensure the minimum number of matches are present. */
2757    
2758     for (i = 1; i <= min; i++)
2759     {
2760 ph10 427 if (eptr >= md->end_subject)
2761 ph10 426 {
2762     SCHECK_PARTIAL();
2763 ph10 510 MRRETURN(MATCH_NOMATCH);
2764 ph10 427 }
2765 ph10 384 GETCHARINCTEST(c, eptr);
2766 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2767 nigel 77 }
2768    
2769     /* If max == min we can continue with the main loop without the
2770     need to recurse. */
2771    
2772     if (min == max) continue;
2773    
2774     /* If minimizing, keep testing the rest of the expression and advancing
2775     the pointer while it matches the class. */
2776    
2777     if (minimize)
2778     {
2779     for (fi = min;; fi++)
2780     {
2781 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2782 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2783 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2784 ph10 427 if (eptr >= md->end_subject)
2785 ph10 426 {
2786 ph10 427 SCHECK_PARTIAL();
2787 ph10 510 MRRETURN(MATCH_NOMATCH);
2788 ph10 427 }
2789 ph10 384 GETCHARINCTEST(c, eptr);
2790 ph10 510 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2791 nigel 77 }
2792     /* Control never gets here */
2793     }
2794    
2795     /* If maximizing, find the longest possible run, then work backwards. */
2796    
2797     else
2798     {
2799     pp = eptr;
2800     for (i = min; i < max; i++)
2801     {
2802     int len = 1;
2803 ph10 463 if (eptr >= md->end_subject)
2804 ph10 462 {
2805 ph10 463 SCHECK_PARTIAL();
2806 ph10 462 break;
2807 ph10 463 }
2808 ph10 384 GETCHARLENTEST(c, eptr, len);
2809 nigel 77 if (!_pcre_xclass(c, data)) break;
2810     eptr += len;
2811     }
2812     for(;;)
2813     {
2814 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2815 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816     if (eptr-- == pp) break; /* Stop if tried at original pos */
2817 ph10 214 if (utf8) BACKCHAR(eptr);
2818 nigel 77 }
2819 ph10 510 MRRETURN(MATCH_NOMATCH);
2820 nigel 77 }
2821    
2822     /* Control never gets here */
2823     }
2824     #endif /* End of XCLASS */
2825    
2826     /* Match a single character, casefully */
2827    
2828     case OP_CHAR:
2829     #ifdef SUPPORT_UTF8
2830     if (utf8)
2831     {
2832     length = 1;
2833     ecode++;
2834     GETCHARLEN(fc, ecode, length);
2835 ph10 443 if (length > md->end_subject - eptr)
2836 ph10 428 {
2837     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2838 ph10 510 MRRETURN(MATCH_NOMATCH);
2839 ph10 443 }
2840 ph10 510 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2841 nigel 77 }
2842     else
2843     #endif
2844    
2845     /* Non-UTF-8 mode */
2846     {
2847 ph10 443 if (md->end_subject - eptr < 1)
2848 ph10 428 {
2849     SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2850 ph10 510 MRRETURN(MATCH_NOMATCH);
2851 ph10 443 }
2852 ph10 510 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2853 nigel 77 ecode += 2;
2854     }
2855     break;
2856    
2857     /* Match a single character, caselessly */
2858    
2859 ph10 602 case OP_CHARI:
2860 nigel 77 #ifdef SUPPORT_UTF8
2861     if (utf8)
2862     {
2863     length = 1;
2864     ecode++;
2865     GETCHARLEN(fc, ecode, length);
2866    
2867 ph10 443 if (length > md->end_subject - eptr)
2868 ph10 428 {
2869     CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2870 ph10 510 MRRETURN(MATCH_NOMATCH);
2871 ph10 443 }
2872 nigel 77
2873     /* If the pattern character's value is < 128, we have only one byte, and
2874     can use the fast lookup table. */
2875    
2876     if (fc < 128)
2877     {
2878 ph10 510 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2879 nigel 77 }
2880    
2881     /* Otherwise we must pick up the subject character */
2882    
2883     else
2884     {
2885 nigel 93 unsigned int dc;
2886 nigel 77 GETCHARINC(dc, eptr);
2887     ecode += length;
2888    
2889     /* If we have Unicode property support, we can use it to test the other
2890 nigel 87 case of the character, if there is one. */
2891 nigel 77
2892     if (fc != dc)
2893     {
2894     #ifdef SUPPORT_UCP
2895 ph10 349 if (dc != UCD_OTHERCASE(fc))
2896 nigel 77 #endif
2897 ph10 510 MRRETURN(MATCH_NOMATCH);
2898 nigel 77 }
2899     }
2900     }
2901     else
2902     #endif /* SUPPORT_UTF8 */
2903    
2904     /* Non-UTF-8 mode */
2905     {
2906 ph10 443 if (md->end_subject - eptr < 1)
2907 ph10 428 {
2908 ph10 443 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2909 ph10 510 MRRETURN(MATCH_NOMATCH);
2910 ph10 443 }
2911 ph10 510 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2912 nigel 77 ecode += 2;
2913     }
2914     break;
2915    
2916 nigel 93 /* Match a single character repeatedly. */
2917 nigel 77
2918     case OP_EXACT:
2919 ph10 602 case OP_EXACTI:
2920 nigel 77 min = max = GET2(ecode, 1);
2921     ecode += 3;
2922     goto REPEATCHAR;
2923    
2924 nigel 93 case OP_POSUPTO:
2925 ph10 602 case OP_POSUPTOI:
2926 nigel 93 possessive = TRUE;
2927     /* Fall through */
2928    
2929 nigel 77 case OP_UPTO:
2930 ph10 602 case OP_UPTOI:
2931 nigel 77 case OP_MINUPTO:
2932 ph10 602 case OP_MINUPTOI:
2933 nigel 77 min = 0;
2934     max = GET2(ecode, 1);
2935 ph10 602 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2936 nigel 77 ecode += 3;
2937     goto REPEATCHAR;
2938    
2939 nigel 93 case OP_POSSTAR:
2940 ph10 602 case OP_POSSTARI:
2941 nigel 93 possessive = TRUE;
2942     min = 0;
2943     max = INT_MAX;
2944     ecode++;
2945     goto REPEATCHAR;
2946    
2947     case OP_POSPLUS:
2948 ph10 602 case OP_POSPLUSI:
2949 nigel 93 possessive = TRUE;
2950     min = 1;
2951     max = INT_MAX;
2952     ecode++;
2953     goto REPEATCHAR;
2954    
2955     case OP_POSQUERY:
2956 ph10 602 case OP_POSQUERYI:
2957 nigel 93 possessive = TRUE;
2958     min = 0;
2959     max = 1;
2960     ecode++;
2961     goto REPEATCHAR;
2962    
2963 nigel 77 case OP_STAR:
2964 ph10 602 case OP_STARI:
2965 nigel 77 case OP_MINSTAR:
2966 ph10 602 case OP_MINSTARI:
2967 nigel 77 case OP_PLUS:
2968 ph10 602 case OP_PLUSI:
2969 nigel 77 case OP_MINPLUS:
2970 ph10 602 case OP_MINPLUSI:
2971 nigel 77 case OP_QUERY:
2972 ph10 602 case OP_QUERYI:
2973 nigel 77 case OP_MINQUERY:
2974 ph10 602 case OP_MINQUERYI:
2975     c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2976 nigel 77 minimize = (c & 1) != 0;
2977     min = rep_min[c]; /* Pick up values from tables; */
2978     max = rep_max[c]; /* zero for max => infinity */
2979     if (max == 0) max = INT_MAX;
2980    
2981 ph10 426 /* Common code for all repeated single-character matches. */
2982 nigel 77
2983     REPEATCHAR:
2984     #ifdef SUPPORT_UTF8
2985     if (utf8)
2986     {
2987     length = 1;
2988     charptr = ecode;
2989     GETCHARLEN(fc, ecode, length);
2990     ecode += length;
2991    
2992     /* Handle multibyte character matching specially here. There is
2993     support for caseless matching if UCP support is present. */
2994    
2995     if (length > 1)
2996     {
2997     #ifdef SUPPORT_UCP
2998 nigel 93 unsigned int othercase;
2999 ph10 602 if (op >= OP_STARI && /* Caseless */
3000 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
3001 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
3002 ph10 115 else oclength = 0;
3003 nigel 77 #endif /* SUPPORT_UCP */
3004    
3005     for (i = 1; i <= min; i++)
3006     {
3007 ph10 426 if (eptr <= md->end_subject - length &&
3008     memcmp(eptr, charptr, length) == 0) eptr += length;
3009 ph10 123 #ifdef SUPPORT_UCP
3010 ph10 426 else if (oclength > 0 &&
3011     eptr <= md->end_subject - oclength &&
3012     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3013     #endif /* SUPPORT_UCP */
3014 nigel 77 else
3015     {
3016 ph10 426 CHECK_PARTIAL();
3017 ph10 510 MRRETURN(MATCH_NOMATCH);
3018 nigel 77 }
3019     }
3020    
3021     if (min == max) continue;
3022    
3023     if (minimize)
3024     {
3025     for (fi = min;; fi++)
3026     {
3027 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3028 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3030 ph10 426 if (eptr <= md->end_subject - length &&
3031     memcmp(eptr, charptr, length) == 0) eptr += length;
3032 ph10 123 #ifdef SUPPORT_UCP
3033 ph10 426 else if (oclength > 0 &&
3034     eptr <= md->end_subject - oclength &&
3035     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3036     #endif /* SUPPORT_UCP */
3037 nigel 77 else
3038     {
3039 ph10 426 CHECK_PARTIAL();
3040 ph10 510 MRRETURN(MATCH_NOMATCH);
3041 nigel 77 }
3042     }
3043     /* Control never gets here */
3044     }
3045 nigel 93
3046     else /* Maximize */
3047 nigel 77 {
3048     pp = eptr;
3049     for (i = min; i < max; i++)
3050     {
3051 ph10 426 if (eptr <= md->end_subject - length &&
3052     memcmp(eptr, charptr, length) == 0) eptr += length;
3053 ph10 123 #ifdef SUPPORT_UCP
3054 ph10 426 else if (oclength > 0 &&
3055     eptr <= md->end_subject - oclength &&
3056     memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3057     #endif /* SUPPORT_UCP */
3058 ph10 463 else
3059 ph10 462 {
3060 ph10 463 CHECK_PARTIAL();
3061 ph10 462 break;
3062 ph10 463 }
3063 nigel 77 }
3064 nigel 93
3065     if (possessive) continue;
3066 ph10 427
3067 ph10 120 for(;;)
3068 ph10 426 {
3069 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3070 ph10 426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3071 ph10 510 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3072 ph10 115 #ifdef SUPPORT_UCP
3073 ph10 426 eptr--;
3074     BACKCHAR(eptr);
3075 ph10 123 #else /* without SUPPORT_UCP */
3076 ph10 426 eptr -= length;
3077 ph10 123 #endif /* SUPPORT_UCP */
3078 ph10 426 }
3079 nigel 77 }
3080     /* Control never gets here */
3081     }
3082    
3083     /* If the length of a UTF-8 character is 1, we fall through here, and
3084     obey the code as for non-UTF-8 characters below, though in this case the
3085     value of fc will always be < 128. */
3086     }
3087     else
3088     #endif /* SUPPORT_UTF8 */
3089    
3090     /* When not in UTF-8 mode, load a single-byte character. */
3091    
3092 ph10 426 fc = *ecode++;
3093 ph10 443
3094 nigel 77 /* The value of fc at this point is always less than 256, though we may or
3095     may not be in UTF-8 mode. The code is duplicated for the caseless and
3096     caseful cases, for speed, since matching characters is likely to be quite
3097     common. First, ensure the minimum number of matches are present. If min =
3098     max, continue at the same level without recursing. Otherwise, if
3099     minimizing, keep trying the rest of the expression and advancing one
3100     matching character if failing, up to the maximum. Alternatively, if
3101     maximizing, find the maximum number of characters and work backwards. */
3102    
3103     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3104     max, eptr));
3105    
3106 ph10 602 if (op >= OP_STARI) /* Caseless */
3107 nigel 77 {
3108     fc = md->lcc[fc];
3109     for (i = 1; i <= min; i++)
3110 ph10 426 {
3111     if (eptr >= md->end_subject)
3112     {
3113     SCHECK_PARTIAL();
3114 ph10 510 MRRETURN(MATCH_NOMATCH);
3115 ph10 426 }
3116 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3117 ph10 426 }
3118 nigel 77 if (min == max) continue;
3119     if (minimize)
3120     {
3121     for (fi = min;; fi++)
3122     {
3123 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3124 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3125 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3126 ph10 426 if (eptr >= md->end_subject)
3127     {
3128 ph10 427 SCHECK_PARTIAL();
3129 ph10 510 MRRETURN(MATCH_NOMATCH);
3130 ph10 426 }
3131 ph10 510 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3132 nigel 77 }
3133     /* Control never gets here */
3134     }
3135 nigel 93 else /* Maximize */
3136 nigel 77 {
3137     pp = eptr;
3138     for (i = min; i < max; i++)
3139     {
3140 ph10 463 if (eptr >= md->end_subject)
3141 ph10 462 {
3142     SCHECK_PARTIAL();
3143     break;
3144 ph10 463 }
3145 ph10 462 if (fc != md->lcc[*eptr]) break;
3146 nigel 77 eptr++;
3147     }
3148 ph10 427
3149 nigel 93 if (possessive) continue;
3150 ph10 427
3151 nigel 77 while (eptr >= pp)
3152     {
3153 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3154 nigel 77 eptr--;
3155     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3156     }
3157 ph10 510 MRRETURN(MATCH_NOMATCH);
3158 nigel 77 }
3159     /* Control never gets here */
3160     }
3161    
3162     /* Caseful comparisons (includes all multi-byte characters) */
3163    
3164     else
3165     {
3166 ph10 427 for (i = 1; i <= min; i++)
3167 ph10 426 {
3168     if (eptr >= md->end_subject)
3169     {
3170     SCHECK_PARTIAL();
3171 ph10 510 MRRETURN(MATCH_NOMATCH);
3172 ph10 426 }
3173 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3174 ph10 427 }
3175 ph10 443
3176 nigel 77 if (min == max) continue;
3177 ph10 443
3178 nigel 77 if (minimize)
3179     {
3180     for (fi = min;; fi++)
3181     {
3182 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3183 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3184 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3185 ph10 426 if (eptr >= md->end_subject)
3186 ph10 427 {
3187 ph10 426 SCHECK_PARTIAL();
3188 ph10 510 MRRETURN(MATCH_NOMATCH);
3189 ph10 427 }
3190 ph10 510 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3191 nigel 77 }
3192     /* Control never gets here */
3193     }
3194 nigel 93 else /* Maximize */
3195 nigel 77 {
3196     pp = eptr;
3197     for (i = min; i < max; i++)
3198     {
3199 ph10 463 if (eptr >= md->end_subject)
3200 ph10 462 {
3201 ph10 463 SCHECK_PARTIAL();
3202 ph10 462 break;
3203 ph10 463 }
3204 ph10 462 if (fc != *eptr) break;
3205 nigel 77 eptr++;
3206     }
3207 nigel 93 if (possessive) continue;
3208 ph10 443
3209 nigel 77 while (eptr >= pp)
3210     {
3211 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3212 nigel 77 eptr--;
3213     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214     }
3215 ph10 510 MRRETURN(MATCH_NOMATCH);
3216 nigel 77 }
3217     }
3218     /* Control never gets here */
3219    
3220     /* Match a negated single one-byte character. The character we are
3221     checking can be multibyte. */
3222    
3223     case OP_NOT:
3224 ph10 625 case OP_NOTI:
3225 ph10 443 if (eptr >= md->end_subject)
3226 ph10 428 {
3227 ph10 443 SCHECK_PARTIAL();
3228 ph10 510 MRRETURN(MATCH_NOMATCH);
3229 ph10 443 }
3230 nigel 77 ecode++;
3231     GETCHARINCTEST(c, eptr);
3232 ph10 602 if (op == OP_NOTI) /* The caseless case */
3233 nigel 77 {
3234     #ifdef SUPPORT_UTF8
3235     if (c < 256)
3236     #endif
3237     c = md->lcc[c];
3238 ph10 510 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3239 nigel 77 }
3240 ph10 602 else /* Caseful */
3241 nigel 77 {
3242 ph10 510 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3243 nigel 77 }
3244     break;
3245    
3246     /* Match a negated single one-byte character repeatedly. This is almost a
3247     repeat of the code for a repeated single character, but I haven't found a
3248     nice way of commoning these up that doesn't require a test of the
3249     positive/negative option for each character match. Maybe that wouldn't add
3250     very much to the time taken, but character matching *is* what this is all
3251     about... */
3252    
3253     case OP_NOTEXACT:
3254 ph10 602 case OP_NOTEXACTI:
3255 nigel 77 min = max = GET2(ecode, 1);
3256     ecode += 3;
3257     goto REPEATNOTCHAR;
3258    
3259     case OP_NOTUPTO:
3260 ph10 602 case OP_NOTUPTOI:
3261 nigel 77 case OP_NOTMINUPTO:
3262 ph10 602 case OP_NOTMINUPTOI:
3263 nigel 77 min = 0;
3264     max = GET2(ecode, 1);
3265 ph10 602 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3266 nigel 77 ecode += 3;
3267     goto REPEATNOTCHAR;
3268    
3269 nigel 93 case OP_NOTPOSSTAR:
3270 ph10 602 case OP_NOTPOSSTARI:
3271 nigel 93 possessive = TRUE;
3272     min = 0;
3273     max = INT_MAX;
3274     ecode++;
3275     goto REPEATNOTCHAR;
3276    
3277     case OP_NOTPOSPLUS:
3278 ph10 602 case OP_NOTPOSPLUSI:
3279 nigel 93 possessive = TRUE;
3280     min = 1;
3281     max = INT_MAX;
3282     ecode++;
3283     goto REPEATNOTCHAR;
3284    
3285     case OP_NOTPOSQUERY:
3286 ph10 602 case OP_NOTPOSQUERYI:
3287 nigel 93 possessive = TRUE;
3288     min = 0;
3289     max = 1;
3290     ecode++;
3291     goto REPEATNOTCHAR;
3292    
3293     case OP_NOTPOSUPTO:
3294 ph10 602 case OP_NOTPOSUPTOI:
3295 nigel 93 possessive = TRUE;
3296     min = 0;
3297     max = GET2(ecode, 1);
3298     ecode += 3;
3299     goto REPEATNOTCHAR;
3300    
3301 nigel 77 case OP_NOTSTAR:
3302 ph10 602 case OP_NOTSTARI:
3303 nigel 77 case OP_NOTMINSTAR:
3304 ph10 602 case OP_NOTMINSTARI:
3305 nigel 77 case OP_NOTPLUS:
3306 ph10 602 case OP_NOTPLUSI:
3307 nigel 77 case OP_NOTMINPLUS:
3308 ph10 602 case OP_NOTMINPLUSI:
3309 nigel 77 case OP_NOTQUERY:
3310 ph10 602 case OP_NOTQUERYI:
3311 nigel 77 case OP_NOTMINQUERY:
3312 ph10 602 case OP_NOTMINQUERYI:
3313     c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3314 nigel 77 minimize = (c & 1) != 0;
3315     min = rep_min[c]; /* Pick up values from tables; */
3316     max = rep_max[c]; /* zero for max => infinity */
3317     if (max == 0) max = INT_MAX;
3318    
3319 ph10 426 /* Common code for all repeated single-byte matches. */
3320 nigel 77
3321     REPEATNOTCHAR:
3322     fc = *ecode++;
3323    
3324     /* The code is duplicated for the caseless and caseful cases, for speed,
3325     since matching characters is likely to be quite common. First, ensure the
3326     minimum number of matches are present. If min = max, continue at the same
3327     level without recursing. Otherwise, if minimizing, keep trying the rest of
3328     the expression and advancing one matching character if failing, up to the
3329     maximum. Alternatively, if maximizing, find the maximum number of
3330     characters and work backwards. */
3331    
3332     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3333     max, eptr));
3334    
3335 ph10 602 if (op >= OP_NOTSTARI) /* Caseless */
3336 nigel 77 {
3337     fc = md->lcc[fc];
3338    
3339     #ifdef SUPPORT_UTF8
3340     /* UTF-8 mode */
3341     if (utf8)
3342     {
3343 nigel 93 register unsigned int d;
3344 nigel 77 for (i = 1; i <= min; i++)
3345     {
3346 ph10 426 if (eptr >= md->end_subject)
3347     {
3348     SCHECK_PARTIAL();
3349 ph10 510 MRRETURN(MATCH_NOMATCH);
3350 ph10 427 }
3351 nigel 77 GETCHARINC(d, eptr);
3352     if (d < 256) d = md->lcc[d];
3353 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3354 nigel 77 }
3355     }
3356     else
3357     #endif
3358    
3359     /* Not UTF-8 mode */
3360     {
3361     for (i = 1; i <= min; i++)
3362 ph10 426 {
3363     if (eptr >= md->end_subject)
3364     {
3365     SCHECK_PARTIAL();
3366 ph10 510 MRRETURN(MATCH_NOMATCH);
3367 ph10 427 }
3368 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3369 ph10 427 }
3370 nigel 77 }
3371    
3372     if (min == max) continue;
3373    
3374     if (minimize)
3375     {
3376     #ifdef SUPPORT_UTF8
3377     /* UTF-8 mode */
3378     if (utf8)
3379     {
3380 nigel 93 register unsigned int d;
3381 nigel 77 for (fi = min;; fi++)
3382     {
3383 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3384 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3385 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3386 ph10 427 if (eptr >= md->end_subject)
3387 ph10 426 {
3388 ph10 427 SCHECK_PARTIAL();
3389 ph10 510 MRRETURN(MATCH_NOMATCH);
3390 ph10 427 }
3391 nigel 77 GETCHARINC(d, eptr);
3392     if (d < 256) d = md->lcc[d];
3393 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3394 nigel 77 }
3395     }
3396     else
3397     #endif
3398     /* Not UTF-8 mode */
3399     {
3400     for (fi = min;; fi++)
3401     {
3402 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3403 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3404 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3405 ph10 426 if (eptr >= md->end_subject)
3406     {
3407     SCHECK_PARTIAL();
3408 ph10 510 MRRETURN(MATCH_NOMATCH);
3409 ph10 426 }
3410 ph10 510 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3411 nigel 77 }
3412     }
3413     /* Control never gets here */
3414     }
3415    
3416     /* Maximize case */
3417    
3418     else
3419     {
3420     pp = eptr;
3421    
3422     #ifdef SUPPORT_UTF8
3423     /* UTF-8 mode */
3424     if (utf8)
3425     {
3426 nigel 93 register unsigned int d;
3427 nigel 77 for (i = min; i < max; i++)
3428     {
3429     int len = 1;
3430 ph10 463 if (eptr >= md->end_subject)
3431 ph10 462 {
3432 ph10 463 SCHECK_PARTIAL();
3433 ph10 462 break;
3434 ph10 463 }
3435 nigel 77 GETCHARLEN(d, eptr, len);
3436     if (d < 256) d = md->lcc[d];
3437     if (fc == d) break;
3438     eptr += len;
3439     }
3440 nigel 93 if (possessive) continue;
3441     for(;;)
3442 nigel 77 {
3443 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3444 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445     if (eptr-- == pp) break; /* Stop if tried at original pos */
3446     BACKCHAR(eptr);
3447     }
3448     }
3449     else
3450     #endif
3451     /* Not UTF-8 mode */
3452     {
3453     for (i = min; i < max; i++)
3454     {
3455 ph10 463 if (eptr >= md->end_subject)
3456 ph10 462 {
3457     SCHECK_PARTIAL();
3458     break;
3459 ph10 463 }
3460 ph10 462 if (fc == md->lcc[*eptr]) break;
3461 nigel 77 eptr++;
3462     }
3463 nigel 93 if (possessive) continue;
3464 nigel 77 while (eptr >= pp)
3465     {
3466 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3467 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3468     eptr--;
3469     }
3470     }
3471    
3472 ph10 510 MRRETURN(MATCH_NOMATCH);
3473 nigel 77 }
3474     /* Control never gets here */
3475     }
3476    
3477     /* Caseful comparisons */
3478    
3479     else
3480     {
3481     #ifdef SUPPORT_UTF8
3482     /* UTF-8 mode */
3483     if (utf8)
3484     {
3485 nigel 93 register unsigned int d;
3486 nigel 77 for (i = 1; i <= min; i++)
3487     {
3488 ph10 426 if (eptr >= md->end_subject)
3489     {
3490     SCHECK_PARTIAL();
3491 ph10 510 MRRETURN(MATCH_NOMATCH);
3492 ph10 427 }
3493 nigel 77 GETCHARINC(d, eptr);
3494 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3495 nigel 77 }
3496     }
3497     else
3498     #endif
3499     /* Not UTF-8 mode */
3500     {
3501     for (i = 1; i <= min; i++)
3502 ph10 426 {
3503     if (eptr >= md->end_subject)
3504     {
3505     SCHECK_PARTIAL();
3506 ph10 510 MRRETURN(MATCH_NOMATCH);
3507 ph10 427 }
3508 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3509 ph10 427 }
3510 nigel 77 }
3511    
3512     if (min == max) continue;
3513    
3514     if (minimize)
3515     {
3516     #ifdef SUPPORT_UTF8
3517     /* UTF-8 mode */
3518     if (utf8)
3519     {
3520 nigel 93 register unsigned int d;
3521 nigel 77 for (fi = min;; fi++)
3522     {
3523 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3524 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3525 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3526 ph10 427 if (eptr >= md->end_subject)
3527 ph10 426 {
3528 ph10 427 SCHECK_PARTIAL();
3529 ph10 510 MRRETURN(MATCH_NOMATCH);
3530 ph10 427 }
3531 nigel 77 GETCHARINC(d, eptr);
3532 ph10 510 if (fc == d) MRRETURN(MATCH_NOMATCH);
3533 nigel 77 }
3534     }
3535     else
3536     #endif
3537     /* Not UTF-8 mode */
3538     {
3539     for (fi = min;; fi++)
3540     {
3541 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3542 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3543 ph10 510 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3544 ph10 426 if (eptr >= md->end_subject)
3545     {
3546     SCHECK_PARTIAL();
3547 ph10 510 MRRETURN(MATCH_NOMATCH);
3548 ph10 427 }
3549 ph10 510 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3550 nigel 77 }
3551     }
3552     /* Control never gets here */
3553     }
3554    
3555     /* Maximize case */
3556    
3557     else
3558     {
3559     pp = eptr;
3560    
3561     #ifdef SUPPORT_UTF8
3562     /* UTF-8 mode */
3563     if (utf8)
3564     {
3565 nigel 93 register unsigned int d;
3566 nigel 77 for (i = min; i < max; i++)
3567     {
3568     int len = 1;
3569 ph10 463 if (eptr >= md->end_subject)
3570 ph10 462 {
3571 ph10 463 SCHECK_PARTIAL();
3572 ph10 462 break;
3573 ph10 463 }
3574 nigel 77 GETCHARLEN(d, eptr, len);
3575     if (fc == d) break;
3576     eptr += len;
3577     }
3578 nigel 93 if (possessive) continue;
3579 nigel 77 for(;;)
3580     {
3581 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3582 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3583     if (eptr-- == pp) break; /* Stop if tried at original pos */
3584     BACKCHAR(eptr);
3585     }
3586     }
3587     else
3588     #endif
3589     /* Not UTF-8 mode */
3590     {
3591     for (i = min; i < max; i++)
3592     {
3593 ph10 463 if (eptr >= md->end_subject)
3594 ph10 462 {
3595 ph10 463 SCHECK_PARTIAL();
3596 ph10 462 break;
3597 ph10 463 }
3598 ph10 462 if (fc == *eptr) break;
3599 nigel 77 eptr++;
3600     }
3601 nigel 93 if (possessive) continue;
3602 nigel 77 while (eptr >= pp)
3603     {
3604 ph10 604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3605 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3606     eptr--;
3607     }
3608     }
3609    
3610 ph10 510 MRRETURN(MATCH_NOMATCH);
3611 nigel 77 }
3612     }
3613     /* Control never gets here */
3614    
3615     /* Match a single character type repeatedly; several different opcodes
3616     share code. This is very similar to the code for single characters, but we
3617     repeat it in the interests of efficiency. */
3618    
3619     case OP_TYPEEXACT:
3620     min = max = GET2(ecode, 1);
3621     minimize = TRUE;
3622     ecode += 3;
3623     goto REPEATTYPE;
3624    
3625     case OP_TYPEUPTO:
3626     case OP_TYPEMINUPTO:
3627     min = 0;
3628     max = GET2(ecode, 1);
3629     minimize = *ecode == OP_TYPEMINUPTO;
3630     ecode += 3;
3631     goto REPEATTYPE;
3632    
3633 nigel 93 case OP_TYPEPOSSTAR:
3634     possessive = TRUE;
3635     min = 0;
3636     max = INT_MAX;
3637     ecode++;
3638     goto REPEATTYPE;
3639    
3640     case OP_TYPEPOSPLUS:
3641     possessive = TRUE;
3642     min = 1;
3643     max = INT_MAX;
3644     ecode++;
3645     goto REPEATTYPE;
3646    
3647     case OP_TYPEPOSQUERY:
3648     possessive = TRUE;
3649     min = 0;
3650     max = 1;
3651     ecode++;
3652     goto REPEATTYPE;
3653    
3654     case OP_TYPEPOSUPTO:
3655     possessive = TRUE;
3656     min = 0;
3657     max = GET2(ecode, 1);
3658     ecode += 3;
3659     goto REPEATTYPE;
3660    
3661 nigel 77 case OP_TYPESTAR:
3662     case OP_TYPEMINSTAR:
3663     case OP_TYPEPLUS:
3664     case OP_TYPEMINPLUS:
3665     case OP_TYPEQUERY:
3666     case OP_TYPEMINQUERY:
3667     c = *ecode++ - OP_TYPESTAR;
3668     minimize = (c & 1) != 0;
3669     min = rep_min[c]; /* Pick up values from tables; */
3670     max = rep_max[c]; /* zero for max => infinity */
3671     if (max == 0) max = INT_MAX;
3672    
3673     /* Common code for all repeated single character type matches. Note that
3674     in UTF-8 mode, '.' matches a character of any length, but for the other
3675     character types, the valid characters are all one-byte long. */
3676    
3677     REPEATTYPE:
3678     ctype = *ecode++; /* Code for the character type */
3679    
3680     #ifdef SUPPORT_UCP
3681     if (ctype == OP_PROP || ctype == OP_NOTPROP)
3682     {
3683     prop_fail_result = ctype == OP_NOTPROP;
3684     prop_type = *ecode++;
3685 nigel 87 prop_value = *ecode++;
3686 nigel 77 }
3687     else prop_type = -1;
3688     #endif
3689    
3690     /* First, ensure the minimum number of matches are present. Use inline
3691     code for maximizing the speed, and do the type test once at the start
3692 ph10 426 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3693 nigel 77 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3694     and single-bytes. */
3695    
3696     if (min > 0)
3697     {
3698     #ifdef SUPPORT_UCP
3699 nigel 87 if (prop_type >= 0)
3700 nigel 77 {
3701 nigel 87 switch(prop_type)
3702 nigel 77 {
3703 nigel 87 case PT_ANY:
3704 ph10 510 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3705 nigel 87 for (i = 1; i <= min; i++)
3706     {
3707 ph10 427 if (eptr >= md->end_subject)
3708 ph10 426 {
3709 ph10 427 SCHECK_PARTIAL();
3710 ph10 510 MRRETURN(MATCH_NOMATCH);
3711 ph10 427 }
3712 ph10 184 GETCHARINCTEST(c, eptr);
3713 nigel 87 }
3714     break;
3715    
3716     case PT_LAMP:
3717     for (i = 1; i <= min; i++)
3718     {
3719 ph10 625 int chartype;
3720 ph10 427 if (eptr >= md->end_subject)
3721 ph10 426 {
3722 ph10 427 SCHECK_PARTIAL();
3723 ph10 510 MRRETURN(MATCH_NOMATCH);
3724 ph10 427 }
3725 ph10 184 GETCHARINCTEST(c, eptr);
3726 ph10 623 chartype = UCD_CHARTYPE(c);
3727     if ((chartype == ucp_Lu ||
3728     chartype == ucp_Ll ||
3729     chartype == ucp_Lt) == prop_fail_result)
3730 ph10 510 MRRETURN(MATCH_NOMATCH);
3731 nigel 87 }
3732     break;
3733    
3734     case PT_GC:
3735     for (i = 1; i <= min; i++)
3736     {
3737 ph10 427 if (eptr >= md->end_subject)
3738 ph10 426 {
3739 ph10 427 SCHECK_PARTIAL();
3740 ph10 510 MRRETURN(MATCH_NOMATCH);
3741 ph10 427 }
3742 ph10 184 GETCHARINCTEST(c, eptr);
3743 ph10 623 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3744 ph10 510 MRRETURN(MATCH_NOMATCH);
3745 nigel 87 }
3746     break;
3747    
3748     case PT_PC:
3749     for (i = 1; i <= min; i++)
3750     {
3751 ph10 427 if (eptr >= md->end_subject)
3752 ph10 426 {
3753 ph10 427 SCHECK_PARTIAL();
3754 ph10 510 MRRETURN(MATCH_NOMATCH);
3755 ph10 427 }
3756 ph10 184 GETCHARINCTEST(c, eptr);
3757 ph10 623 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3758 ph10 510 MRRETURN(MATCH_NOMATCH);
3759 nigel 87 }
3760     break;
3761    
3762     case PT_SC:
3763     for (i = 1; i <= min; i++)
3764     {
3765 ph10 427 if (eptr >= md->end_subject)
3766 ph10 426 {
3767 ph10 427 SCHECK_PARTIAL();
3768 ph10 510 MRRETURN(MATCH_NOMATCH);
3769 ph10 427 }
3770 ph10 184 GETCHARINCTEST(c, eptr);
3771 ph10 623 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3772 ph10 510 MRRETURN(MATCH_NOMATCH);
3773 nigel 87 }
3774     break;
3775 ph10 527
3776 ph10 517 case PT_ALNUM:
3777     for (i = 1; i <= min; i++)
3778     {
3779 ph10 625 int category;
3780 ph10 517 if (eptr >= md->end_subject)
3781     {
3782     SCHECK_PARTIAL();
3783     MRRETURN(MATCH_NOMATCH);
3784     }
3785     GETCHARINCTEST(c, eptr);
3786 ph10 623 category = UCD_CATEGORY(c);
3787     if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3788 ph10 517 MRRETURN(MATCH_NOMATCH);
3789     }
3790     break;
3791 ph10 527
3792 ph10 517 case PT_SPACE: /* Perl space */
3793     for (i = 1; i <= min; i++)
3794     {
3795     if (eptr >= md->end_subject)
3796     {
3797     SCHECK_PARTIAL();
3798     MRRETURN(MATCH_NOMATCH);
3799     }
3800     GETCHARINCTEST(c, eptr);
3801 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3802 ph10 527 c == CHAR_FF || c == CHAR_CR)
3803 ph10 517 == prop_fail_result)
3804     MRRETURN(MATCH_NOMATCH);
3805     }
3806     break;
3807 ph10 527
3808 ph10 517 case PT_PXSPACE: /* POSIX space */
3809     for (i = 1; i <= min; i++)
3810     {
3811     if (eptr >= md->end_subject)
3812     {
3813     SCHECK_PARTIAL();
3814     MRRETURN(MATCH_NOMATCH);
3815     }
3816     GETCHARINCTEST(c, eptr);
3817 ph10 623 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3818 ph10 527 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3819 ph10 517 == prop_fail_result)
3820     MRRETURN(MATCH_NOMATCH);
3821     }
3822     break;
3823 ph10 527
3824     case PT_WORD:
3825 ph10 517 for (i = 1; i <= min; i++)
3826     {
3827 ph10 625 int category;
3828 ph10 517 if (eptr >= md->end_subject)
3829     {
3830     SCHECK_PARTIAL();
3831     MRRETURN(MATCH_NOMATCH);
3832     }
3833     GETCHARINCTEST(c, eptr);
3834 ph10 623 category = UCD_CATEGORY(c);
3835     if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3836 ph10 517 == prop_fail_result)
3837     MRRETURN(MATCH_NOMATCH);
3838     }
3839     break;
3840 ph10 527
3841 ph10 517 /* This should not occur */
3842 nigel 87
3843     default:
3844     RRETURN(PCRE_ERROR_INTERNAL);
3845 nigel 77 }
3846     }
3847    
3848     /* Match extended Unicode sequences. We will get here only if the
3849     support is in the binary; otherwise a compile-time error occurs. */
3850    
3851     else if (ctype == OP_EXTUNI)
3852     {
3853     for (i = 1; i <= min; i++)
3854     {
3855 ph10 427 if (eptr >= md->end_subject)
3856 ph10 426 {
3857 ph10 427 SCHECK_PARTIAL();
3858 ph10 510 MRRETURN(MATCH_NOMATCH);
3859 ph10 427 }
3860 nigel 77 GETCHARINCTEST(c, eptr);
3861 ph10 623 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3862 nigel 77 while (eptr < md->end_subject)
3863     {
3864     int len = 1;
3865 ph10 623 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3866     if (UCD_CATEGORY(c) != ucp_M) break;
3867 nigel 77 eptr += len;
3868     }
3869     }
3870     }
3871    
3872     else
3873     #endif /* SUPPORT_UCP */
3874    
3875     /* Handle all other cases when the coding is UTF-8 */
3876    
3877     #ifdef SUPPORT_UTF8
3878     if (utf8) switch(ctype)
3879     {
3880     case OP_ANY:
3881     for (i = 1; i <= min; i++)
3882     {
3883 ph10 426 if (eptr >= md->end_subject)
3884     {
3885 ph10 427 SCHECK_PARTIAL();
3886 ph10 510 MRRETURN(MATCH_NOMATCH);
3887 ph10 427 }
3888 ph10 510 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3889 nigel 91 eptr++;
3890 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3891     }
3892     break;
3893    
3894 ph10 341 case OP_ALLANY:
3895     for (i = 1; i <= min; i++)
3896     {
3897 ph10 427 if (eptr >= md->end_subject)
3898 ph10 426 {
3899     SCHECK_PARTIAL();
3900 ph10 510 MRRETURN(MATCH_NOMATCH);
3901 ph10 427 }
3902 ph10 341 eptr++;
3903     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904     }
3905     break;
3906    
3907 nigel 77 case OP_ANYBYTE:
3908 ph10 510 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3909 nigel 77 eptr += min;
3910     break;
3911    
3912 nigel 93 case OP_ANYNL:
3913     for (i = 1; i <= min; i++)
3914     {
3915 ph10 427 if (eptr >= md->end_subject)
3916 ph10 426 {
3917     SCHECK_PARTIAL();
3918 ph10 510 MRRETURN(MATCH_NOMATCH);
3919 ph10 427 }
3920 nigel 93 GETCHARINC(c, eptr);
3921     switch(c)
3922     {
3923 ph10 510 default: MRRETURN(MATCH_NOMATCH);
3924 ph10 625
3925 nigel 93 case 0x000d:
3926     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3927     break;
3928 ph10 231
3929 nigel 93 case 0x000a:
3930 ph10 231 break;
3931    
3932 nigel 93 case 0x000b:
3933     case 0x000c:
3934     case 0x0085:
3935     case 0x2028:
3936     case 0x2029:
3937 ph10 510 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3938 nigel 93 break;
3939     }
3940     }
3941     break;
3942    
3943 ph10 178 case OP_NOT_HSPACE: